diff --git a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp index e8714c128f745d7949ce0897a57475bd070de4a5..8ccd087d49fc2171ebb95dbd723ed8e4723736db 100644 --- a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp @@ -918,7 +918,7 @@ errs() << "TensorII: " << *TensorII << "\n"; Args.push_back(TensorII->getOperand(7)); // Create wrapper API runtime function call - Constant* wrapper_tensorGroupConvolution; + Constant* wrapper_tensorGroupConvolution = M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"), RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType()); CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution, @@ -956,9 +956,9 @@ errs() << "TensorII: " << *TensorII << "\n"; Args.push_back(TensorII->getOperand(3)); Args.push_back(TensorII->getOperand(4)); Args.push_back(TensorII->getOperand(5)); - + // Create wrapper API runtime function call - Constant* wrapper_tensorBatchNorm; + Constant* wrapper_tensorBatchNorm = M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"), RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType()); CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm, diff --git a/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py b/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py new file mode 100644 index 0000000000000000000000000000000000000000..0b7f09d92e91894d284b40cc0bd2d346c08e36c7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/bin/extractQuantRange.py @@ -0,0 +1,42 @@ + + +import sys + + +if __name__ == "__main__": + + f = open(sys.argv[1], "r") + f2 = open("quant_ranges.txt", "w+") + + layer_line = False + for x in f: + if "ConvLayer_PROMISE" in x or "FCLayer_PROMISE" in x or layer_line == True: + if layer_line == True: + layer_line = False + else: + layer_line = True + + print x + toks = x.split(",") + + for tok in toks: + tok = tok.strip() + tok_val = "" + try: + tok_val = float(tok) + try: + tok_val = int(tok) + except: + print (tok_val) + f2.write(str(tok_val) + " ") + #f2.write("tok_val = ", tok_val + " ") + except: + continue + + f2.write("\n") + + + f.close() + f2.close() + + diff --git a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh index cde03bd6d0ffa9969c785e17fe2f708c75396158..33a54cd0de626113e5cf11e2f6a6928d4fa384eb 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh +++ b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh @@ -3,11 +3,9 @@ export HPVM_TENSOR_RT_HOME=/home/hsharif3/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/ export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH -clang++ -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc +clang++ -I/software/cuda-9.1/include -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc llvm-dis --version llvm-dis ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc -cp ${HPVM_TENSOR_RT_HOME}/build/libtensor_runtime.a ${HPVM_TENSOR_RT_HOME}/lib/libtensor_runtime.a -cp ${HPVM_TENSOR_RT_HOME}/build_autotuner/libtensor_runtime.a ${HPVM_TENSOR_RT_HOME}/lib/libtensor_autotuner.a diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py index 
eeca0ed8ed8ed407b9c84592b22820857678b311..7e969271c20031dab9f302b333a4f7feb0338871 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
+++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py
@@ -7,9 +7,15 @@ # Batch 12: Error Sens: 10, 25, 35, for Loss1, 2, 3, respectively, Min: P3. 1000 Runs for All
 # Batch 13: No Error Sens: Equal Runs (1000) for all. Min: P1
 # Batch 14: Rerunning Batch12 with bugFix!
-# Batch 16: MAJOR CHANGE: 3 different skip levels for each Loss1,Loss2,Loss3
-# Batch 17: Baseline with 3000 runs. Compare with Batch16
+# Batch 15: MAJOR CHANGE: 3 different skip levels for each Loss1,Loss2,Loss3
+# Batch 18: Batch13 (Baseline) + ParetoCurve (1500 Runs) - BUGGY IGNORE!!!
+
+# Batch 19: (Baseline) + ParetoCurve + 2 runs in Tuning Phase (1500 Runs)
+# Batch 20: 3-Skip levels + 2 runs + 1500 Runs + EnergyBandSize now % of Max (Compare against Batch19)
+
+
+batch_id = "batch201"
 
 class Benchmark:
 
     def __init__(self):
@@ -46,21 +52,23 @@ Alexnet1.skip_layers = 0
 Alexnet1.skip_layer_str = "5_0"
 Alexnet1.base_dir = "../build_tuner/tuner_results/alexnet_cifar10/"
-Alexnet1.result_dir_1 = "../build_tuner/tuner_results/alexnet_cifar10/loss_1/batch17"
-Alexnet1.result_dir_2 = "../build_tuner/tuner_results/alexnet_cifar10/loss_2/batch17"
-Alexnet1.result_dir_3 = "../build_tuner/tuner_results/alexnet_cifar10/loss_3/batch17"
+Alexnet1.result_dir_1 = "../build_tuner/tuner_results/alexnet_cifar10/loss_1/" + batch_id
+Alexnet1.result_dir_2 = "../build_tuner/tuner_results/alexnet_cifar10/loss_2/" + batch_id
+Alexnet1.result_dir_3 = "../build_tuner/tuner_results/alexnet_cifar10/loss_3/" + batch_id
 
 Alexnet1.tensor_desc_file = "tuner_results/alexnet_cifar10/alexnet_tensors.txt"
 Alexnet1.layer_file = "tuner_results/alexnet_cifar10/alexnet_layers.txt"
 Alexnet1.cost_file = "../build_tuner/tuner_results/alexnet_cifar10/op_cost.txt"
+Alexnet1.layer_knobs = "../opentuner/data/alexnet/knobs.txt"
 
 #Alexnet1.loss1_result_file = "tuner_results/alexnet2_cifar10/alexnet_layers.txt"
 Alexnet1.loss1_result_file = "tuner_results/alexnet_cifar10/loss_1/promise_tuned_confs/promise_confs.txt"
 Alexnet1.loss2_result_file = "tuner_results/alexnet_cifar10/loss_2/promise_tuned_confs/promise_confs.txt"
 
-Alexnet1.autotuner_runs = 1000
+Alexnet1.autotuner_runs = 1500
 Alexnet1.tuner_accuracy = 79.9
-Alexnet1.promise_accuracy = 79.9
+#Alexnet1.promise_accuracy = 79.9
+Alexnet1.promise_accuracy = 79.5
 Alexnet1.validation_accuracy = 79.19
 
 bench_tuner_data["alexnet_cifar10"] = Alexnet1
@@ -79,17 +87,19 @@ Alexnet2.start_promise_range = 1
 Alexnet2.skip_layer_str = "6_1_0"
 Alexnet2.base_dir = "../build_tuner/tuner_results/alexnet2_cifar10/"
-Alexnet2.result_dir_1 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_1/batch17"
-Alexnet2.result_dir_2 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_2/batch17"
-Alexnet2.result_dir_3 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_3/batch17"
+Alexnet2.result_dir_1 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_1/" + batch_id
+Alexnet2.result_dir_2 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_2/" + batch_id
+Alexnet2.result_dir_3 = "../build_tuner/tuner_results/alexnet2_cifar10/loss_3/" + batch_id
 
 Alexnet2.tensor_desc_file = "tuner_results/alexnet2_cifar10/alexnet2_tensors.txt"
 Alexnet2.layer_file = "tuner_results/alexnet2_cifar10/alexnet2_layers.txt"
 Alexnet2.cost_file = "../build_tuner/tuner_results/alexnet2_cifar10/op_cost.txt"
+Alexnet2.layer_knobs = "../opentuner/data/alexnet2/knobs.txt"
 
 #Alexnet2.loss1_result_file = 
"tuner_results/alexnet2_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" #Alexnet2.loss2_result_file = "tuner_results/alexnet2_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet2.autotuner_runs = 1000 +Alexnet2.autotuner_runs = 1500 Alexnet2.tuner_accuracy = 84.19 -Alexnet2.promise_accuracy = 84.19 +#Alexnet2.promise_accuracy = 84.19 +Alexnet2.promise_accuracy = 84.8 Alexnet2.validation_accuracy = 85.15 bench_tuner_data["alexnet2_cifar10"] = Alexnet2 @@ -109,20 +119,22 @@ Alexnet3.start_promise_range = 1 Alexnet3.skip_layer_str = "14_3_4_1_6" Alexnet3.base_dir = "../build_tuner/tuner_results/vgg16_cifar10/" -Alexnet3.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar10/loss_1/batch17" -Alexnet3.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar10/loss_2/batch17" -Alexnet3.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar10/loss_3/batch17" +Alexnet3.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar10/loss_1/" + batch_id +Alexnet3.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar10/loss_2/" + batch_id +Alexnet3.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar10/loss_3/" + batch_id Alexnet3.tensor_desc_file = "tuner_results/vgg16_cifar10/vgg16_tensors.txt" Alexnet3.layer_file = "tuner_results/vgg16_cifar10/vgg16_layers.txt" Alexnet3.cost_file = "../build_tuner/tuner_results/vgg16_cifar10/op_cost.txt" +Alexnet3.layer_knobs = "../opentuner/data/vgg16_cifar10/knobs.txt" Alexnet3.loss1_result_file = "tuner_results/vgg16_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet3.loss2_result_file = "tuner_results/vgg16_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet3.autotuner_runs = 1000 +Alexnet3.autotuner_runs = 1500 Alexnet3.tuner_accuracy = 90.19 -Alexnet3.promise_accuracy = 90.19 +#Alexnet3.promise_accuracy = 90.19 +Alexnet3.promise_accuracy = 89.55 Alexnet3.validation_accuracy = 89.05 bench_tuner_data["vgg16_cifar10"] = Alexnet3 @@ -141,19 +153,21 @@ Alexnet4.start_promise_range = 1 #Alexnet4.skip_layer_str = "0" Alexnet4.skip_layer_str = "0_1_2_14_15_17_18_21" Alexnet4.base_dir = "../build_tuner/tuner_results/resnet18_cifar10/" -Alexnet4.result_dir_1 = "../build_tuner/tuner_results/resnet18_cifar10/loss_1/batch17" -Alexnet4.result_dir_2 = "../build_tuner/tuner_results/resnet18_cifar10/loss_2/batch17" -Alexnet4.result_dir_3 = "../build_tuner/tuner_results/resnet18_cifar10/loss_3/batch17" +Alexnet4.result_dir_1 = "../build_tuner/tuner_results/resnet18_cifar10/loss_1/" + batch_id +Alexnet4.result_dir_2 = "../build_tuner/tuner_results/resnet18_cifar10/loss_2/" + batch_id +Alexnet4.result_dir_3 = "../build_tuner/tuner_results/resnet18_cifar10/loss_3/" + batch_id Alexnet4.tensor_desc_file = "tuner_results/resnet18_cifar10/resnet_tensors.txt" -Alexnet4.layer_file = "tuner_results/resnet18_cifar10/resnet18_layers.txt" +Alexnet4.layer_file = "tuner_results/resnet18_cifar10/resnet_layers.txt" Alexnet4.cost_file = "../build_tuner/tuner_results/resnet18_cifar10/op_cost.txt" +Alexnet4.layer_knobs = "../opentuner/data/resnet/knobs.txt" Alexnet4.loss1_result_file = "tuner_results/resnet18_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet4.loss2_result_file = "tuner_results/resnet18_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet4.autotuner_runs = 1000 +Alexnet4.autotuner_runs = 1500 Alexnet4.tuner_accuracy = 89.6 -Alexnet4.promise_accuracy = 89.59 +#Alexnet4.promise_accuracy = 89.59 - 1000 images +Alexnet4.promise_accuracy = 89.94 Alexnet4.validation_accuracy = 89.65 
bench_tuner_data["resnet18_cifar10"] = Alexnet4 @@ -174,19 +188,21 @@ Alexnet5.start_promise_range = 1 #Alexnet5.skip_layer_str = "0" Alexnet5.skip_layer_str = "0_1_2_3_4" Alexnet5.base_dir = "../build_tuner/tuner_results/vgg16_cifar100/" -Alexnet5.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar100/loss_1/batch17" -Alexnet5.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar100/loss_2/batch17" -Alexnet5.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar100/loss_3/batch17" +Alexnet5.result_dir_1 = "../build_tuner/tuner_results/vgg16_cifar100/loss_1/" + batch_id +Alexnet5.result_dir_2 = "../build_tuner/tuner_results/vgg16_cifar100/loss_2/" + batch_id +Alexnet5.result_dir_3 = "../build_tuner/tuner_results/vgg16_cifar100/loss_3/" + batch_id Alexnet5.tensor_desc_file = "../build_tuner/tuner_results/vgg16_cifar100/vgg16_tensors.txt" Alexnet5.layer_file = "../build_tuner/tuner_results/vgg16_cifar100/vgg16_layers.txt" Alexnet5.cost_file = "../build_tuner/tuner_results/vgg16_cifar100/op_cost.txt" +Alexnet5.layer_knobs = "../opentuner/data/vgg16_cifar100/knobs.txt" Alexnet5.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet5.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet5.autotuner_runs = 1000 +Alexnet5.autotuner_runs = 1500 Alexnet5.tuner_accuracy = 67.95 -Alexnet5.promise_accuracy = 66.8 +#Alexnet5.promise_accuracy = 66.8 +Alexnet5.promise_accuracy = 70.1 Alexnet5.validation_accuracy = 68.65 bench_tuner_data["vgg16_cifar100"] = Alexnet5 @@ -206,17 +222,18 @@ Alexnet6.start_promise_range = 1 Alexnet6.skip_layer_str = "0" Alexnet6.base_dir = "../build_tuner/tuner_results/lenet_keras/" -Alexnet6.result_dir_1 = "../build_tuner/tuner_results/lenet_keras/loss_1/batch17" -Alexnet6.result_dir_2 = "../build_tuner/tuner_results/lenet_keras/loss_2/batch17" -Alexnet6.result_dir_3 = "../build_tuner/tuner_results/lenet_keras/loss_3/batch17" +Alexnet6.result_dir_1 = "../build_tuner/tuner_results/lenet_keras/loss_1/" + batch_id +Alexnet6.result_dir_2 = "../build_tuner/tuner_results/lenet_keras/loss_2/" + batch_id +Alexnet6.result_dir_3 = "../build_tuner/tuner_results/lenet_keras/loss_3/" + batch_id Alexnet6.tensor_desc_file = "tuner_results/lenet_keras/lenet_tensors.txt" Alexnet6.layer_file = "tuner_results/lenet_keras/lenet_layers.txt" Alexnet6.cost_file = "../build_tuner/tuner_results/lenet_keras/op_cost.txt" +Alexnet6.layer_knobs = "../opentuner/data/lenet/knobs.txt" #Alexnet6.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt" #Alexnet6.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet6.autotuner_runs = 500 +Alexnet6.autotuner_runs = 900 Alexnet6.tuner_accuracy = 98.9 Alexnet6.promise_accuracy = 98.9 Alexnet6.validation_accuracy = 99 @@ -239,20 +256,22 @@ Alexnet7.start_promise_range = 1 #Alexnet7.skip_layer_str = "0" Alexnet7.skip_layer_str = "1_14_0_6_2" Alexnet7.base_dir = "../build_tuner/tuner_results/mobilenet/" -Alexnet7.result_dir_1 = "../build_tuner/tuner_results/mobilenet/loss_1/batch17" -Alexnet7.result_dir_2 = "../build_tuner/tuner_results/mobilenet/loss_2/batch17" -Alexnet7.result_dir_3 = "../build_tuner/tuner_results/mobilenet/loss_3/batch17" +Alexnet7.result_dir_1 = "../build_tuner/tuner_results/mobilenet/loss_1/" + batch_id +Alexnet7.result_dir_2 = "../build_tuner/tuner_results/mobilenet/loss_2/" + batch_id +Alexnet7.result_dir_3 = 
"../build_tuner/tuner_results/mobilenet/loss_3/" + batch_id Alexnet7.tensor_desc_file = "tuner_results/mobilenet/mobilenet_ops.txt" Alexnet7.layer_file = "tuner_results/mobilenet/mobilenet_layer_comp.txt" Alexnet7.cost_file = "../build_tuner/tuner_results/mobilenet/op_cost.txt" +Alexnet7.layer_knobs = "../opentuner/data/mobilenet/knobs.txt" #--- Files below needed for VALIDATION experiment Alexnet7.loss1_result_file = "tuner_results/mobilenet/loss_1/batch1/promise_tuner/high_confidence/promise_confs.txt" Alexnet7.loss2_result_file = "tuner_results/mobilenet/loss_2/batch1/promise_tuner/high_confidence/promise_confs.txt" -Alexnet7.autotuner_runs = 1000 +Alexnet7.autotuner_runs = 1500 Alexnet7.tuner_accuracy = 84.8 -Alexnet7.promise_accuracy = 84.8 +#Alexnet7.promise_accuracy = 84.8 +Alexnet7.promise_accuracy = 83.65 Alexnet7.validation_accuracy = 84.4 bench_tuner_data["mobilenet_cifar10"] = Alexnet7 @@ -271,27 +290,29 @@ Alexnet8.start_promise_range = 1 #Alexnet8.skip_layer_str = "0" Alexnet8.skip_layer_str = "7_0_1" Alexnet8.base_dir = "../build_tuner/tuner_results/mobilenet_shallow/" -Alexnet8.result_dir_1 = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/batch17" -Alexnet8.result_dir_2 = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/batch17" -Alexnet8.result_dir_3 = "../build_tuner/tuner_results/mobilenet_shallow/loss_3/batch17" +Alexnet8.result_dir_1 = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/" + batch_id +Alexnet8.result_dir_2 = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/" + batch_id +Alexnet8.result_dir_3 = "../build_tuner/tuner_results/mobilenet_shallow/loss_3/" + batch_id Alexnet8.tensor_desc_file = "../build_tuner/tuner_results/mobilenet_shallow/mobilenet_shallow_ops.txt" Alexnet8.layer_file = "../build_tuner/tuner_results/mobilenet_shallow/mobilenet_shallow_layer_comp.txt" Alexnet8.cost_file = "../build_tuner/tuner_results/mobilenet_shallow/op_cost.txt" +Alexnet8.layer_knobs = "../opentuner/data/mobilenet_shallow/knobs.txt" Alexnet8.loss1_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/batch2/promise_tuner/high_confidence/promise_selected_confs.txt" Alexnet8.loss2_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/batch2/promise_tuner/high_confidence/promise_selected_confs.txt" -Alexnet8.autotuner_runs = 1000 +Alexnet8.autotuner_runs = 1500 Alexnet8.tuner_accuracy = 87.6 -Alexnet8.promise_accuracy = 87.59 +#Alexnet8.promise_accuracy = 87.59 +Alexnet8.promise_accuracy = 89.25 Alexnet8.validation_accuracy = 88.5 bench_tuner_data["mobilenet_shallow"] = Alexnet8 - +""" Alexnet9 = Benchmark() Alexnet9.tuner_binary = "fc4_clipped" Alexnet9.promise_binary = "" @@ -442,6 +463,6 @@ Pipeline5.validation_accuracy = 95 bench_tuner_data["pipeline_GSM"] = Pipeline5 - +""" diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py index ca1772637c0c294386c894238e457edc71c01ca5..6a07ef86e53d2b4b6372e1e253611ba6f018aaad 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py @@ -138,7 +138,7 @@ def loadConfigData(result_dir, baseline_accuracy): config.avg_loss = baseline_accuracy - avg_accuracy config.speedup = speedup config.fname = fname - print ("acc = " + str(avg_accuracy) + "\n") + #print ("acc = " + str(avg_accuracy) + "\n") else: flag = int(x.strip()) config.flags.append(flag) @@ -242,7 +242,8 @@ def buildConfigStr(config, layer_desc): def 
dumpConfig(layer_desc, config_arrs, result_dir): - f = open(result_dir + "/tuner_confs.txt", "w+") + + f = open(result_dir + "/tuner_confs_11.txt", "w+") it = 1 for config in config_arrs: @@ -274,34 +275,82 @@ def generateConf(Bench): +def dumpBaselineConfs(Bench): + + layer_desc = loadLayerDesc(Bench.layer_file) + + f = open(Bench.base_dir + "/tuner_confs_base.txt", "w+") + + f.write("+++++\n") + f.write("conf" + str(1) + " " + str(1) + " 0 " + str(Bench.promise_accuracy) + " " + str(0) + "\n") + + config = Config() + flags = [] + for i in range(Bench.num_layers): + flags.append(11) + + config.flags = flags + config_str = buildConfigStr(config, layer_desc) + + f.write(config_str) + f.write("-----\n") + + + + f.write("+++++\n") + f.write("conf" + str(2) + " " + str(1.5) + " 0 " + str(Bench.promise_accuracy) + " " + str(0) + "\n") + + config = Config() + flags = [] + for i in range(Bench.num_layers): + flags.append(10) + + config.flags = flags + config_str = buildConfigStr(config, layer_desc) + + f.write(config_str) + f.write("-----\n") + + + + + + if __name__ == "__main__": - """ Bench = bench_tuner_data["alexnet_cifar10"] - generateConf(Bench) - + #generateConf(Bench) + dumpBaselineConfs(Bench) + Bench = bench_tuner_data["alexnet2_cifar10"] - generateConf(Bench) - + #generateConf(Bench) + dumpBaselineConfs(Bench) + Bench = bench_tuner_data["vgg16_cifar10"] - generateConf(Bench) - + #generateConf(Bench) + dumpBaselineConfs(Bench) + Bench = bench_tuner_data["vgg16_cifar100"] - generateConf(Bench) - + #generateConf(Bench) + dumpBaselineConfs(Bench) + Bench = bench_tuner_data["resnet18_cifar10"] - generateConf(Bench) - + #generateConf(Bench) + dumpBaselineConfs(Bench) + Bench = bench_tuner_data["lenet_keras"] - generateConf(Bench) - - """ + #generateConf(Bench) + dumpBaselineConfs(Bench) + Bench = bench_tuner_data["mobilenet_cifar10"] - generateConf(Bench) + #generateConf(Bench) + dumpBaselineConfs(Bench) - #Bench = bench_tuner_data["mobilenet_shallow"] + Bench = bench_tuner_data["mobilenet_shallow"] #generateConf(Bench) + dumpBaselineConfs(Bench) + diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_algo_tuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_algo_tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..2df75fbfc4e7568361747f75f06a4b818a8f99be --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_algo_tuner.py @@ -0,0 +1,102 @@ + + +import os +import subprocess +from error_sensitivity import select_skip_layers + + +def runAlgoTunerCmd(Bench, dir_prefix, result_dir, acc_threshold, autotuner_runs): + + tuner_cmd = "python2 ../opentuner/autotuner/algo_tuner.py " + tuner_cmd += " --test-limit " + tuner_cmd += str(autotuner_runs) + tuner_cmd += " --binary ./" + tuner_cmd += Bench.promise_binary + tuner_cmd += " --num-layers " + tuner_cmd += str(Bench.num_layers) + tuner_cmd += " --result-dir " + tuner_cmd += dir_prefix + tuner_cmd += result_dir + "/algo_tuner/" + tuner_cmd += " --accuracy " + tuner_cmd += str(Bench.promise_accuracy - acc_threshold) + tuner_cmd += " --cost-file " + tuner_cmd += Bench.cost_file + tuner_cmd += " --knobs-config " + tuner_cmd += "../opentuner/data/global_knobs.txt" + tuner_cmd += " --layer-knobs " + tuner_cmd += Bench.layer_knobs + + + print (tuner_cmd) + + p = subprocess.Popen(tuner_cmd, shell=True) + p.wait() + + +""" + +def promiseTunerLoss1(Bench, dir_prefix): + + tuner_runs = int(Bench.autotuner_runs / 3) + + skip_layers1 = "0" + skip_layers2 = "0_" + select_skip_layers(Bench, 30) + skip_layers3 = "0_" 
+ select_skip_layers(Bench, 50) + + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs, skip_layers1) + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs, skip_layers2) + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs, skip_layers3) + + +def promiseTunerLoss2(Bench, dir_prefix): + + tuner_runs = int(Bench.autotuner_runs / 3) + + skip_layers1 = "0" + skip_layers2 = "0_" + select_skip_layers(Bench, 20) + skip_layers3 = "0_" + select_skip_layers(Bench, 40) + + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs, skip_layers1) + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs, skip_layers2) + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs, skip_layers3) + + + +def promiseTunerLoss3(Bench, dir_prefix): + + tuner_runs = int (Bench.autotuner_runs / 3) + + skip_layers1 = "0" + skip_layers2 = "0_" + select_skip_layers(Bench, 10) + skip_layers3 = "0_" + select_skip_layers(Bench, 30) + + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_3, 2.5, tuner_runs, skip_layers1) + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_3, 2.5, tuner_runs, skip_layers2) + runPromiseTunerCmd(Bench, dir_prefix, Bench.result_dir_3, 2.5, tuner_runs, skip_layers3) + + +""" + + +BASELINE = True + + +def runAlgoBench(Bench): + + # NOTE-IMP: Changing current directory to one with promise binaries + dir_prefix = "../build_tuner/" + + + if BASELINE: + tuner_runs = Bench.autotuner_runs + runAlgoTunerCmd(Bench, dir_prefix, Bench.result_dir_1, 0.85, tuner_runs) + runAlgoTunerCmd(Bench, dir_prefix, Bench.result_dir_2, 1.7, tuner_runs) + runAlgoTunerCmd(Bench, dir_prefix, Bench.result_dir_3, 2.5, tuner_runs) + + else: + promiseTunerLoss1(Bench, dir_prefix) + promiseTunerLoss2(Bench, dir_prefix) + promiseTunerLoss3(Bench, dir_prefix) + + + diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py index 2d4a3bb9ca0189e7889abeca2888f985d1bbe380..73d460be0c4091067c9d52e07ea7f4d421765ff3 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py @@ -5,16 +5,17 @@ import subprocess import shutil from swing_selection import loadLayerDesc -from error_sensitivity import test_sensitivity, test_sensitivity2, test_sensitivity3 +from error_sensitivity import test_sensitivity, test_sensitivity2, test_sensitivity3, test_sensitivity4 from benchmarks import bench_tuner_data from run_psnr import runPSNRTuner from run_ha_tuner import runTunerBench from run_hs_tuner import runPromiseBench +from run_algo_tuner import runAlgoBench from compute_confs import computePSNRBenchSwings, computeBenchSwings from validation import runPromiseBenchValidation, runPromiseBenchValidation2, runBenchValidation from profiling import startProfile, stopProfile, dumpProfiles from utils import createResultDirs - +from benchmarks import batch_id @@ -52,44 +53,44 @@ def computeLayerSwings(): -gpu = 1 + def runPromiseTuner(): - if gpu == 2: - start = startProfile("LeNet") - runPromiseBench(bench_tuner_data["lenet_keras"]) - stopProfile("LeNet", start) - - start = startProfile("Alexnet") - runPromiseBench(bench_tuner_data["alexnet_cifar10"]) - stopProfile("Alexnet", start) + + start = startProfile("MobileNet") + runPromiseBench(bench_tuner_data["mobilenet_cifar10"]) + stopProfile("MobileNet", start) + + start = startProfile("Alexnet") + 
runPromiseBench(bench_tuner_data["alexnet_cifar10"]) + stopProfile("Alexnet", start) - start = startProfile("Alexnet2") - runPromiseBench(bench_tuner_data["alexnet2_cifar10"]) - stopProfile("Alexnet2", start) + start = startProfile("Alexnet2") + runPromiseBench(bench_tuner_data["alexnet2_cifar10"]) + stopProfile("Alexnet2", start) - start = startProfile("ResNet") - runPromiseBench(bench_tuner_data["resnet18_cifar10"]) - stopProfile("ResNet", start) + start = startProfile("VGG16_10") + runPromiseBench(bench_tuner_data["vgg16_cifar10"]) + stopProfile("VGG16_10", start) - if gpu == 1: - - start = startProfile("VGG16_10") - runPromiseBench(bench_tuner_data["vgg16_cifar10"]) - stopProfile("VGG16_10", start) - - start = startProfile("VGG16_100") - runPromiseBench(bench_tuner_data["vgg16_cifar100"]) - stopProfile("VGG16_100", start) + start = startProfile("VGG16_100") + runPromiseBench(bench_tuner_data["vgg16_cifar100"]) + stopProfile("VGG16_100", start) + + start = startProfile("ResNet") + runPromiseBench(bench_tuner_data["resnet18_cifar10"]) + stopProfile("ResNet", start) - start = startProfile("MobileNet") - runPromiseBench(bench_tuner_data["mobilenet_cifar10"]) - stopProfile("MobileNet", start) + start = startProfile("MobileNet-SH") + runPromiseBench(bench_tuner_data["mobilenet_shallow"]) + stopProfile("MobileNet-SH", start) + + start = startProfile("LeNet") + runPromiseBench(bench_tuner_data["lenet_keras"]) + stopProfile("LeNet", start) + - start = startProfile("MobileNet-SH") - runPromiseBench(bench_tuner_data["mobilenet_shallow"]) - stopProfile("MobileNet-SH", start) #runPSNRPromiseBench("pipeline_GEOM") #runPSNRPromiseBench("pipeline_GEMO") @@ -97,20 +98,47 @@ def runPromiseTuner(): #runPSNRPromiseBench("pipeline_GSM") #runPSNRPromiseBench("pipeline_GSME") - dumpProfiles("time_profile_17.txt") + dumpProfiles("time_profile" + batch_id + ".txt") + + def runPromiseValidation(): - #runPromiseBenchValidation(bench_tuner_data["mobilenet_shallow"]) - - #runPromiseBenchValidation("mobilenet_cifar10") - #runPromiseBenchValidation("resnet18_cifar10") - #runPromiseBenchValidation("alexnet2_cifar10") - #runPromiseBenchValidation("vgg_cifar100") - #runPromiseBenchValidation("vgg16_cifar10") - runPromiseBenchValidation2(bench_tuner_data["lenet_keras"]) + start = startProfile("AlexNet") + runPromiseBenchValidation2(bench_tuner_data["alexnet_cifar10"]) + stopProfile("AlexNet", start) + + start = startProfile("AlexNet2") + runPromiseBenchValidation2(bench_tuner_data["alexnet2_cifar10"]) + stopProfile("AlexNet2", start) + + start = startProfile("VGG16_100") + runPromiseBenchValidation2(bench_tuner_data["vgg16_cifar100"]) + stopProfile("VGG16_100", start) + + start = startProfile("VGG16_10") + runPromiseBenchValidation2(bench_tuner_data["vgg16_cifar10"]) + stopProfile("VGG16_10", start) + #runPromiseBenchValidation2(bench_tuner_data["lenet_keras"]) + + start = startProfile("ResNet") + runPromiseBenchValidation2(bench_tuner_data["resnet18_cifar10"]) + stopProfile("ResNet", start) + + start = startProfile("MobileNet_SH") + runPromiseBenchValidation2(bench_tuner_data["mobilenet_shallow"]) + stopProfile("MobileNet_SH", start) + + start = startProfile("MobileNet") + runPromiseBenchValidation2(bench_tuner_data["mobilenet_cifar10"]) + stopProfile("MobileNet", start) + + + dumpProfiles("validation_prof" + batch_id + ".txt") + + def runAutotuner(): @@ -135,8 +163,44 @@ def runAutotuner(): def runSensAnalysis(): + + start = startProfile("LeNet") + test_sensitivity4(bench_tuner_data["lenet_keras"]) + 
stopProfile("LeNet", start) """ + start = startProfile("AlexNet") + test_sensitivity4(bench_tuner_data["alexnet_cifar10"]) + stopProfile("AlexNet", start) + + start = startProfile("AlexNet2") + test_sensitivity4(bench_tuner_data["alexnet2_cifar10"]) + stopProfile("AlexNet2", start) + + start = startProfile("ResNet") + test_sensitivity4(bench_tuner_data["resnet18_cifar10"]) + stopProfile("ResNet", start) + + start = startProfile("MobileNet") + test_sensitivity4(bench_tuner_data["mobilenet_cifar10"]) + stopProfile("MobileNet", start) + + start = startProfile("MobileNet_SH") + test_sensitivity4(bench_tuner_data["mobilenet_shallow"]) + stopProfile("MobileNet_SH", start) + + start = startProfile("VGG_10") + test_sensitivity4(bench_tuner_data["vgg16_cifar10"]) + stopProfile("VGG16_10", start) + + start = startProfile("VGG_100") + test_sensitivity4(bench_tuner_data["vgg16_cifar100"]) + stopProfile("VGG16_100", start) + + dumpProfiles("sens_time_prof.txt") + + """ + start = startProfile("LeNet") test_sensitivity3(bench_tuner_data["lenet_keras"]) stopProfile("LeNet", start) @@ -148,8 +212,7 @@ def runSensAnalysis(): start = startProfile("AlexNet2") test_sensitivity3(bench_tuner_data["alexnet2_cifar10"]) stopProfile("AlexNet2", start) - """ - + start = startProfile("ResNet") test_sensitivity3(bench_tuner_data["resnet18_cifar10"]) stopProfile("ResNet", start) @@ -163,7 +226,6 @@ def runSensAnalysis(): test_sensitivity3(bench_tuner_data["mobilenet_shallow"]) stopProfile("MobileNet_SH", start) - """ start = startProfile("VGG_10") test_sensitivity3(bench_tuner_data["vgg16_cifar10"]) stopProfile("VGG16_10", start) @@ -171,9 +233,7 @@ def runSensAnalysis(): start = startProfile("VGG_100") test_sensitivity3(bench_tuner_data["vgg16_cifar100"]) stopProfile("VGG16_100", start) - - """ - + dumpProfiles("sens_time_prof.txt") @@ -200,9 +260,39 @@ def runSensAnalysis(): test_sensitivity(bench_tuner_data["vgg16_cifar100"]) """ - + + +def runAlgoTuner(): + + Bench = bench_tuner_data["alexnet_cifar10"] + runAlgoBench(Bench) + + Bench = bench_tuner_data["mobilenet_shallow"] + runAlgoBench(Bench) + + Bench = bench_tuner_data["mobilenet_cifar10"] + runAlgoBench(Bench) + + Bench = bench_tuner_data["vgg16_cifar10"] + runAlgoBench(Bench) + + #Bench = bench_tuner_data["lenet_keras"] + #runAlgoBench(Bench) + + Bench = bench_tuner_data["alexnet2_cifar10"] + runAlgoBench(Bench) + + + Bench = bench_tuner_data["vgg16_cifar100"] + runAlgoBench(Bench) + + Bench = bench_tuner_data["resnet18_cifar10"] + runAlgoBench(Bench) + + + if __name__ == "__main__": createResultDirs(bench_tuner_data) @@ -213,8 +303,10 @@ if __name__ == "__main__": #computeLayerSwings() - runPromiseTuner() + #runPromiseTuner() + runAlgoTuner() + #runPromiseValidation() #runSensAnalysis() diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_different_clock_frequencies_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_different_clock_frequencies_testing_automator.py new file mode 100644 index 0000000000000000000000000000000000000000..d787af8ec350b7fa2f2eeb2b0ed4c3ae4c015c95 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_different_clock_frequencies_testing_automator.py @@ -0,0 +1,139 @@ +# Automates online benchmark testing with different clock speeds +# Input: GPU clock speed, DDR clock speed, set of benchmark names to test +# Set of benchmarks format: (full_bin_name, half_bin_name) +import os +import sys + +from subprocess import Popen, PIPE + +def set_clock_speeds(gpu_speed_mhz, 
ddr_speed_mhz): + def find_closest_clock_speed(goal_speed): + # Reads /sys/devices/17000000.gp10b/devfreq/17000000.gp10b/available_frequencies + # and finds the closest clock speed + AVAIL_FREQS = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/available_frequencies" + avail_freqs_file = open(AVAIL_FREQS, "r") + avail_speeds_lst = avail_freqs_file.read().strip().split() + avail_freqs_file.close() + + min_diff = abs(gpu_speed - int(avail_speeds_lst[0])) + closest_speed = int(avail_speeds_lst[0]) + for avail_speed in avail_speeds_lst[1:]: + avail_speed = int(avail_speed) + curr_diff = abs(gpu_speed - avail_speed) + if curr_diff < min_diff: + min_diff = curr_diff + closest_speed = avail_speed + return closest_speed + + new_conf_filename = 'jetson_clocks_conf%d_%d.txt' % (gpu_speed_mhz, ddr_speed_mhz) + curr_conf_filename = "jetson_clocks_conf_backup.txt" + if os.path.isfile(curr_conf_filename): + os.remove(curr_conf_filename) + + # Get the current configurations in a file + sudo_password = 'nvidia' + p = Popen(['sudo', '/home/nvidia/jetson_clocks.sh', '--store', curr_conf_filename], \ + stdin=PIPE, universal_newlines=True) + p.communicate(sudo_password + '\n') + assert p.returncode == 0 + + # Read the current config file in + curr_conf_file = open(curr_conf_filename, "r") + curr_confs = curr_conf_file.read().strip().split('\n') + curr_conf_file.close() + + GPU_MIN_FREQ = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq" + GPU_MAX_FREQ = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq" + GPU_CUR_FREQ = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq" + + DDR_UPDATE_PATH = "/sys/kernel/debug/bpmp/debug/clk/emc/rate" + + # Copy everything in the old configuration except for the GPU/DDR lines + new_conf_file = open(new_conf_filename, "w") + for line in curr_confs: + # Write the GPU clock frequencies at the end to configure the clocks even if + # the current configuration doesn't have one of the lines + if line.startswith(GPU_MIN_FREQ) or line.startswith(GPU_MAX_FREQ) or \ + line.startswith(GPU_CUR_FREQ) or line.startswith(DDR_UPDATE_PATH): + continue + else: + new_conf_file.write("%s\n" % line) + + MHZ_TO_HZ_MULT = 1000000 + gpu_speed = gpu_speed_mhz * MHZ_TO_HZ_MULT + ddr_speed = ddr_speed_mhz * MHZ_TO_HZ_MULT + + # Set GPU + closest_gpu_speed = find_closest_clock_speed(gpu_speed) + print("Setting GPU speed to %d" % closest_gpu_speed) + new_conf_file.write("%s:%d\n" % (GPU_MIN_FREQ, closest_gpu_speed)) + new_conf_file.write("%s:%d\n" % (GPU_MAX_FREQ, closest_gpu_speed)) + #new_conf_file.write("%s:%d\n" % (GPU_CUR_FREQ, closest_gpu_speed)) + + # Set DDR + new_conf_file.write("%s:%d\n" % (DDR_UPDATE_PATH, ddr_speed)) + new_conf_file.close() + + # Set the new configuration + p = Popen(['sudo', '/home/nvidia/jetson_clocks.sh', '--restore', new_conf_filename], \ + stdin=PIPE, universal_newlines=True) + p.communicate(sudo_password + '\n') + assert p.returncode == 0 + print("SUCCESSFULLY SET CLOCK SPEEDS") + + +def run_benchmark(bin_name, should_print_bin_output): + print("RUNNING %s" % bin_name) + proc = Popen("./%s" % bin_name, stdout = PIPE, universal_newlines = True) + proc_output = proc.communicate()[0] + assert proc.returncode == 0 + + if should_print_bin_output: + print(proc_output) + print("FINISHED RUNNING %s" % bin_name) + return proc_output + + +def parse_binary_output(proc_output): + avg_time_key_ind = proc_output.find("Average time:") + assert avg_time_key_ind >= 0 + avg_time = proc_output[avg_time_key_ind : proc_output.find("\n", 
avg_time_key_ind)] + print(avg_time) + return avg_time + + +# Input: a list of tuples of benchmark names +# Can change to input a file containing benchmarks to run +def run_benchmarks(benchmarks_filename, output_filename, should_print_bin_output): + benchmarks_file = open(benchmarks_filename, "r") + output_file = open(output_filename, "w") + + def parse_binary_names_tuple(tuple_line): + tuple_line = tuple_line.replace("(", "").replace(")", "").strip().split(',') + return tuple_line[0].strip(), tuple_line[1].strip() + + for line in benchmarks_file: + full_bin_name, half_bin_name = parse_binary_names_tuple(line) + output_file.write("%s: %s\n" % (full_bin_name, \ + parse_binary_output(run_benchmark(full_bin_name, should_print_bin_output)))) + output_file.write("%s: %s\n" % (half_bin_name, \ + parse_binary_output(run_benchmark(half_bin_name, should_print_bin_output)))) + + benchmarks_file.close() + output_file.close() + + +if __name__ == "__main__": + num_args = len(sys.argv) + + if num_args != 5 and num_args != 6: + print("Usage: python online_benchmark_testing_automator.py <gpu freq in MHz> <ddr freq in MHz> <binary_names_file> <output_file> [1 to print binary output]") + print("Binary names file format: (full_binary_name, half_binary_name)<newline>") + exit(1) + print("GPU clock speed: %s" % sys.argv[1]) + print("DDR clock speed: %s" % sys.argv[2]) + print("Benchmarks file name: %s" % sys.argv[3]) + print("Output file name: %s" % sys.argv[4]) + + set_clock_speeds(int(sys.argv[1]), int(sys.argv[2])) + run_benchmarks(sys.argv[3], sys.argv[4], num_args == 6 and sys.argv[-1] == "1") diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py new file mode 100644 index 0000000000000000000000000000000000000000..f1f00f4e285fbf487fee03bfee72dbe1a84ea55a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py @@ -0,0 +1,72 @@ +# Automates online benchmark testing with different clock speeds +# Input: set of benchmark names to test +# Set of benchmarks format: (full_bin_name, half_bin_name) +import os +import sys + +from collections import defaultdict +from subprocess import Popen, PIPE + +def run_benchmark(bin_name, should_print_bin_output): + print("RUNNING %s" % bin_name) + proc = Popen("./%s" % bin_name, stdout = PIPE, universal_newlines = True) + proc_output = proc.communicate()[0] + assert proc.returncode == 0 + + if should_print_bin_output: + print(proc_output) + print("FINISHED RUNNING %s" % bin_name) + return proc_output + + +def parse_binary_output(proc_output): + avg_time_key_ind = proc_output.find("Average time:") + assert avg_time_key_ind >= 0 + avg_time = proc_output[avg_time_key_ind : proc_output.find("\n", avg_time_key_ind)] + print(avg_time) + return avg_time + + +def get_sorted_binaries(builds_dir): + # dict of network names to lists of binaries + # list of binaries should be in sorted order (can do that when we run the benchmarks) + network_bins = defaultdict(list) + for bin_name in os.listdir(builds_dir): + if bin_name.find("profiling") == -1: + continue + network_name = bin_name[ : bin_name.rfind("_")] + network_bins[network_name].append(bin_name) + return network_bins + +# Input: a list of tuples of benchmark names +# Can change to input a file containing benchmarks to run +def run_benchmarks(sorted_bins, builds_dir, output_filename, should_print_bin_output = False): + def get_knob_id(bin_name): + return 
int(bin_name[bin_name.rfind("_") + 1 : ]) + + output_file = open(output_filename, "w", buffering = 0) + for network_name in sorted_bins: + # Sort the binaries in order by knob id + sorted_bins[network_name].sort(key = get_knob_id) + print("--------------------------------------") + print(network_name) + # Go through all binaries + for bin_name in sorted_bins[network_name]: + print(bin_name) + output_file.write("%s results\n" % bin_name) + output_file.write("%s: %s\n" % (bin_name, \ + parse_binary_output(run_benchmark(os.path.join(builds_dir, bin_name), \ + should_print_bin_output)))) + print("--------------------------------------\n") + output_file.close() + + +if __name__ == "__main__": + num_args = len(sys.argv) + + if num_args != 3: + print("Usage: python online_benchmark_testing_automator.py <builds dir> <outputs_file_name>") + exit(1) + print("Output file name: %s" % sys.argv[2]) + sorted_bins = get_sorted_binaries(sys.argv[1]) + run_benchmarks(sorted_bins, sys.argv[1], sys.argv[2]) diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..04f6c5eec378276cd0c89fcc7013cb6996a90f2f --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/cmakelists_generator.py @@ -0,0 +1,109 @@ +# Generates a CMakeLists.txt file for all generated files in a specific directory +# Input: Arbitrarily long list containing names of all generated files directories +# Ex: alexnet_cifar10_autogenerated_knobs mobilenet_cifar10_autogenerated_knobs +# If inputted 0 parameters: Generates CMakeLists.txt file for all generated files in CURRENT dir + +import sys +import os + +def get_all_generated_directory_names(): + ''' + Returns a list of all generated source code directories (<>_autogenerated_knobs) + in the current directory. Called when program is run with 0 args + ''' + generated_dir_names = [] + for dir_name in os.listdir("."): + print(dir_name) + if dir_name.endswith("autogenerated_knobs"): + generated_dir_names.append(dir_name) + return generated_dir_names + + +def generate_cmakelists_setup(cmakelists_file): + ''' + Copies over all the setup instructions (ex: finding libraries) from a "base" CMakeLists.txt + file. 
Ends copying when we find the first instance of add_executable
+
+    Args:
+        cmakelists_file: File object to write cmake instructions to
+
+    Assumption: All setup instructions appear before any add_executable instructions
+    '''
+    BASE_CMAKELISTS_PATH = "/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt"
+    base_cmakelists_file = open(os.path.join(BASE_CMAKELISTS_PATH, "CMakeLists.txt"), "r")
+
+    find_lib_line = ""
+
+    for line in base_cmakelists_file:
+        if line.find("add_executable") != -1:
+            break
+
+        elif line.startswith("#"):
+            continue
+
+        # Special case: ignore / if -I flag exists
+        elif line.find("/") != -1 and line.find("-I") == -1:
+            dot_dot_slash_ind = line.find("../")
+            dot_slash_ind = line.find("./")
+            if dot_dot_slash_ind != -1:
+                start_ind = dot_dot_slash_ind
+            elif dot_slash_ind != -1:
+                start_ind = dot_slash_ind
+            else:
+                slash_ind = line.find("/")
+                prev_space_ind = line[:slash_ind].rfind(" ")
+                start_ind = prev_space_ind + 1
+
+            old_rel_path = []
+            while start_ind < len(line):
+                if line[start_ind] == ")" or line[start_ind].isspace():
+                    break
+                old_rel_path.append(line[start_ind])
+                start_ind += 1
+            old_rel_path = ''.join(old_rel_path)
+            if os.path.isabs(old_rel_path):
+                cmakelists_file.write(line)
+            else:
+                new_path = os.path.join(BASE_CMAKELISTS_PATH, old_rel_path)
+                cmakelists_file.write(line.replace(old_rel_path, new_path))
+            continue
+        cmakelists_file.write(line)
+    base_cmakelists_file.close()
+
+
+def generate_cmakelists_file(cmakelists_file, source_file_dirs):
+    generate_cmakelists_setup(cmakelists_file)
+    LIBRARIES = "tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}"
+    cmake_instrs = []
+
+    for source_file_dir in source_file_dirs:
+        cmake_instrs.append("# %s" % source_file_dir)
+        for source_file in os.listdir(source_file_dir):
+            # Executable name = name of source code file without file extension
+            file_ext_ind = source_file.find(".cc")
+            if file_ext_ind == -1:
+                print("WARNING: Found file with wrong extension. Skipping. 
%s" % source_file) + continue + exec_name = source_file[ : file_ext_ind] + + source_file_path = os.path.join(source_file_dir, source_file) + cmake_instrs.append("add_executable(%s %s)" % (exec_name, source_file_path)) + cmake_instrs.append("target_link_libraries(%s %s)\n" % (exec_name, LIBRARIES)) + cmake_instrs.append("\n") + cmakelists_file.write('\n'.join(cmake_instrs)) + + +if __name__ == "__main__": + num_args = len(sys.argv) + + if num_args >= 2 and sys.argv[1] == "--usage": + print("python cmakelists_generator.py <names of all generated files directories>") + print("If given no parameters: Generates CMakeLists.txt file for all generated files in CURRENT directory") + exit(1) + + cmakelists_file = open("CMakeLists.txt", "w") + if num_args == 1: + generate_cmakelists_file(cmakelists_file, get_all_generated_directory_names()) + else: + generate_cmakelists_file(cmakelists_file, sys.argv[1:]) + cmakelists_file.close() diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py new file mode 100644 index 0000000000000000000000000000000000000000..d587a3b7b57b96c8eb61b2e3e63709c7745ed466 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/source_code_autogenerator.py @@ -0,0 +1,295 @@ +# Input: file of the following table format +# id knob configurations (arbitrary # of columns) orig_func_name new_func_name +# Input: file containing list of filenames to generate modified sources for +# Generates: +# a new directory called <original_source_nane>_different_knobs +# files named <original_source_name>_<id>.txt within their respective directories + +import glob +import sys +import os +import re + +class Approx: + FP32 = 0 + FP16 = 1 + PERF = 2 + SAMP = 3 + +class KnobConfiguration: + ''' + Stores the configurations as well as other useful information for each knob configuration + Stores: id (may factor out if ids are guaranteed to start at 0/1 and be consecutive) + original function name + modified function name + new function parameters (knobs) + new function call (modified function name(knobs)) + ''' + def __init__(self, raw_config): + ''' + Args: raw_config = line of configuration file to parse + ''' + line_as_lst = raw_config.strip().split() + # approx,<id> knob1,knob2,etc IGNORE old_fun_name new_fun_name + + approx_id_lst = line_as_lst[0].split(',') + assert len(approx_id_lst) == 2 + + self.id = int(approx_id_lst[1]) + + if approx_id_lst[0] == "fp32": + self.approx = Approx.FP32 + return # special case + elif approx_id_lst[0] == "fp16": + self.approx = Approx.FP16 + return # special case + elif approx_id_lst[0] == "perf": + self.approx = Approx.PERF + elif approx_id_lst[0] == "samp": + self.approx = Approx.SAMP + + self.orig_func_name = line_as_lst[-2] # Second to last element + self.modified_func_name = line_as_lst[-1] # Last element + self.params = line_as_lst[1].split(",") # First element = knob configuration + + + # DEBUG + def __repr__(self): + if self.approx == Approx.FP32: + return "FP32" + elif self.approx == Approx.FP16: + return "FP16" + + approx_type = None + if self.approx == Approx.PERF: + approx_type = "PERF" + elif self.approx == Approx.SAMP: + approx_type = "SAMP" + return "Approx: %s, ID: %d, Orig func nane: %s, Modified func nane: %s, Params: %s" \ + % (approx_type, self.id, self.orig_func_name, self.modified_func_name, \ + ', '.join(self.params)) + + +def get_new_path(old_path, orig_source_code_dir): + ''' + Returns a path that's compatible with 
the location of the generated source code + + Args: + old_path: Original path of file that's being included + orig_source_code_dir: Path to original source code dir wrt the current dir + ''' + if os.path.isabs(old_path): # Old path works + return old_path + # Adding an extra .. because the path should be wrt the generated directory + return os.path.join("..", orig_source_code_dir, old_path) + + +# "complete_line" = a valid line of code +def get_new_function_calls(complete_line, knob_config): + ''' + Returns a copy of an inputted line of code such that all instances of old + function calls are replaced with newFunctionCall(old params, knobs) + + Note: The old calls aren't completely overriden, as we still need the old parameters but + insert new parameters as well + + Args: + complete_line: A complete line of code to process + knob_config: KnobConfiguration object representing current configuration + ''' + orig_func_ind = complete_line.find(knob_config.orig_func_name) + new_line = [] + line_start_ind = 0 + last_ind = 0 + + while orig_func_ind != -1: + new_line.append(complete_line[line_start_ind : orig_func_ind]) + line_start_ind = complete_line.find(")", orig_func_ind) + 1 + + old_func_call = complete_line[complete_line.find("(", orig_func_ind): line_start_ind] + new_line.append("%s%s, %s)" % (knob_config.modified_func_name, old_func_call[:-1], ', '.join(knob_config.params))) + orig_func_ind = complete_line.find(knob_config.orig_func_name, line_start_ind) + new_line.append(complete_line[line_start_ind : ]) + return ''.join(new_line) + + +def convert_local_paths(file_contents, orig_source_dir): + ''' + Converts all local paths wrt the original source file's directory to paths compatible + with the current source code directory + + Args: + file_contents: String containing source code read from file + orig_source_dir: Path of original source code dir wrt the current directory + ''' + last_include_ind = file_contents.rfind("#include") + last_include_newline_ind = file_contents.find("\n", last_include_ind) + include_lines = file_contents[ : last_include_newline_ind].split("\n") + + new_file_contents = [] + for line in include_lines: + if line.startswith("#"): + include_file = line.split()[1] + if include_file.startswith("\""): + new_include_path = get_new_path(include_file.replace("\"", ""), orig_source_dir.replace("\"", "")) + new_file_contents.append("#include \"%s\"\n" % new_include_path) + else: + new_file_contents.append(line) + new_file_contents.append(file_contents[last_include_newline_ind : ]) + return '\n'.join(new_file_contents) + + +def generate_fp32_source(new_file, source_file, orig_source_dir): + # Copy the source code over + new_file_contents = convert_local_paths(source_file.read(), orig_source_dir) + new_file.write(new_file_contents) + + +def generate_fp16_source(knob_config, new_file, source_file, orig_source_dir): + file_contents = source_file.read() + + new_file_contents = convert_local_paths(file_contents, orig_source_dir) + + # Replace all tensorOperation calls with tensorHalfOperation calls + # Derived from ../bin/replace_half_calls.py + # NOTE: Not very portable but I don't see another way of ONLY replacing tensorOperation FUNCTION calls + new_file_contents = new_file_contents.replace("tensorConvolution", "tensorHalfConvolution") + new_file_contents = new_file_contents.replace("tensorAdd", "tensorHalfAdd") + new_file_contents = new_file_contents.replace("tensorRelu", "tensorHalfRelu") + new_file_contents = new_file_contents.replace("tensorRelu2", "tensorHalfRelu2") + 
new_file_contents = new_file_contents.replace("tensorTanh", "tensorHalfTanh") + new_file_contents = new_file_contents.replace("tensorPooling", "tensorHalfPooling") + new_file_contents = new_file_contents.replace("tensorGemmGPU", "tensorHalfGemmGPU") + + new_file.write(new_file_contents) + + +def generate_approx_source(knob_config, new_file, source_file, orig_source_dir): + new_file_contents = [] + + # Store complete line to handle cases where one line of code is split into two lines + complete_line = "" + for line in source_file: + # Replace the current path of the local include with a path that's compatible + # with the location of the generated source code + if line.startswith("#"): + include_file = line.split()[1] + if include_file.startswith("\""): + new_include_path = get_new_path(include_file.replace("\"", ""), orig_source_dir.replace("\"", "")) + new_file_contents.append("#include \"%s\"\n" % new_include_path) + else: + new_file_contents.append(line) + continue + # Handles case where 1 actual line of code is split into 2 lines + elif line.find("}") != -1 or line.find("{") != -1: + complete_line += line + new_file_contents.append(complete_line) + complete_line = "" + continue + elif line.find(";") == -1: # Last char is always \n + complete_line += line + continue + + complete_line += line + orig_func_ind = complete_line.find(knob_config.orig_func_name) + if orig_func_ind != -1: + new_file_contents.append(get_new_function_calls(complete_line, knob_config)) + else: + new_file_contents.append(complete_line) + complete_line = "" + new_file.write(''.join(new_file_contents)) + + +def generate_source_code(table, dir_name, filename, source_name): + ''' + Generates source code for all configurations in the table for one original source + Args + table: List of KnobConfigurations + dir_name: Directory new sources should be placed in + filename: Filename of original source + source_name: Filename without the file extension (ex: foo/blah.cc --> blah) + ''' + source_file = open(filename, "r") + orig_source_dir = os.path.dirname(filename) + + for knob_config in table: + source_file.seek(0, 0) + new_filename = os.path.join(dir_name, "%s_%s.cc" % (source_name, knob_config.id)) + new_file = open(new_filename, "w") + if knob_config.approx == Approx.FP16: + generate_fp16_source(knob_config, new_file, source_file, orig_source_dir) + elif knob_config.approx == Approx.FP32: + generate_fp32_source(new_file, source_file, orig_source_dir) + else: + generate_approx_source(knob_config, new_file, source_file, orig_source_dir) + + new_file.close() + print("Generated source code as %s" % new_filename) + source_file.close() + + +def generate_all_sources(table, orig_files_filename): + ''' + Generates directories and source code for all original sources for all knob configurations + Args: + table: List of KnobConfiguration objects + orig_files_filename: Filename of file containing all original source names to generate new + sources for + ''' + orig_files = open(orig_files_filename, "r") + for orig_filename in orig_files: + orig_filename = orig_filename.strip() + + # Source name = original filename without the .cc + last_slash_ind = orig_filename.rfind("/") + file_ext_ind = orig_filename.find(".cc") + if last_slash_ind == -1: + source_name = orig_filename[ : file_ext_ind] + else: + source_name = orig_filename[last_slash_ind + 1 : file_ext_ind] + print("Source name: %s" % source_name) + + # Start with a clean directory + dir_name = "%s_autogenerated_knobs" % source_name + print("Setting up directory: %s" % dir_name) + 
if os.path.isdir(dir_name): + print("Directory exists: clearing everything") + for old_file in glob.glob(os.path.join(dir_name, "*")): + os.remove(old_file) + + else: + print("Generating directory: %s" % dir_name) + os.makedirs(dir_name) + + generate_source_code(table, dir_name, orig_filename, source_name) + print("\n") + orig_files.close() + + +def parse_table(table_filename): + ''' + Given the filename of a table, parses the table into a list of KnobConfigurations + ''' + # Can we assume that the ids always start at 1 --> if so, can index by knobs + # else: need to use a dict + table = [] + table_file = open(table_filename, "r") + for raw_config in table_file: + table.append(KnobConfiguration(raw_config)) + table_file.close() + return table + + +if __name__ == "__main__": + num_args = len(sys.argv) + if num_args != 3: + print("Usage: python source_code_autogenerator.py <table file> <original filenames file>") + if num_args >= 2 and sys.argv[1] == "--usage": + print("Table file format: <id> <knob configurations separated by spaces> <orig func name> <new func name>") + print("Original filenames file: <original_filename><newline> etc") + else: + print("Run with --usage flag for more detailed information") + exit(1) + + table = parse_table(sys.argv[1]) + generate_all_sources(table, sys.argv[2]) diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc new file mode 100644 index 0000000000000000000000000000000000000000..7c9583f291ea908c4c89a8b56045e06585a4f83a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc @@ -0,0 +1,185 @@ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + + +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + + +bool Opentuner_run = false; + +int total_runs = 1; + + +/* NOTE: Reference Architecture to use for profiling */ +void testLenetTanh(){ + + if(Opentuner_run){ + total_runs = 1000000; + } + + + printf("********* Lenet-2 Architecture ********** \n"); + // FIXIT: Extend this to batch of images - currently 5 images + + int test_batch_size = 1000; + + uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size); + + void* input = readInputTensor("../model_params/lenet_params/datasets/t10k-images-idx3-ubyte", + CUDNN_DATA_FLOAT, + test_batch_size, 1, 28, 28); + + // NOTE: Filter descriptors do NOT have batch size + // NOTE: First two dims are output channels (configurable), input channels (MUST match input channels) + // IMP: The output channels matches the trained model - not the Lenet arch proposed in Andrew Ng's class + void* conv1_filter = readTrainedWeights("../model_params/lenet_keras/conv1.bin", + float_type, 32, 1, 5, 5); + void* conv1_bias = readTrainedWeights("../model_params/lenet_keras/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/lenet_keras/conv2.bin", + float_type, 64, 32, 5, 5); + void* conv2_bias = readTrainedWeights("../model_params/lenet_keras/conv2_bias.bin", + float_type, 1, 64, 1, 1); + void* fc1_weights = readTrainedWeights("../model_params/lenet_keras/fc1.bin", + float_type, 1, 1, 7*7*64, 1024); + void* fc1_bias = readTrainedWeights("../model_params/lenet_keras/fc1_bias.bin", + float_type, 1, 1024, 1, 1); + void* fc2_weights = readTrainedWeights("../model_params/lenet_keras/fc2.bin", + float_type, 
1, 1, 1024, 10); + void* fc2_bias = readTrainedWeights("../model_params/lenet_keras/fc2_bias.bin", + float_type, 1, 10, 1, 1); + + + + clearTensorMap(); + + for(int i = 0; i < total_runs; i++){ + + if(Opentuner_run){ + + const char* myfifo = "/tmp/myfifo"; + int fd = open(myfifo, O_RDONLY); + + int ret_val = fcntl(fd, F_GETFD); + if(ret_val == -1){ + printf("Invalid descriptor \n"); + abort(); + } + + char str[100]; + read(fd, str, 80); + if(strcmp(str, "stop_run") == 0){ + abort(); + } + + close(fd); + } + + + readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters + + // Start power and performance profiling + startProfiling(); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + + // NOTE: 'SAME' convolution + //void* conv1out = tensorConvPerfCuda(input, conv1_filter, 2, 2, 1, 1, + // conv_mode, conv_precision, 2, 2, 1); + + void* conv1out = tensorConvSampSim(input, conv1_filter, 2, 2, 1, 1, + conv_mode, conv_precision, 4, 0); + + // NOTE: For tensorAdd, the only dimension that MUST match is channels + tensorAdd(conv1out, conv1_bias); // NOTE: In place operation + + void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); + + void* conv1_tanh = tensorTanh(pool1out); + + // NOTE: input channels have to match between tensor op inputs and outputs + //void* conv2out = tensorConvPerfCuda(conv1_tanh, conv2_filter, 2, 2, 1, 1, + // conv_mode, conv_precision, 1, 2, 1); + + void* conv2out = tensorConvSampSim(conv1_tanh, conv2_filter, 2, 2, 1, 1, + conv_mode, conv_precision, 2, 0); + + tensorAdd(conv2out, conv2_bias); // NOTE: In place operation + + void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + + void* conv2_tanh = tensorTanh(pool2out); + + void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights); + + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + + void* tanh1out = tensorTanh(gemm1biasout); + + void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights); + + void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias); + + void* tanh2out = tensorTanh(gemm2_biasout); + + void* result = tensorSoftmax(tanh2out); + + // End profiling and dump output to profile.txt + stopProfiling(); + + float accuracy = computeAccuracy2(labels, test_batch_size, result); + dumpFinalAccuracy(accuracy); + + + //FIXME: remove the comment below to use piped autotuner + //dumpAccuracyNorms(); + freeOutputTensors(); + + if(Opentuner_run){ + + const char* myfifo = "/tmp/myfifo"; + int fd_out = open(myfifo, O_WRONLY); + int ret_val = fcntl(fd_out, F_GETFD); + if(ret_val == -1){ + printf("Invalid descriptor \n"); + abort(); + } + + const char* str = "completed***!\n\0"; + write(fd_out, str, strlen(str) + 1); // NOTE: write only the message bytes; a fixed count of 80 reads past the literal + close(fd_out); + } + + } + + dumpExecutionAccuracies(); + + +} + + + +int main(int argc, char* argv[]){ + + if (argc > 1){ + total_runs = atoi(argv[1]); + } + + llvm_hpvm_initTensorRt(0); + + testLenetTanh(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6ab2aed33b13a249214d94508e193d0b6049aaf --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling.cc @@ -0,0 +1,162 @@ +// Per tensor operation + +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include 
<unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +/* NOTE: Reference Architecture to use for profiling */ +void testCifarNet(){ + + printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); + + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); + + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. 
FIXIT: use enum + + startMemTracking(); + + int total_runs = 100; + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + int start = i * batch_size; + int end = (i + 1) * batch_size; + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + // FIRST Tensor Runtime CALL + profiler.resume_profiler(); + void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv1out, conv1_bias); + void* conv1_tanh = tensorTanh(conv1out); + + // 2nd Layer + void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv2out, conv2_bias); + void* conv2_tanh = tensorTanh(conv2out); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 3rd Layer + void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv3out, conv3_bias); + void* conv3_tanh = tensorTanh(conv3out); + + // 4th Layer + void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv4out, conv4_bias); + void* conv4_tanh = tensorTanh(conv4out); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 5th Layer + void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv5out, conv5_bias); + void* conv5_tanh = tensorTanh(conv5out); + + // 6th Layer + void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv6out, conv6_bias); + void* conv6_tanh = tensorTanh(conv6out); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + + // final FC Layer + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void* result = tensorSoftmax(gemm1biasout); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, result); + final_accuracy += accuracy; + + freeBatchMemory(); + } + } + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + stopProfiling(); + final_accuracy = (final_accuracy / batch_count) / total_runs; + dumpFinalAccuracy(final_accuracy); +} + + +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + testCifarNet(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc new file mode 100644 index 0000000000000000000000000000000000000000..f95a7bda4fc581e4c40d4882304156f2420f22a5 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc @@ -0,0 +1,262 @@ +// Per tensor operation + +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include 
<stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +void add_data(std::unordered_map<std::string, std::pair<double, double> >& total_time_energies, Profiler& profiler, const std::string& op_name){ + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + + auto itr = total_time_energies.find(op_name); + if (itr == total_time_energies.end()){ + total_time_energies.insert(std::make_pair(op_name, time_energy)); + } else { + itr->second.first += time_energy.first; + itr->second.second += time_energy.second; + } + profiler.reset(); +} + +/* NOTE: Reference Architecture to use for profiling */ +void testCifarNet(){ + + printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); + + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); + + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. 
FIXIT: use enum + + std::ofstream online_profiler_output; + online_profiler_output.open("online_output.txt"); + + startMemTracking(); + + // NOTE: CHANGED INPUT TO STANDARDIZE + int total_runs = 50; // FOR NOW 100; + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + Profiler profiler; + profiler.start_profiler(); + + // Get the total time and energy per tensor per run + std::unordered_map<std::string, std::pair<double, double> > total_time_energies; + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + int start = i * batch_size; + int end = (i + 1) * batch_size; + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + // FIRST Tensor Runtime CALL + profiler.resume_profiler(); + void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv1"); + + profiler.resume_profiler(); + tensorAdd(conv1out, conv1_bias); + add_data(total_time_energies, profiler, "Add1"); + + profiler.resume_profiler(); + void* conv1_tanh = tensorTanh(conv1out); + add_data(total_time_energies, profiler, "Tanh1"); + + // 2nd Layer + profiler.resume_profiler(); + void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv2"); + + profiler.resume_profiler(); + tensorAdd(conv2out, conv2_bias); + add_data(total_time_energies, profiler, "Add2"); + + profiler.resume_profiler(); + void* conv2_tanh = tensorTanh(conv2out); + add_data(total_time_energies, profiler, "Tanh2"); + + profiler.resume_profiler(); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + add_data(total_time_energies, profiler, "Pool1"); + + // 3rd Layer + profiler.resume_profiler(); + void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv3"); + + profiler.resume_profiler(); + tensorAdd(conv3out, conv3_bias); + add_data(total_time_energies, profiler, "Add3"); + + profiler.resume_profiler(); + void* conv3_tanh = tensorTanh(conv3out); + add_data(total_time_energies, profiler, "Tanh3"); + + // 4th Layer + profiler.resume_profiler(); + void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv4"); + + profiler.resume_profiler(); + tensorAdd(conv4out, conv4_bias); + add_data(total_time_energies, profiler, "Add4"); + + profiler.resume_profiler(); + void* conv4_tanh = tensorTanh(conv4out); + add_data(total_time_energies, profiler, "Tanh4"); + + profiler.resume_profiler(); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + add_data(total_time_energies, profiler, "Pool2"); + + // 5th Layer + profiler.resume_profiler(); + void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv5"); + + profiler.resume_profiler(); + tensorAdd(conv5out, conv5_bias); + add_data(total_time_energies, profiler, "Add5"); + + profiler.resume_profiler(); + void* conv5_tanh = tensorTanh(conv5out); + add_data(total_time_energies, profiler, "Tanh5"); + + // 6th Layer + profiler.resume_profiler(); + void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + 
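+      // add_data() pauses the profiler, accumulates the time/energy elapsed
+      // since the matching resume_profiler() under the given op name
+      // ("Conv6" below), and resets the counters before the next op is timed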
add_data(total_time_energies, profiler, "Conv6"); + + profiler.resume_profiler(); + tensorAdd(conv6out, conv6_bias); + add_data(total_time_energies, profiler, "Add6"); + + profiler.resume_profiler(); + void* conv6_tanh = tensorTanh(conv6out); + add_data(total_time_energies, profiler, "Tanh6"); + + profiler.resume_profiler(); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + add_data(total_time_energies, profiler, "Pool3"); + + // final FC Layer + profiler.resume_profiler(); + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + add_data(total_time_energies, profiler, "Mul1"); // ASSUMING that this is mul1 + + std::cout<<"-----------------------------------ADD 7--------------------------------\n"; + profiler.resume_profiler(); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + add_data(total_time_energies, profiler, "Add7"); + std::cout<<"-----------------------------------ADD 7 ENDS --------------------------------\n"; + + profiler.resume_profiler(); + void* result = tensorSoftmax(gemm1biasout); + add_data(total_time_energies, profiler, "Softmax1"); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, result); + final_accuracy += accuracy; + + freeBatchMemory(); + } + } + profiler.stop_profiler(); + + stopProfiling(); + //online_profiler_output << "Total time: " << total_time << ", " << total_energy << "\n"; + // Now compute the averages across batches + std::ofstream ofs; + std::string arr[] = {"Add1", "Add2", "Add3", "Add4", "Add5", "Add6", "Add7", + "Conv1", "Conv2", "Conv3", "Conv4", "Conv5", "Conv6", + "Mul1", + "Pool1", "Pool2", "Pool3", + "Softmax1", + "Tanh1", "Tanh2", "Tanh3", "Tanh4", "Tanh5", "Tanh6"}; + ofs.open("online_profiler_tensor_data.txt"); + std::vector<std::string> ordered_keys(std::begin(arr), std::end(arr)); + for (const std::string& key : ordered_keys){ + const auto& data_pair = total_time_energies[key]; + ofs << key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n'; + std::cout<< key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n'; + } + + /* + ofs.open("online_profiler_tensor_data.txt"); + for (const auto& tensor_data : total_time_energies){ + ofs << tensor_data.first << ": " << tensor_data.second.first / total_runs << "\t" << tensor_data.second.second / total_runs << '\n'; + }*/ + ofs.close(); + final_accuracy = (final_accuracy / batch_count) / total_runs; + dumpFinalAccuracy(final_accuracy); + online_profiler_output.close(); +} + + +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + testCifarNet(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet_cifar10_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..eee98920bdfde1de5e769b038c87432fc4d269e1 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet_cifar10_profiling.cc @@ -0,0 +1,124 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + + std::string dir_prefix = 
std::string("../model_params/alexnet_cifar10_front/"); + std::string input_path = dir_prefix + std::string("input.bin"); + //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + //uint8_t* labels = readLabels(labels_path.c_str(),10000); + std::string conv2d_1_w_path = dir_prefix + std::string("conv0.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv_bias0.bin"); + void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv3.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv_bias3.bin"); + void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv6.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv_bias6.bin"); + void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv7.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv_bias7.bin"); + void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv8.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv_bias8.bin"); + void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); + std::string dense_1_w_path = dir_prefix + std::string("fc12.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); + std::string dense_1_b_path = dir_prefix + std::string("fc_bias12.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + int total_runs = 100; + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + profiler.resume_profiler(); + void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); + void* var_1 = tensorAdd(var_0, conv2d_1_b); + void* var_2 = tensorTanh(var_1); + void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); + void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); + void* var_6 = tensorAdd(var_5, conv2d_2_b); + void* var_7 = tensorTanh(var_6); + void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); + void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void* var_11 = tensorAdd(var_10, conv2d_3_b); + void* var_12 = tensorTanh(var_11); + void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void* var_14 = tensorAdd(var_13, conv2d_4_b); + void* var_15 = tensorTanh(var_14); + void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void* var_17 = 
tensorAdd(var_16, conv2d_5_b); + void* var_18 = tensorTanh(var_17); + void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); + void* var_22 = tensorGemmGPU(var_19, dense_1_w); + void* var_23 = tensorAdd(var_22, dense_1_b); + void* var_24 = tensorSoftmax(var_23); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels,batch_size,var_24); + final_accuracy += accuracy; + + freeBatchMemory(); + } + } + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + stopProfiling(); + + final_accuracy = (final_accuracy / batch_count) / total_runs; // Average over both batches and repeated runs + dumpFinalAccuracy(final_accuracy); + + + llvm_hpvm_cleanupTensorRt(); + + return 0; + +} diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/lenet_keras_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/lenet_keras_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee323d068f60413090433ec013c985acafbd3406 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/lenet_keras_profiling.cc @@ -0,0 +1,182 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + + +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + + +bool Opentuner_run = false; + + +/* NOTE: Reference Architecture to use for profiling */ +void testLenetTanh(){ + + int total_runs = 100; + + printf("********* Lenet-2 Architecture ********** \n"); + // FIXIT: Extend this to batch of images - currently 5 images + + int test_batch_size = 5000; + + uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size); + + void* input = readInputTensor("../model_params/lenet_params/datasets/t10k-images-idx3-ubyte", + CUDNN_DATA_FLOAT, + test_batch_size, 1, 28, 28); + + // NOTE: Filter descriptors do NOT have batch size + // NOTE: First two dims are output channels (configurable), input channels (MUST match input channels) + // IMP: The output channels matches the trained model - not the Lenet arch proposed in Andrew Ng's class + void* conv1_filter = readTrainedWeights("../model_params/lenet_keras/conv1.bin", + float_type, 32, 1, 5, 5); + void* conv1_bias = readTrainedWeights("../model_params/lenet_keras/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/lenet_keras/conv2.bin", + float_type, 64, 32, 5, 5); + void* conv2_bias = readTrainedWeights("../model_params/lenet_keras/conv2_bias.bin", + float_type, 1, 64, 1, 1); + void* fc1_weights = readTrainedWeights("../model_params/lenet_keras/fc1.bin", + float_type, 1, 1, 7*7*64, 1024); + void* fc1_bias = readTrainedWeights("../model_params/lenet_keras/fc1_bias.bin", + float_type, 1, 1024, 1, 1); + void* fc2_weights = readTrainedWeights("../model_params/lenet_keras/fc2.bin", + float_type, 1, 1, 1024, 10); + void* fc2_bias = readTrainedWeights("../model_params/lenet_keras/fc2_bias.bin", + float_type, 1, 10, 1, 1); + + + + clearTensorMap(); + + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + for(int i 
= 0; i < total_runs; i++){ + + if(Opentuner_run){ + + const char* myfifo = "/tmp/myfifo"; + int fd = open(myfifo, O_RDONLY); + + int ret_val = fcntl(fd, F_GETFD); + if(ret_val == -1){ + printf("Invalid descriptor \n"); + abort(); + } + + char str[100]; + read(fd, str, 80); + if(strcmp(str, "stop_run") == 0){ + abort(); + } + + close(fd); + } + + + readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters + + // Start power and performance profiling + startProfiling(); + profiler.resume_profiler(); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + + // NOTE: 'SAME' convolution + void* conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1, + conv_mode, conv_precision); + + // NOTE: For tensorAdd, the only dimension that MUST match is channels + tensorAdd(conv1out, conv1_bias); // NOTE: In place operation + + void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); + + void* conv1_tanh = tensorTanh(pool1out); + + // NOTE: input channels have to match between tensor op inputs and outputs + void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv2out, conv2_bias); // NOTE: In place operation + + void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + + void* conv2_tanh = tensorTanh(pool2out); + + void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights); + + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + + void* tanh1out = tensorTanh(gemm1biasout); + + void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights); + + void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias); + + void* tanh2out = tensorTanh(gemm2_biasout); + + void* result = tensorSoftmax(tanh2out); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + std::cout<<"---------------------------------------\n"; + std::cout<<"ITERATION TIME: " << time_energy.first << '\n'; + std::cout<<"---------------------------------------\n"; + + // End profiling and dump output to profile.txt + stopProfiling(); + + computeAccuracy2(labels, test_batch_size, result); + + dumpAccuracyNorms(); + freeOutputTensors(); + + if(Opentuner_run){ + + const char* myfifo = "/tmp/myfifo"; + int fd_out = open(myfifo, O_WRONLY); + int ret_val = fcntl(fd_out, F_GETFD); + if(ret_val == -1){ + printf("Invalid descriptor \n"); + abort(); + } + + const char* str = "completed***!\n\0"; + write(fd_out, str, strlen(str) + 1); // NOTE: write only the message bytes; a fixed count of 80 reads past the literal + close(fd_out); + } + + } + + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + +} + + +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + testLenetTanh(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..66b7f2a6c4983a8e1f04dfe32f9b599340ea2d05 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc @@ -0,0 +1,435 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include 
<sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + + std::string dir_prefix = std::string("../model_params/mobilenet_quant/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); + std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); + void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); + void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); + void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); + std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin"); + void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); + std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); + void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); + void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); + void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); + std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); + void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); + void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); + void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 
0,1,64,1,1); + std::string depthwise_conv2d_2_w_path = dir_prefix + std::string("depthwise_conv2d_2_w.bin"); + void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); + std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); + void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); + void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); + void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); + std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); + void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); + void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); + void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); + std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin"); + void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); + std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); + void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); + void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); + void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); + std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void* batch_normalization_7_gamma = 
readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); + void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); + void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); + void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); + std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void* depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); + std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); + void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); + void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); + void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); + std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); + void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); + void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); + void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); + std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); + std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); + void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); + std::string 
batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); + void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); + void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); + std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); + void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); + void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); + void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); + std::string depthwise_conv2d_6_w_path = dir_prefix + std::string("depthwise_conv2d_6_w.bin"); + void* depthwise_conv2d_6_w = readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); + std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); + void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); + void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); + void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); + std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); + void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); + void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_13_variance_path = dir_prefix + 
std::string("batch_normalization_13_variance.bin"); + void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); + std::string depthwise_conv2d_7_w_path = dir_prefix + std::string("depthwise_conv2d_7_w.bin"); + void* depthwise_conv2d_7_w = readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); + std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); + void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); + void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_14_variance_path = dir_prefix + std::string("batch_normalization_14_variance.bin"); + void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); + std::string batch_normalization_15_gamma_path = dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void* batch_normalization_15_gamma = readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); + void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); + void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); + void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); + std::string depthwise_conv2d_8_w_path = dir_prefix + std::string("depthwise_conv2d_8_w.bin"); + void* depthwise_conv2d_8_w = readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); + std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); + void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); + void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_16_variance_path = dir_prefix + std::string("batch_normalization_16_variance.bin"); + void* batch_normalization_16_variance = readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void* conv2d_9_w = 
readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); + std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); + void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); + void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); + void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); + std::string depthwise_conv2d_9_w_path = dir_prefix + std::string("depthwise_conv2d_9_w.bin"); + void* depthwise_conv2d_9_w = readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); + std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void* batch_normalization_18_gamma = readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); + void* batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); + void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); + void* batch_normalization_18_variance = readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); + std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); + void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); + void* batch_normalization_19_mean = readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); + void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); + std::string depthwise_conv2d_10_w_path = dir_prefix + std::string("depthwise_conv2d_10_w.bin"); + void* depthwise_conv2d_10_w = readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); + std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void* batch_normalization_20_gamma = 
readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); + void* batch_normalization_20_beta = readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); + void* batch_normalization_20_mean = readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); + void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); + std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void* batch_normalization_21_gamma = readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); + void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); + void* batch_normalization_21_mean = readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); + void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); + std::string depthwise_conv2d_11_w_path = dir_prefix + std::string("depthwise_conv2d_11_w.bin"); + void* depthwise_conv2d_11_w = readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); + std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void* batch_normalization_22_gamma = readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); + void* batch_normalization_22_beta = readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); + void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); + void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); + std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void* batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); + void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 
0,1,512,1,1); + std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); + void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); + void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); + std::string depthwise_conv2d_12_w_path = dir_prefix + std::string("depthwise_conv2d_12_w.bin"); + void* depthwise_conv2d_12_w = readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); + std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void* batch_normalization_24_gamma = readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); + void* batch_normalization_24_beta = readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); + void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); + std::string batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); + void* batch_normalization_24_variance = readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); + std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); + void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_25_beta_path = dir_prefix + std::string("batch_normalization_25_beta.bin"); + void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); + void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); + void* batch_normalization_25_variance = readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); + std::string depthwise_conv2d_13_w_path = dir_prefix + std::string("depthwise_conv2d_13_w.bin"); + void* depthwise_conv2d_13_w = readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); + std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); + void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); + void* batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); + void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); + std::string 
batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); + void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); + std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); + void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); + void* batch_normalization_27_mean = readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); + std::string batch_normalization_27_variance_path = dir_prefix + std::string("batch_normalization_27_variance.bin"); + void* batch_normalization_27_variance = readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + + + + startMemTracking(); + startProfiling(); + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + int total_runs = 100; + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); + + profiler.resume_profiler(); + void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void* var_2 = tensorRelu(var_1); + void* var_4 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); + void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void* var_6 = tensorRelu(var_5); + void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void* var_9 = tensorRelu(var_8); + void* var_11 = tensorConvolution(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); + void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void* var_13 = tensorRelu(var_12); + void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); + void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void* var_16 = tensorRelu(var_15); + void* var_18 
= tensorConvolution(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); + void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void* var_20 = tensorRelu(var_19); + void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void* var_23 = tensorRelu(var_22); + void* var_26 = tensorConvolution(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); + void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void* var_28 = tensorRelu(var_27); + void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void* var_31 = tensorRelu(var_30); + void* var_33 = tensorConvolution(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); + void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void* var_35 = tensorRelu(var_34); + void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void* var_38 = tensorRelu(var_37); + void* var_41 = tensorConvolution(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); + void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void* var_43 = tensorRelu(var_42); + void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); + void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void* var_46 = tensorRelu(var_45); + void* var_48 = tensorConvolution(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); + void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void* var_50 = tensorRelu(var_49); + void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void* var_53 = tensorRelu(var_52); + void* var_55 = tensorConvolution(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); + void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void* var_57 = tensorRelu(var_56); + void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void* var_60 = tensorRelu(var_59); + void* var_63 = tensorConvolution(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); + void* var_64 = tensorBatchNorm(var_63, 
batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void* var_65 = tensorRelu(var_64); + void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); + void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void* var_68 = tensorRelu(var_67); + void* var_70 = tensorConvolution(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); + void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void* var_72 = tensorRelu(var_71); + void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void* var_75 = tensorRelu(var_74); + void* var_77 = tensorConvolution(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); + void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void* var_79 = tensorRelu(var_78); + void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); + void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void* var_82 = tensorRelu(var_81); + void* var_85 = tensorConvolution(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); + void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void* var_87 = tensorRelu(var_86); + void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); + void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void* var_90 = tensorRelu(var_89); + void* var_92 = tensorConvolution(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); + void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void* var_94 = tensorRelu(var_93); + void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void* var_97 = tensorRelu(var_96); + void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); + void* var_101 = tensorGemmGPU(var_99, dense_1_w); + void* var_102 = tensorAdd(var_101, dense_1_b); + void* var_103 = tensorSoftmax(var_102); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); + + float accuracy = computeAccuracy2(labels, batch_size, var_103); + final_accuracy += accuracy; + freeBatchMemory(); + } + } + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + stopProfiling(); + + final_accuracy = final_accuracy / 
(batch_count * total_runs); // average over all runs and batches + dumpFinalAccuracy(final_accuracy); + + + llvm_hpvm_cleanupTensorRt(); + + return 0; + +} diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_shallow_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_shallow_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c8d402b78ddd65057e75fadc9acd0e1dd4b6170 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/mobilenet_shallow_profiling.cc @@ -0,0 +1,224 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = std::string("../model_params/mobilenet_shallow/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); + std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); + void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); + void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); + void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); + std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin"); + void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); + std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); + void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); + void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); + std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); + void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); + std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void* batch_normalization_3_gamma = 
readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); + void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); + void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); + void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); + std::string depthwise_conv2d_2_w_path = dir_prefix + std::string("depthwise_conv2d_2_w.bin"); + void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); + std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); + void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); + void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); + void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,64,1,1); + std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); + void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); + void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); + void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,64,1,1); + std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin"); + void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,64,1,3,3); + std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); + void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,64,1,1); + std::string 
batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); + void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,64,1,1); + std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); + void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,64,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,64,1,1); + std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); + void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); + void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); + void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); + std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void* depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); + std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); + void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); + void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); + void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); + std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); + void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); + void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); + void* batch_normalization_9_variance 
= readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); + std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); + std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); + void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); + void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); + void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); + std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); + void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); + void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); + void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + int total_runs = 100; + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); + + profiler.resume_profiler(); + + void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void* var_2 = tensorRelu(var_1); + void* var_4 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); + void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, 
batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void* var_6 = tensorRelu(var_5); + void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void* var_9 = tensorRelu(var_8); + void* var_11 = tensorConvolution(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); + void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void* var_13 = tensorRelu(var_12); + void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); + void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void* var_16 = tensorRelu(var_15); + void* var_18 = tensorConvolution(var_16, depthwise_conv2d_3_w, 1, 1, 2, 2, 1, 64); + void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void* var_20 = tensorRelu(var_19); + void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void* var_23 = tensorRelu(var_22); + void* var_26 = tensorConvolution(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); + void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void* var_28 = tensorRelu(var_27); + void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void* var_31 = tensorRelu(var_30); + void* var_33 = tensorConvolution(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); + void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void* var_35 = tensorRelu(var_34); + void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void* var_38 = tensorRelu(var_37); + void* var_40 = tensorPooling(var_38,1,2,2,0,0,2,2); + void* var_42 = tensorGemmGPU(var_40, dense_1_w); + void* var_43 = tensorAdd(var_42, dense_1_b); + void* var_44 = tensorSoftmax(var_43); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); + + float accuracy = computeAccuracy2(labels, batch_size, var_44); + final_accuracy += accuracy; + freeBatchMemory(); + } + } + + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + final_accuracy = final_accuracy / (batch_count * total_runs); // average over all runs and batches + dumpFinalAccuracy(final_accuracy); + + + llvm_hpvm_cleanupTensorRt(); + + return 0; + +} diff --git 
a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/resnet18_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/resnet18_cifar10_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..30a8912ffbe71c69342e80af572db4fe4eea1289 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/resnet18_cifar10_profiling.cc @@ -0,0 +1,243 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = std::string("../model_params/resnet18_cifar10_3/"); + std::string input_path = dir_prefix + std::string("input.bin"); + //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void* 
conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); + 
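// NOTE: conv2d_10 and conv2d_17 above are the 1x1 stride-2 projection (shortcut) weights of the two downsampling residual stages (used by var_40 and var_70 in the forward pass below), which is why they are loaded out of numeric order. +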
std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + int total_runs = 100; + + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + profiler.resume_profiler(); + + void* var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void* var_3 = tensorAdd(var_2, conv2d_1_b); + void* var_4 = tensorRelu(var_3); + void* var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void* var_7 = tensorAdd(var_6, conv2d_2_b); + void* var_8 = tensorRelu(var_7); + void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void* var_11 = tensorAdd(var_10, conv2d_3_b); + void* var_12 = tensorAdd(var_4, var_11); + void* var_13 = tensorRelu(var_12); + void* var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void* var_16 = tensorAdd(var_15, conv2d_4_b); + void* var_17 = tensorRelu(var_16); + void* var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void* var_20 = tensorAdd(var_19, conv2d_5_b); + void* var_21 = tensorAdd(var_13, var_20); + void* var_22 = tensorRelu(var_21); + void* var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void* var_25 = tensorAdd(var_24, conv2d_6_b); + void* var_26 = tensorRelu(var_25); + void* var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void* var_29 = tensorAdd(var_28, conv2d_7_b); + void* var_30 = tensorAdd(var_22, var_29); + void* var_31 = tensorRelu(var_30); + void* var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); + void* var_34 = tensorAdd(var_33, conv2d_8_b); + void* var_35 = tensorRelu(var_34); + void* var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void* var_38 = tensorAdd(var_37, conv2d_9_b); + void* var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); + void* var_41 = tensorAdd(var_40, conv2d_10_b); + void* var_42 = tensorAdd(var_41, var_38); + void* var_43 = tensorRelu(var_42); + void* var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void* var_46 = tensorAdd(var_45, conv2d_11_b); + void* var_47 = tensorRelu(var_46); + void* var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void* var_50 = tensorAdd(var_49, conv2d_12_b); + void* var_51 = tensorAdd(var_43, 
var_50); + void* var_52 = tensorRelu(var_51); + void* var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void* var_55 = tensorAdd(var_54, conv2d_13_b); + void* var_56 = tensorRelu(var_55); + void* var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); + void* var_59 = tensorAdd(var_58, conv2d_14_b); + void* var_60 = tensorAdd(var_52, var_59); + void* var_61 = tensorRelu(var_60); + void* var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); + void* var_64 = tensorAdd(var_63, conv2d_15_b); + void* var_65 = tensorRelu(var_64); + void* var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); + void* var_68 = tensorAdd(var_67, conv2d_16_b); + void* var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); + void* var_71 = tensorAdd(var_70, conv2d_17_b); + void* var_72 = tensorAdd(var_71, var_68); + void* var_73 = tensorRelu(var_72); + void* var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); + void* var_76 = tensorAdd(var_75, conv2d_18_b); + void* var_77 = tensorRelu(var_76); + void* var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); + void* var_80 = tensorAdd(var_79, conv2d_19_b); + void* var_81 = tensorAdd(var_73, var_80); + void* var_82 = tensorRelu(var_81); + void* var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); + void* var_85 = tensorAdd(var_84, conv2d_20_b); + void* var_86 = tensorRelu(var_85); + void* var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); + void* var_89 = tensorAdd(var_88, conv2d_21_b); + void* var_90 = tensorAdd(var_82, var_89); + void* var_91 = tensorRelu(var_90); + void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); + void* var_94 = tensorGemmGPU(var_92, dense_1_w); + void* var_95 = tensorAdd(var_94, dense_1_b); + void* var_96 = tensorSoftmax(var_95); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + std::cout<<"-----------------------"<<time_energy.first<<"----------------------------\n"; + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels,batch_size,var_96); + final_accuracy += accuracy; + + freeBatchMemory(); + } + } + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + stopProfiling(); + + final_accuracy = final_accuracy / (batch_count * total_runs); // average over all runs and batches + dumpFinalAccuracy(final_accuracy); + + + llvm_hpvm_cleanupTensorRt(); + + return 0; + +} diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar100_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar100_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..160a97cb1437e3c31b82aefc1c055bd562ce48f9 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar100_profiling.cc @@ -0,0 +1,181 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = std::string("../model_params/vgg16_cifar100_front/"); + std::string input_path = dir_prefix + 
std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); + std::string 
conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); + + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + int total_runs = 100; + Profiler profiler; + profiler.start_profiler(); + double total_time = 0.0; + + for (int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); + + profiler.resume_profiler(); + + void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void* var_1 = tensorAdd(var_0, conv2d_1_b); + void* var_2 = tensorRelu(var_1); + void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void* var_5 = tensorAdd(var_4, conv2d_2_b); + void* var_6 = tensorRelu(var_5); + void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); + void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void* var_9 = tensorAdd(var_8, conv2d_3_b); + void* var_10 = tensorRelu(var_9); + void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void* var_13 = tensorAdd(var_12, conv2d_4_b); + void* var_14 = tensorRelu(var_13); + void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); + void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void* var_17 = tensorAdd(var_16, conv2d_5_b); + void* var_18 = tensorRelu(var_17); + void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void* var_21 = tensorAdd(var_20, conv2d_6_b); + void* var_22 = tensorRelu(var_21); + void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void* var_25 = tensorAdd(var_24, conv2d_7_b); + void* var_26 = tensorRelu(var_25); + void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); + void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void* var_29 = tensorAdd(var_28, conv2d_8_b); + void* var_30 = tensorRelu(var_29); + void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void* var_33 = tensorAdd(var_32, conv2d_9_b); + void* var_34 = tensorRelu(var_33); + void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void* var_37 = tensorAdd(var_36, conv2d_10_b); + void* var_38 = tensorRelu(var_37); + void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); + void* 
var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void* var_41 = tensorAdd(var_40, conv2d_11_b); + void* var_42 = tensorRelu(var_41); + void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void* var_45 = tensorAdd(var_44, conv2d_12_b); + void* var_46 = tensorRelu(var_45); + void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void* var_49 = tensorAdd(var_48, conv2d_13_b); + void* var_50 = tensorRelu(var_49); + void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); + void* var_54 = tensorGemmGPU(var_51, dense_1_w); + void* var_55 = tensorAdd(var_54, dense_1_b); + void* var_56 = tensorRelu(var_55); + void* var_58 = tensorGemmGPU(var_56, dense_2_w); + void* var_59 = tensorAdd(var_58, dense_2_b); + void* var_60 = tensorSoftmax(var_59); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); + final_accuracy += accuracy; + freeBatchMemory(); + + } + } + + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + final_accuracy = final_accuracy / (batch_count * total_runs); // average over all runs and batches + dumpFinalAccuracy(final_accuracy); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar10_profiling.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar10_profiling.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d9be320540fa2d1264004f35d16bc358a432413 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/vgg16_cifar10_profiling.cc @@ -0,0 +1,182 @@ +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = std::string("../model_params/vgg16_cifar10_2/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = 
readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); + std::string dense_2_b_path = dir_prefix + 
std::string("dense_2_b.bin"); + void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); + + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + Profiler profiler; + profiler.start_profiler(); + + double total_time = 0.0; + + int total_runs = 100; + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + profiler.resume_profiler(); + + void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void* var_1 = tensorAdd(var_0, conv2d_1_b); + void* var_2 = tensorRelu(var_1); + void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void* var_5 = tensorAdd(var_4, conv2d_2_b); + void* var_6 = tensorRelu(var_5); + void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); + void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void* var_9 = tensorAdd(var_8, conv2d_3_b); + void* var_10 = tensorRelu(var_9); + void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void* var_13 = tensorAdd(var_12, conv2d_4_b); + void* var_14 = tensorRelu(var_13); + void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); + void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void* var_17 = tensorAdd(var_16, conv2d_5_b); + void* var_18 = tensorRelu(var_17); + void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void* var_21 = tensorAdd(var_20, conv2d_6_b); + void* var_22 = tensorRelu(var_21); + void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void* var_25 = tensorAdd(var_24, conv2d_7_b); + void* var_26 = tensorRelu(var_25); + void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); + void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void* var_29 = tensorAdd(var_28, conv2d_8_b); + void* var_30 = tensorRelu(var_29); + void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void* var_33 = tensorAdd(var_32, conv2d_9_b); + void* var_34 = tensorRelu(var_33); + void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void* var_37 = tensorAdd(var_36, conv2d_10_b); + void* var_38 = tensorRelu(var_37); + void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); + void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void* var_41 = tensorAdd(var_40, conv2d_11_b); + void* var_42 = tensorRelu(var_41); + void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void* var_45 = tensorAdd(var_44, conv2d_12_b); + void* var_46 = tensorRelu(var_45); + void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void* var_49 = tensorAdd(var_48, conv2d_13_b); + void* var_50 = tensorRelu(var_49); + void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); + void* var_54 = tensorGemmGPU(var_51, dense_1_w); + void* var_55 = tensorAdd(var_54, dense_1_b); + void* var_56 = tensorRelu(var_55); + void* var_58 = tensorGemmGPU(var_56, dense_2_w); + void* var_59 = tensorAdd(var_58, dense_2_b); + void* var_60 = tensorSoftmax(var_59); + + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy(); + total_time += time_energy.first; + profiler.reset(); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels,batch_size,var_60); + final_accuracy += accuracy; + + freeBatchMemory(); 
+ } + } + profiler.stop_profiler(); + + std::cout<<"---------------------------------------\n"; + std::cout<<"Average time: " << total_time / total_runs << '\n'; + std::cout<<"---------------------------------------\n"; + + final_accuracy = final_accuracy / (batch_count * total_runs); // accuracy is accumulated over all total_runs x batch_count batches above + dumpFinalAccuracy(final_accuracy); + + llvm_hpvm_cleanupTensorRt(); + + return 0; + +} diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc index 6074dacf3f56e672ac5ca80eda572a53a58f1044..66e824f6d098434e140d764edda7cdacd11e110f 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc @@ -30,7 +30,7 @@ int main(int argc, char* argv[]){ } - llvm_hpvm_initTensorRt(1); + llvm_hpvm_initTensorRt(0); int missed = 0; for (int i = 0 ; i < total_runs; i++){ @@ -41,7 +41,7 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; + int test_input_size = 2000; int batch_size = 1000; int offset = 5000; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc index 0513723b5a4a36984e736b94ee82b9fc3fb2d1f9..6b951cffcaf142bd917abc7f7c04a2c691c472d7 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc @@ -31,9 +31,9 @@ int main(int argc, char* argv[]){ } - llvm_hpvm_initTensorRt(1); - + llvm_hpvm_initTensorRt(0); + int missed = 0; for (int i = 0 ; i < total_runs; i++){ @@ -43,15 +43,15 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; + int test_input_size = 2000; int batch_size = 1000; int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; - for(int i = 0; i < batch_count; i++){ - - std::string dir_prefix = std::string("../model_params/alexnet_cifar10_test/"); + for(int i = 0; i < batch_count; i++){ + + std::string dir_prefix = std::string("../model_params/alexnet_cifar10_test/"); std::string input_path = dir_prefix + std::string("input.bin"); std::string labels_path = dir_prefix + std::string("labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); @@ -79,6 +79,7 @@ int main(int argc, char* argv[]){ std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + int start = i * batch_size + offset; int end = (i + 1) * batch_size + offset; @@ -117,3 +118,4 @@ int main(int argc, char* argv[]){ return 0; } + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc index 1cf73cd92a39a14c6a1fdd3965e63bfabee634b1..052809f29b9d89534005e56125e66c5e4a0bd1cf 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc @@ -43,8 +43,10 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; - int batch_size = 1000; + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; + int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; @@ -330,93 +332,93 @@ int main(int argc, char* argv[]){ void* dense_1_b =
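+ // start/end below are shifted by offset (5000), so this run evaluates inputs
+ // [5000, 7000) instead of [0, 2000); presumably this keeps the test window
+ // disjoint from the inputs consumed during tuning.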
readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - int start = i * batch_size; - int end = (i + 1) * batch_size; + int start = i * batch_size + offset; + int end = (i + 1) * batch_size + offset; void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); void* var_0 = ConvLayer_PROMISE(input, -1.9892114, 2.126797, conv2d_1_w, -2.196306920051575, 1.347581704139706, NULL, 0, 0, 1, 1, 1, 1, -1, 0, -1, -60.89275047302246, 51.99256916046146, 9); - void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); void* var_2 = tensorRelu(var_1); void* var_3 = tensorConvolution(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); - void* var_4 = tensorBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void* var_4 = tensorHalfBatchNorm(var_3, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); void* var_5 = tensorRelu(var_4); void* var_6 = ConvLayer_PROMISE(var_5, 0.0, 5.713541553974245, conv2d_2_w, -0.9317721160650253, 1.0774258937835774, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -6.518589503288269, 6.810842518806449, 9); - void* var_7 = tensorBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void* var_7 = tensorHalfBatchNorm(var_6, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); void* var_8 = tensorRelu(var_7); void* var_9 = tensorConvolution(var_8, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); - void* var_10 = tensorBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void* var_10 = tensorHalfBatchNorm(var_9, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); void* var_11 = tensorRelu(var_10); void* var_12 = ConvLayer_PROMISE(var_11, 0.0, 4.932139402866376, conv2d_3_w, -0.5316544661521911, 0.5753790403604531, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.482631235122681, 3.96730119752885, 9); - void* var_13 = tensorBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void* var_13 = tensorHalfBatchNorm(var_12, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); void* var_14 = tensorRelu(var_13); void* var_15 = tensorConvolution(var_14, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); - void* var_16 = tensorBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void* var_16 = tensorHalfBatchNorm(var_15, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); void* var_17 = tensorRelu(var_16); void* var_18 = ConvLayer_PROMISE(var_17, 0.0, 4.103263397693674, conv2d_4_w, -0.36234098821878435, 0.4076913900375366, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.04261828327179, 3.88677932929993, 9); - void* var_19 = tensorBatchNorm(var_18, 
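+ // Each (gamma, beta, mean, variance) quadruple drives an inference-time batch
+ // normalization, y = gamma * (x - mean) / sqrt(variance + eps) + beta, with
+ // eps = 0.001 passed as the final argument. The tensorHalf* replacements are
+ // assumed to compute the same expression in FP16; that reading is inferred
+ // from the Half naming convention and is not stated in this patch.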
batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); void* var_20 = tensorRelu(var_19); void* var_21 = tensorConvolution(var_20, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); - void* var_22 = tensorBatchNorm(var_21, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void* var_22 = tensorHalfBatchNorm(var_21, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); void* var_23 = tensorRelu(var_22); void* var_24 = ConvLayer_PROMISE(var_23, 0.0, 5.383221302509475, conv2d_5_w, -0.3131200549006462, 0.29357679939270065, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -5.921469215393066, 4.338679324150087, 9); - void* var_25 = tensorBatchNorm(var_24, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void* var_25 = tensorHalfBatchNorm(var_24, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); void* var_26 = tensorRelu(var_25); void* var_27 = tensorConvolution(var_26, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); - void* var_28 = tensorBatchNorm(var_27, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void* var_28 = tensorHalfBatchNorm(var_27, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); void* var_29 = tensorRelu(var_28); void* var_30 = ConvLayer_PROMISE(var_29, 0.0, 4.316738154411368, conv2d_6_w, -0.23299247801303866, 0.2580290257930756, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.207789947509766, 3.932436970710759, 9); - void* var_31 = tensorBatchNorm(var_30, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void* var_31 = tensorHalfBatchNorm(var_30, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); void* var_32 = tensorRelu(var_31); void* var_33 = tensorConvolution(var_32, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); - void* var_34 = tensorBatchNorm(var_33, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); void* var_35 = tensorRelu(var_34); void* var_36 = ConvLayer_PROMISE(var_35, 0.0, 5.830408106803901, conv2d_7_w, -0.20233777219057084, 0.18998308175802117, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -6.298286915779113, 4.848135117530843, 9); - void* var_37 = tensorBatchNorm(var_36, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); void* var_38 = tensorRelu(var_37); void* var_39 = tensorConvolution(var_38, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); - void* var_40 = 
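+ // In the tensorConvolution calls here, the last argument is conv_groups.
+ // Setting it equal to the input channel count (32, 64, ..., 512) gives each
+ // filter a single input channel, i.e. MobileNet's depthwise convolutions;
+ // the surrounding 1x1 ConvLayer_PROMISE calls are the pointwise half of each
+ // depthwise-separable block.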
tensorBatchNorm(var_39, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void* var_40 = tensorHalfBatchNorm(var_39, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); void* var_41 = tensorRelu(var_40); void* var_42 = ConvLayer_PROMISE(var_41, 0.0, 4.446417809963227, conv2d_8_w, -0.17442735651135444, 0.17695830866694454, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.347910885810852, 3.6144364695549145, 9); - void* var_43 = tensorBatchNorm(var_42, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void* var_43 = tensorHalfBatchNorm(var_42, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); void* var_44 = tensorRelu(var_43); void* var_45 = tensorConvolution(var_44, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); - void* var_46 = tensorBatchNorm(var_45, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void* var_46 = tensorHalfBatchNorm(var_45, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); void* var_47 = tensorRelu(var_46); void* var_48 = ConvLayer_PROMISE(var_47, 0.0, 4.518095604896667, conv2d_9_w, -0.14546796187758446, 0.15256431668996823, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -3.0287702755928043, 2.9487365779876953, 9); - void* var_49 = tensorBatchNorm(var_48, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); void* var_50 = tensorRelu(var_49); void* var_51 = tensorConvolution(var_50, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); - void* var_52 = tensorBatchNorm(var_51, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); void* var_53 = tensorRelu(var_52); void* var_54 = ConvLayer_PROMISE(var_53, 0.0, 6.348575634956407, conv2d_10_w, -0.13025874522328376, 0.13558243343234128, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.2293100805282595, 3.5315046372413645, 9); - void* var_55 = tensorBatchNorm(var_54, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void* var_55 = tensorHalfBatchNorm(var_54, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); void* var_56 = tensorRelu(var_55); void* var_57 = tensorConvolution(var_56, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); - void* var_58 = tensorBatchNorm(var_57, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void* var_58 = tensorHalfBatchNorm(var_57, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); void* var_59 = tensorRelu(var_58); void* var_60 = ConvLayer_PROMISE(var_59, 0.0, 5.221003110408843, 
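+ // ConvLayer_PROMISE argument pattern, read off its declaration in
+ // tensor_runtime.ll below: input with its (min, max) range, weights with
+ // theirs, bias with its, seven integer knobs (padding, strides, and
+ // pool/activation selectors), the output (min, max) range, and a trailing
+ // swing level, 9 in every call here. The float pairs are the per-tensor
+ // quantization ranges the PROMISE path needs; the exact meaning of each
+ // integer knob is our inference and is not spelled out in this patch.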
conv2d_11_w, -0.11900172759592534, 0.12536374783515936, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.038203780174255, 4.004009407043483, 9); - void* var_61 = tensorBatchNorm(var_60, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void* var_61 = tensorHalfBatchNorm(var_60, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); void* var_62 = tensorRelu(var_61); void* var_63 = tensorConvolution(var_62, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); - void* var_64 = tensorBatchNorm(var_63, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); void* var_65 = tensorRelu(var_64); void* var_66 = ConvLayer_PROMISE(var_65, 0.0, 5.732498347759442, conv2d_12_w, -0.10839721685647964, 0.11625668607652187, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -3.3111015114784244, 4.462933233261136, 9); - void* var_67 = tensorBatchNorm(var_66, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); void* var_68 = tensorRelu(var_67); void* var_69 = tensorConvolution(var_68, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); - void* var_70 = tensorBatchNorm(var_69, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_71 = tensorRelu(var_70); + void* var_70 = tensorHalfBatchNorm(var_69, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void* var_71 = tensorHalfRelu(var_70); void* var_72 = ConvLayer_PROMISE(var_71, 0.0, 7.240498211860681, conv2d_13_w, -0.08623744961619377, 0.08859449951350662, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -4.175431394577027, 6.2043294754027345, 9); - void* var_73 = tensorBatchNorm(var_72, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_74 = tensorRelu(var_73); + void* var_73 = tensorHalfBatchNorm(var_72, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void* var_74 = tensorHalfRelu(var_73); void* var_75 = tensorConvolution(var_74, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); - void* var_76 = tensorBatchNorm(var_75, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void* var_76 = tensorHalfBatchNorm(var_75, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); void* var_77 = tensorRelu(var_76); void* var_78 = ConvLayer_PROMISE(var_77, 0.0, 7.813958834648251, conv2d_14_w, -0.06813025139272214, 0.07002027779817581, NULL, 0, 0, 0, 0, 1, 1, -1, 0, -1, -10.920566423416137, 2.6442912578582534, 9); - void* var_79 = tensorBatchNorm(var_78, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_80 = 
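+ // From this point to the final pooling, relu and batch norm switch to their
+ // tensorHalf* forms as well, presumably so the tensors at the tail of the
+ // network stay in FP16 rather than converting back to FP32 between ops.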
tensorRelu(var_79); - void* var_81 = tensorPooling(var_80,1,2,2,0,0,2,2); + void* var_79 = tensorHalfBatchNorm(var_78, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void* var_80 = tensorHalfRelu(var_79); + void* var_81 = tensorHalfPooling(var_80,1,2,2,0,0,2,2); void* var_82 = FCLayer_PROMISE(var_81, 0.0, 2.8692066650391013, dense_1_w, -0.22301019695401192, 0.1442659378200768, dense_1_b, -0.1654396, 0.23336112, -1, -12.245949958801269, 23.80532513427739, 9); void* var_83 = tensorSoftmax(var_82); diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc index 394ec85390aa4248fd93aefa339ff196f39a5559..42d26d34e65939b410143485a61f23e705906bfc 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc @@ -42,8 +42,10 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; - int batch_size = 1000; + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; + int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; @@ -188,8 +190,8 @@ int main(int argc, char* argv[]){ void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - int start = i * batch_size; - int end = (i + 1) * batch_size; + int start = i * batch_size + offset; + int end = (i + 1) * batch_size + offset; void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc index cc0981dc7d1d75ce56388f3135fa0f89f8c688e3..0e5cdd1d284e6c7621cd3331b924c06969be79db 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc @@ -30,7 +30,7 @@ int main(int argc, char* argv[]){ } - llvm_hpvm_initTensorRt(1); + llvm_hpvm_initTensorRt(0); int missed = 0; for (int i = 0 ; i < total_runs; i++){ @@ -41,9 +41,10 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; + int test_input_size = 2000; int batch_size = 1000; int offset = 5000; + int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc index ec5de9a5e2c2d66be44fdd99b83dd634d8f5b2f9..33c68eae84a075f50b2bc8e7484036c54ade5620 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc @@ -31,7 +31,7 @@ int main(int argc, char* argv[]){ } - llvm_hpvm_initTensorRt(1); + llvm_hpvm_initTensorRt(0); int missed = 0; @@ -43,8 +43,10 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; - int batch_size = 1000; + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; + int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; @@ -115,8 +117,9 @@ int main(int argc, char* argv[]){ void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); - int start = i * batch_size; - int end = (i + 1) * batch_size; + int start = i 
* batch_size + offset; + int end = (i + 1) * batch_size + offset; + void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc index 798b5f67aa9636f8e7ad3b9d08b9fc8e53cb137d..ff767235e9d44139f97ad885aa89eef1c385ad33 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc @@ -29,7 +29,7 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } - llvm_hpvm_initTensorRt(1); + llvm_hpvm_initTensorRt(0); int missed = 0; for (int i = 0 ; i < total_runs; i++){ @@ -40,7 +40,7 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 1000; + int test_input_size = 2000; int batch_size = 1000; int offset = 5000; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/resnet18_cifar10_cudaperf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/resnet18_cifar10_cudaperf.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e33715e8c6972966e7359a1e7b8fc5069e1f16f --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/resnet18_cifar10_cudaperf.cc @@ -0,0 +1,221 @@ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + +int main(){ + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = std::string("../model_params/resnet18_cifar10_3/"); + std::string input_path = dir_prefix + std::string("input.bin"); + //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_6_w_path = 
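+ // readTrainedWeights takes the file path, an offset, and the NCHW dimensions
+ // of the tensor being loaded: conv filters load as (out_channels,
+ // in_channels, kernel_h, kernel_w), e.g. conv2d_1_w above is 16x3x3x3, and
+ // biases load as (1, channels, 1, 1).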
dir_prefix + std::string("conv2d_6_w.bin"); + void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void* conv2d_16_w = 
readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + + + startMemTracking(); + + int test_input_size = 10000; + int batch_size = 2000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + for(int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + void* var_2 = tensorConvPerfCuda(input, conv2d_1_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_3 = tensorAdd(var_2, conv2d_1_b); + void* var_4 = tensorRelu(var_3); + void* var_6 = tensorConvPerfCuda(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_7 = tensorAdd(var_6, conv2d_2_b); + void* var_8 = tensorRelu(var_7); + void* var_10 = tensorConvPerfCuda(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_11 = tensorAdd(var_10, conv2d_3_b); + void* var_12 = tensorAdd(var_4, var_11); + void* var_13 = tensorRelu(var_12); + void* var_15 = tensorConvPerfCuda(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_16 = tensorAdd(var_15, conv2d_4_b); + void* var_17 = tensorRelu(var_16); + void* var_19 = tensorConvPerfCuda(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_20 = tensorAdd(var_19, conv2d_5_b); + void* var_21 = tensorAdd(var_13, var_20); + void* var_22 = tensorRelu(var_21); + void* var_24 = tensorConvPerfCuda(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0, 3, 1, 2); + void* var_25 = tensorAdd(var_24, conv2d_6_b); + void* var_26 = tensorRelu(var_25); + void* var_28 = tensorConvPerfCuda(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_29 = tensorAdd(var_28, conv2d_7_b); + void* var_30 = tensorAdd(var_22, var_29); + void* var_31 = tensorRelu(var_30); + void* var_33 = tensorConvPerfCuda(var_31, conv2d_8_w, 1, 1, 
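+ // tensorConvPerfCuda appends (row, col, start) to the usual convolution
+ // arguments; see the declaration added to approx_api.h later in this patch.
+ // (1, 1, 0) is the exact baseline, while a triple such as (3, 1, 2) above
+ // perforates every third output row starting at offset 2 and interpolates
+ // the skipped values afterwards. That reading of the triple follows the
+ // perf knob entries in global_knobs.txt, which carry the same encoding.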
2, 2, 1, 0, 1, 1, 0); + void* var_34 = tensorAdd(var_33, conv2d_8_b); + void* var_35 = tensorRelu(var_34); + void* var_37 = tensorConvPerfCuda(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_38 = tensorAdd(var_37, conv2d_9_b); + void* var_40 = tensorConvPerfCuda(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0, 1, 1, 0); + void* var_41 = tensorAdd(var_40, conv2d_10_b); + void* var_42 = tensorAdd(var_41, var_38); + void* var_43 = tensorRelu(var_42); + void* var_45 = tensorConvPerfCuda(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0, 3, 1, 0); + void* var_46 = tensorAdd(var_45, conv2d_11_b); + void* var_47 = tensorRelu(var_46); + void* var_49 = tensorConvPerfCuda(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_50 = tensorAdd(var_49, conv2d_12_b); + void* var_51 = tensorAdd(var_43, var_50); + void* var_52 = tensorRelu(var_51); + void* var_54 = tensorConvPerfCuda(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_55 = tensorAdd(var_54, conv2d_13_b); + void* var_56 = tensorRelu(var_55); + void* var_58 = tensorConvPerfCuda(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0, 1, 3, 1); + void* var_59 = tensorAdd(var_58, conv2d_14_b); + void* var_60 = tensorAdd(var_52, var_59); + void* var_61 = tensorRelu(var_60); + void* var_63 = tensorConvPerfCuda(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0, 1, 1, 0); + void* var_64 = tensorAdd(var_63, conv2d_15_b); + void* var_65 = tensorRelu(var_64); + void* var_67 = tensorConvPerfCuda(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_68 = tensorAdd(var_67, conv2d_16_b); + void* var_70 = tensorConvPerfCuda(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0, 3, 1, 2); + void* var_71 = tensorAdd(var_70, conv2d_17_b); + void* var_72 = tensorAdd(var_71, var_68); + void* var_73 = tensorRelu(var_72); + void* var_75 = tensorConvPerfCuda(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_76 = tensorAdd(var_75, conv2d_18_b); + void* var_77 = tensorRelu(var_76); + void* var_79 = tensorConvPerfCuda(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0, 1, 3, 0); + void* var_80 = tensorAdd(var_79, conv2d_19_b); + void* var_81 = tensorAdd(var_73, var_80); + void* var_82 = tensorRelu(var_81); + void* var_84 = tensorConvPerfCuda(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_85 = tensorAdd(var_84, conv2d_20_b); + void* var_86 = tensorRelu(var_85); + void* var_88 = tensorConvPerfCuda(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0, 1, 1, 0); + void* var_89 = tensorAdd(var_88, conv2d_21_b); + void* var_90 = tensorAdd(var_82, var_89); + void* var_91 = tensorRelu(var_90); + void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); + void* var_94 = tensorGemmGPU(var_92, dense_1_w); + void* var_95 = tensorAdd(var_94, dense_1_b); + void* var_96 = tensorSoftmax(var_95); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels,batch_size,var_96); + final_accuracy += accuracy; + + freeBatchMemory(); + } + + stopProfiling(); + + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + + + llvm_hpvm_cleanupTensorRt(); + + return 0; + +} diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc index bc2d416ce655641c58b304bbc07384c6cada6f8a..dfa411126089849337929c7d9f631cf7e3cd3143 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc @@ -345,15 +345,15 @@ void testTensorGroupedConv(){ void* x3 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); // NOTE: 
Filter descriptors do NOT have batch size // NOTE: First two dims are output channels (configurable), input channels (MUST match input channels) - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); + void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); fillTensorWithOnes(x3); fillTensorWithOnes(filter); int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int conv_groups = 2; - void* conv1 = tensorConvolution(x3, filter, 0, 0, - 1, 1, conv_mode, conv_groups); + void* conv1 = tensorConvolution(x3, filter, 2, 2, + 2, 2, conv_mode, conv_groups); printTensorValues(conv1); // NOTE: For cudnnTensorAdd, the only dimension that MUST match is channels @@ -474,6 +474,38 @@ void testQuantization(){ +void testSampleFilter(){ + + printf("***** Tensor Sample Filter ***** \n\n"); + Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 2, 2, 2); + fillTensorWithVal(input, 3); + + /* float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + host_ptr[0] = -0.1; + host_ptr[1] = -25; + host_ptr[2] = 0.2; + host_ptr[3] = -0.4; + host_ptr[4] = 1.7; + host_ptr[5] = -2.9; + host_ptr[6] = 0.7; + host_ptr[7] = 0.99; + */ + + printTensorValues(input); + + printf("\n\n"); + + hpvm_request_tensor(input, DEVICE); + + sampleFilter(input, 2, 1); + + hpvm_request_tensor(input, HOST); + + printTensorValues(input); +} + + + int main(){ llvm_hpvm_initTensorRt(0); @@ -490,7 +522,7 @@ int main(){ //testTensorConv(); //testTensorGroupedConv(); - testTensorBatchNorm(); + //testTensorBatchNorm(); //testTensorGemm(); //testTensorGemmGPU(); @@ -499,6 +531,10 @@ int main(){ //testTensorConv3(); //testLRN(); + + testSampleFilter(); + + stopProfiling(); return 0; diff --git a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll index 3e48a094b89ac506cf50f712a0d60b1bac95f75d..89c8da90f8ab740062bd84cdd365baa67311a7a4 100644 --- a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll +++ b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll @@ -8,8 +8,8 @@ define void @_Z13dummyFunctionv() #0 { entry: %initRT = alloca i8*, align 8 %cleanRT = alloca i8*, align 8 - %initApproxhpvmRT = alloca i8*, align 8 - %cleaApproxhpvmRT = alloca i8*, align 8 + %initApproxRT = alloca i8*, align 8 + %cleanApproxRT = alloca i8*, align 8 %initRTController = alloca i8*, align 8 %cleanRTController = alloca i8*, align 8 %request_tensorPtr = alloca i8*, align 8 @@ -44,17 +44,18 @@ entry: %ConvLayer = alloca i8*, align 8 %FCLayer = alloca i8*, align 8 %ConvLayer2 = alloca i8*, align 8 + %ConvLayer3 = alloca i8*, align 8 %FCLayer2 = alloca i8*, align 8 %AddWrapper = alloca i8*, align 8 %ReluWrapper = alloca i8*, align 8 %TanhWrapper = alloca i8*, align 8 %BatchNormWrapper = alloca i8*, align 8 %PoolingWrapper = alloca i8*, align 8 - %SoftmaxWrapper = alloca i8*, align 8 + %softmaxWrapper = alloca i8*, align 8 store i8* bitcast (void (i32)* @llvm_hpvm_initTensorRt to i8*), i8** %initRT, align 8 store i8* bitcast (void ()* @llvm_hpvm_cleanupTensorRt to i8*), i8** %cleanRT, align 8 - store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxhpvmRT, align 8 - store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleaApproxhpvmRT, align 8 + store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxRT, align 8 + store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleanApproxRT, align 8 store i8* bitcast (void (i8*, i8*)* 
@llvm_hpvm_initializeRuntimeController to i8*), i8** %initRTController, align 8 store i8* bitcast (void ()* @llvm_hpvm_clearRuntimeController to i8*), i8** %cleanRTController, align 8 store i8* bitcast (void (i8*, i32)* @hpvm_request_tensor to i8*), i8** %request_tensorPtr, align 8 @@ -89,13 +90,14 @@ entry: store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, i32, i32, i32, i32, i32, i32, float, float, i32)* @ConvLayer_PROMISE to i8*), i8** %ConvLayer, align 8 store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, float, float, i32)* @FCLayer_PROMISE to i8*), i8** %FCLayer, align 8 store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float)* @wrapper_ConvLayer to i8*), i8** %ConvLayer2, align 8 + store i8* bitcast (i8* (i8*, i8*, i8*, i32, i32, i32, i32, i32, i32)* @wrapper_tensorGroupConvolution to i8*), i8** %ConvLayer3, align 8 store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, float, float)* @wrapper_FCLayer to i8*), i8** %FCLayer2, align 8 store i8* bitcast (i8* (i8*, i8*, i8*)* @wrapper_tensorAdd to i8*), i8** %AddWrapper, align 8 store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorRelu to i8*), i8** %ReluWrapper, align 8 store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorTanh to i8*), i8** %TanhWrapper, align 8 store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i8*, i8*, double)* @wrapper_tensorBatchNorm to i8*), i8** %BatchNormWrapper, align 8 store i8* bitcast (i8* (i8*, i8*, i32, i32, i32, i32, i32, i32, i32)* @wrapper_tensorPooling to i8*), i8** %PoolingWrapper, align 8 - store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %SoftmaxWrapper, align 8 + store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %softmaxWrapper, align 8 ret void } @@ -175,6 +177,8 @@ declare i8* @FCLayer_PROMISE(i8*, float, float, i8*, float, float, i8*, float, f declare i8* @wrapper_ConvLayer(i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float) #1 +declare i8* @wrapper_tensorGroupConvolution(i8*, i8*, i8*, i32, i32, i32, i32, i32, i32) #1 + declare i8* @wrapper_FCLayer(i8*, i8*, i8*, i8*, i32, float, float) #1 declare i8* @wrapper_tensorAdd(i8*, i8*, i8*) #1 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py new file mode 100644 index 0000000000000000000000000000000000000000..b8145e179893bc0db2631cf1f7ee0f11bcc9be0e --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/algo_tuner.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python +# +# Algorithmic Approximation Tuning +# Purpose: Tunes for Perforation, Sampling, Numerical Precision (FP16) + + +import adddeps + +import argparse +import opentuner +from opentuner import ConfigurationManipulator +from opentuner import MeasurementInterface +from opentuner import Result +from opentuner import EnumParameter +from opentuner.search.objective import ThresholdAccuracyMinimizeTime +from opentuner.measurement.inputmanager import FixedInputManager +import shutil +import os +import sys +import subprocess +import threading +import psutil + +from measure_confidence2 import dump_promise_confidence_files3 +from measure_confidence2 import getConfidence, getMinAccuracy +from select_top_results import select_top_results +from time import sleep +from pareto_curve import findParetoConfigs + + + + +class TunerData: + def __init__(self): + self.binary_path = "" + self.output_dir = "" + self.num_layers = 0 + self.knobs_list = [] + 
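+ # knobs_list holds, per layer, the knob IDs that layer may use (one
+ # comma-separated line per layer in the --layer-knobs file). knobs_speedup
+ # maps each knob ID to its expected speedup, the third column of the knobs
+ # config, e.g. ID 12 (fp16) -> 1.5. getConfigCost below divides each layer's
+ # op cost by that factor: two layers costing 100 and 50 under knobs 12 (1.5x)
+ # and 21 (2.25x) yield total_cost = 100/1.5 + 50/2.25 = 88.89 and an overall
+ # speedup of 150/88.89 = 1.69 (illustrative numbers, not from any benchmark).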
self.knobs_speedup = {} + self.accuracy_threshold = 0 + self.test_id = 0 + self.layer_costs = [] + self.tuning_flags = [] + self.autotuner_runs = 0 + + + +tunerData = TunerData() + + +def readCostFile(file_path): + + layer_costs = [] + f = open(file_path) + for x in f: + cost = float(x.strip()) + layer_costs.append(cost) + + print ("len(layer_costs) = ", len(layer_costs)) + f.close() + + return layer_costs + + + +def getAccuracy(file_name): + + file = open(file_name, "r") + acc_str = file.read() + file.close() + + try: + accuracy = float(acc_str) + except: + return 20 + + print (accuracy) + return accuracy + + + +def createFlagsFile(file_name, cfg): + + f = open(file_name, "w+") + cmd_config = "" + for i in range(tunerData.num_layers): # flag in tunerData.tuning_flags: + flag = tunerData.tuning_flags[i] + flag_value = cfg[flag] + cmd_config += str(flag_value) + "\n" + + f.write(cmd_config) + f.close() + + + +def readLayerKnobs(file_path): + + f = open(file_path, "r") + knobs_list = [] + for x in f: + knobs = [] + vals = x.split(",") + for val in vals: + knobs.append(int(val)) + + knobs_list.append(knobs) + + print ("knobs_list = ", knobs_list) + + return knobs_list + + + +def readKnobConfig(file_path): + + knobs_speedup = {} + f = open(file_path, "r") + for x in f: + toks = x.split("\t") + ID = int(toks[0].split(",")[1]) + + speedup = float(toks[2]) + knobs_speedup[ID] = speedup + + print ("knobs_speedup = ", knobs_speedup) + + return knobs_speedup + + + + +def getConfigCost(cfg): + + orig_cost = 0.0 + total_cost = 0.0 + for it in range(tunerData.num_layers): + flag = tunerData.tuning_flags[it] + flag_value = cfg[flag] + op_cost = tunerData.layer_costs[it] + speedup = tunerData.knobs_speedup[flag_value] + + total_cost += (op_cost * 1.0 / speedup * 1.0) + orig_cost += op_cost + + speedup = (orig_cost * 1.0) / (total_cost * 1.0) + + return total_cost, speedup + + + +def appendTopLine(f_path, accuracy, total_runs, total_comps, speedup): + + f_str = open(f_path, "r").read() + + f_out = open(f_path, "w+") + + f_out.write("total_runs=" + str(total_runs) + "\tconfidence=100.0" + "\tavg_accuracy=" + str(accuracy) + "\tconfig_cost=" + str(total_comps) + "\tspeedup=" + str(speedup) + "\n" ) + f_out.write(f_str) + + f_out.close() + + + + + +class ClangFlagsTuner(MeasurementInterface): + + def __init__(self, args): + objective = ThresholdAccuracyMinimizeTime(tunerData.accuracy_threshold) + input_manager = FixedInputManager(size=tunerData.num_layers) + self.configs_list = [] + + super(ClangFlagsTuner, self).__init__( + args, program_name=args.binary, + program_version=self.file_hash(args.binary), + input_manager=input_manager, objective=objective) + + + + + def manipulator(self): + """ + Define the search space by creating a + ConfigurationManipulator + """ + manipulator = ConfigurationManipulator() + + for i in range(tunerData.num_layers): + tunerData.tuning_flags.append("flag" + str(i)) + + + #for flag in tunerData.tuning_flags: + for ind in range(tunerData.num_layers): + flag = tunerData.tuning_flags[ind] + manipulator.add_parameter( + EnumParameter(flag, tunerData.knobs_list[ind])) + + print ("ind = ", ind, " len = ", len(tunerData.knobs_list)) + print (tunerData.knobs_list[ind]) + + return manipulator + + + + def run(self, desired_result, input, limit): + + """ + Run a given configuration then + return performance + """ + + cfg = desired_result.configuration.data + + # NOTE: creates the file with flags read by the runtime +
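+ # promise_flags receives one knob ID per line, in layer order (flag0 through
+ # flagN-1). With three layers, a configuration could serialize as:
+ #   12
+ #   25
+ #   11
+ # meaning FP16 for layer 0, a perforation variant for layer 1, and FP32 for
+ # layer 2 (IDs from global_knobs.txt; the particular triple is illustrative).
+ # The target binary is assumed to read this file at startup, apply the chosen
+ # approximation to each layer, and report back through the final_accuracy
+ # and run_accuracies.txt files consumed below.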
createFlagsFile("promise_flags", cfg) + + run_cmd = tunerData.binary_path + print "\nbinary_path = ", run_cmd + + + total_runs = 1 # NOTE: Single run sufficient in Algorithmic Approx Tuner + FNULL = open(os.devnull, 'wb') + p = subprocess.Popen([run_cmd, str(total_runs)], stdout = FNULL) + p.wait() + + + accuracy = getAccuracy("final_accuracy") + + # getConfigCost returns the cost associated with the selected configuration + total_comps, speedup = getConfigCost(cfg) + + + Result = opentuner.resultsdb.models.Result() + Result.time = total_comps + #Result.accuracy = accuracy + min_accuracy = getMinAccuracy("run_accuracies.txt") + print ("min_accuracy = ", min_accuracy) + Result.accuracy = min_accuracy + + if min_accuracy > tunerData.accuracy_threshold: + config_tuple = (total_comps, accuracy, cfg) + self.configs_list.append(config_tuple) + f_path = tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id) + shutil.copy('promise_flags', f_path) + + appendTopLine(f_path, accuracy, total_runs, total_comps, speedup) + + f_acc = open(tunerData.output_dir + '/' + tunerData.binary_path + '_' + str(tunerData.test_id) + "_accuracy", "w") + f_acc.write(str(accuracy)) + f_acc.close() + + + tunerData.test_id += 1 + + return Result + + + def save_final_config(self, configuration): + + print "Done with Autotuning Run \n" + sleep(2) + + print "Final configuration", configuration.data + + return + + + + +if __name__ == '__main__': + + argparser = argparse.ArgumentParser(parents=opentuner.argparsers()) + argparser.add_argument('--binary', help='path to target binary') + argparser.add_argument('--num-layers', type=int, help='num of flags to tune') + argparser.add_argument('--accuracy', type=float, help='accuracy threshold') + argparser.add_argument('--result-dir', help='result directory') + argparser.add_argument('--cost-file', help='layer description') + argparser.add_argument('--knobs-config', help='knob settings and ID mapping') + argparser.add_argument('--layer-knobs', help='per-layer Knobs') + + + args = argparser.parse_args() + + tunerData.binary_path = str(args.binary) + tunerData.num_layers = int(args.num_layers) + tunerData.accuracy_threshold = float(args.accuracy) + + + # NOTE: Reading the cost file (with No of ops) to better guide the Autotuner + cost_file_path = args.cost_file + tunerData.layer_costs = readCostFile(cost_file_path) + + + tunerData.knobs_list = readLayerKnobs(args.layer_knobs) + tunerData.knobs_speedup = readKnobConfig(args.knobs_config) + + result_dir = args.result_dir + if result_dir == "": + print("Provide --result-dir ") + + tunerData.output_dir = result_dir + "/high_confidence/" + if not os.path.exists(result_dir): + os.mkdir(result_dir) + + if not os.path.exists(tunerData.output_dir): + print("Creating output directory = ", tunerData.output_dir) + os.mkdir(tunerData.output_dir) + + + + ClangFlagsTuner.main(argparser.parse_args()) + + diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2fc2c9493453f55cb83094373b19a24b59135d4 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt @@ -0,0 +1,6 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git 
a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..04336fca2708d5e5d78849e1c12014f5ddbd1ad7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt @@ -0,0 +1,6 @@ +11894784.000000 +39321600.000000 +21233664.000000 +28311552.000000 +18874368.000000 +20480.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..063ba473d6a7fa57d7572c86dde9beac0932163d --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt @@ -0,0 +1,7 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a5722f202dde469dca94c71dd9c5fc1cd7aa32b --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt @@ -0,0 +1,7 @@ +88473.601562 +943718.375000 +471859.187500 +943718.375000 +471859.187500 +943718.375000 +2048.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0a974015d74c2a08659deb6e4f664bebbbe83a9 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt @@ -0,0 +1,14 @@ +fp32,11 -1 1.0 tensorConvolution tensorConvolution +fp16,12 -1 1.5 tensorConvolution tensorHalfConvolution +perf,21 1,2,0 2.25 tensorConvolution tensorConvPerfCuda +perf,22 1,2,1 2.25 tensorConvolution tensorConvPerfCuda +perf,23 1,3,0 1.88 tensorConvolution tensorConvPerfCuda +perf,24 1,3,1 1.88 tensorConvolution tensorConvPerfCuda +perf,25 2,1,0 2.25 tensorConvolution tensorConvPerfCuda +perf,26 2,1,1 2.25 tensorConvolution tensorConvPerfCuda +perf,27 3,1,0 1.88 tensorConvolution tensorConvPerfCuda +perf,28 3,1,1 1.88 tensorConvolution tensorConvPerfCuda +samp,31 2,0 2.25 tensorConvolution tensorConvSampSim +samp,32 2,1 2.25 tensorConvolution tensorConvSampSim +samp,33 4,0 1.8 tensorConvolution tensorConvSampSim +samp,34 4,1 1.8 tensorConvolution tensorConvSampSim diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..be1ce58c95981535ec94a7f8badffe967cfed586 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt @@ -0,0 +1,4 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..74b1b668e2f27f3ddb77dcac7fff9890c70a6f02 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt @@ -0,0 +1,4 @@ +62720.000000 +1003520.000000 +321126.406250 +1024.000000 diff --git 
a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..6719acb97a58bd7f3d9fbe428f755e13df98b3d0 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt @@ -0,0 +1,15 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..673e704b7e37e19c090e98799189a4411bad9f7c --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt @@ -0,0 +1,28 @@ +88473.601562 +29491.199219 +209715.203125 +14745.599609 +209715.203125 +29491.199219 +419430.406250 +7372.799805 +209715.203125 +14745.599609 +419430.406250 +3686.399902 +209715.203125 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +1843.199951 +209715.203125 +3686.399902 +419430.406250 +1024.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..719d96e48168a477d6edfee1a02b80b554612ec7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt @@ -0,0 +1,8 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..7266441905a08c1ef1796dec8ee6c05660998378 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt @@ -0,0 +1,8 @@ +265420.812500 +629145.625000 +629145.625000 +1258291.250000 +629145.625000 +1258291.250000 +629145.625000 +6144.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7ff033cec2b85390ce6c7667fbbb04837a7eaf9 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt @@ -0,0 +1,22 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 
+11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdba070cfc5eac559c8384306993fb52a1eb2e04 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt @@ -0,0 +1,22 @@ +44236.800781 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +117964.796875 +235929.593750 +13107.200195 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +117964.796875 +235929.593750 +13107.200195 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +64.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt @@ -0,0 +1,15 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f58ebcc043915d28cf874a1f67e5b2637db1dfc --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt @@ -0,0 +1,15 @@ +88473.601562 +1887436.750000 +943718.375000 +1887436.750000 +943718.375000 +1887436.750000 +1887436.750000 +943718.375000 +1887436.750000 +1887436.750000 +471859.187500 +471859.187500 +471859.187500 +13107.200195 +256.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt @@ -0,0 +1,15 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 
+11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6daad2e2902e3ac821d99ebbe12e21b6428cc7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt @@ -0,0 +1,15 @@ +884736.000000 +18874368.000000 +9437184.000000 +18874368.000000 +9437184.000000 +18874368.000000 +18874368.000000 +9437184.000000 +18874368.000000 +18874368.000000 +4718592.000000 +4718592.000000 +4718592.000000 +131072.000000 +25600.000000 diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h index 6b38acc6577c1f804ae47d1cb6539b35ea07cf0f..2dc985a0c14ebc18a68d5e54f78bd416f9d3b523 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h @@ -1,5 +1,7 @@ +#include "tensor.h" + extern "C"{ @@ -25,4 +27,18 @@ extern "C"{ void* tensorConvolutionKernelSamp(void* input, void* filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, int skip_every); + + void* tensorConvPerfCuda(void* input, void* filter, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups, int row, int col, int start); + + + void sampleFilter(Tensor* filter, int skip_rate, int skip_offset); + + void* tensorConvSampSim(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups, + int skip_rate, int skip_offset); } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h index 28a7f8c8fc7cadfed7e25840a2eb9308d5350336..66070f3058d840e4dbe25919e33aa8abc060b330 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h @@ -1,5 +1,24 @@ +#ifndef SIM_HEADER +#define SIM_HEADER + + + +#include "tensor_runtime.h" +#include "tensor_utils.cu" +#include "debug.h" +#include "profiling.h" +#include "fp16_conversion.h" +#include "global_data.h" +#include "error.h" +#include "tensor.h" +#include "op_overheads.h" +#include "half_precision_api.h" +#include "approx_techniques2.h" +#include <unordered_map> + + //N is new_data's size @@ -49,11 +68,11 @@ void postInterpolateCol(int N, int n, int c, int h, int w, float* data, int int_ -// Perforated Tensor Conv with 'perforation_rate' parameter -void* tensorConvPerf2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, int row, int col){ +// A 'Simulation' of perforated tensor convolution +void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups, int row, int col){ INFO("*** TensorConvolution \n"); @@ -65,15 +84,14 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr, 
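+// Reading of the simulation below (inferred from this patch, not a spec):
+// tensorConvPerfSim computes the convolution at full precision and then the
+// row/col interpolation kernels above (e.g. postInterpolateCol) overwrite the
+// output positions a truly perforated kernel would have skipped, so 'row' and
+// 'col' select the perforation pattern being simulated.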
cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; + if(conv_mode == 0) mode = CUDNN_CONVOLUTION; else if(conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; - // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); @@ -92,8 +110,7 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? + checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, vertical_pad, horizontal_pad, // conv padding new_v, new_h, // conv strides @@ -128,7 +145,8 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr, // NOTE: Necessary to insert the above call for every output tensor DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], output->dims.dim_sizes[3]); if(convDesc == NULL || input->tensor_desc == NULL || @@ -136,10 +154,6 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr, ERROR("NULL descriptor! \n"); - // Debugging info prints - printTensorDescInfo(input); - printTensorDescInfo(filter); - printTensorDescInfo(output); // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, @@ -197,10 +211,1120 @@ void* tensorConvPerf2(void* input_ptr, void* filter_ptr, (float *) output->gpu_data, col); - //cudaDeviceSynchronize(); + profileEvent("tensorConv_end", true); + + return output; +} + + + + + +//N is new_data's size +//n, c, h, w are the dimensions of new_data +__global__ +void sampleFilterElems(int N, + int n, int c, int h, int w, + float* data, + int skip_elem, int skip_offset, float mul_factor){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < N; i += stride){ + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n = i / (c * h * w); + + //int local_index = row * w + col; + int local_index = (ch * (h * w)) + (row * w) + col; + + if(local_index % skip_elem == skip_offset) + data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0; + else + data[n * (c * h * w) + ch * (h * w) + row * (w) + col] *= mul_factor; + + } +} + + + + + +void sampleFilter(Tensor* filter, int skip_rate, int skip_offset){ + + int n = filter->dims.dim_sizes[0]; + int c = filter->dims.dim_sizes[1]; + int h = filter->dims.dim_sizes[2]; + int w = filter->dims.dim_sizes[3]; + + int numBlocks = (n * c * h * w + 127) / 128; + int N = n * c * h * w; + // Rescale the surviving filter elements to preserve the expected filter sum; + // the float cast is needed, since integer division would make skip_rate 4 yield 4/3 == 1 + float mul_factor = ((float) skip_rate) / (skip_rate - 1); + + DEBUG("mul_factor = %f \n", mul_factor); + + sampleFilterElems<<<numBlocks,128>>>(N, + n, c, h, w, + (float *) filter->gpu_data, + skip_rate, skip_offset, mul_factor); + +} + + + +// A 'Simulation' of sampled tensor convolution - filter elements are zeroed and rescaled to mimic input sampling +void* tensorConvSampSim(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int
conv_mode, int conv_groups, + int skip_rate, int skip_offset){ + + + INFO("*** TensorConvolution \n"); + profileEvent("tensorConv"); + + Tensor* input = (Tensor*) input_ptr; + Tensor* filter = (Tensor*) filter_ptr; + + + cudnnConvolutionDescriptor_t convDesc; + cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionMode_t mode; + + if(conv_mode == 0) + mode = CUDNN_CONVOLUTION; + else if(conv_mode == 1) + mode = CUDNN_CROSS_CORRELATION; + + float alpha = 1.0f, beta = 0.0f; + + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + convertToFP32(input); + convertToFP32(filter); + + + // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling + sampleFilter(filter, skip_rate, skip_offset); + + + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); + + checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); + + //FIXME: Current hack to preserve backward compatibilty + if(conv_groups == 0){ + conv_groups = 1; + } + + // NOTE: Adding support for grouped convolution + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); + + int new_v = vertical_stride + 0; + int new_h = horizontal_stride + 0; + cudnnDataType_t computeType = CUDNN_DATA_FLOAT; + + checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode , // mode is configurable + computeType)); // defines compute precision + + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, + input->tensor_desc, + filter->filter_desc, + &n, &c, &h, &w)); + + + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + + Tensor* output; + output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], + output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + + if(convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! 
\n"); + + + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, + input->tensor_desc, + filter->filter_desc, + convDesc, + output->tensor_desc, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, + &convAlgo)); + + + DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); + + + // NOTE: Using GEMM-based Algo + convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + + size_t workspace_size; + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, + input->tensor_desc, + filter->filter_desc, + convDesc, + output->tensor_desc, + convAlgo, + &workspace_size)); + + // Allocating memory for the convolution workspace + void* workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + DEBUG("workspace size = %lu \n", workspace_size); + + + checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, + input->gpu_data, filter->filter_desc, filter->gpu_data, + convDesc, convAlgo, workspace, workspace_size, + &beta, output->tensor_desc, output->gpu_data)); + + + profileEvent("tensorConv_end", true); + + return output; +} + + + + + + + + + + +/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ + + +void* PROMISE_Conv(void* input, float i_min, float i_max, + void* filter, float w_min, float w_max, + void* bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing){ + + + Tensor* input_t = (Tensor*) input; + Tensor* filter_t = (Tensor*) filter; + Tensor* bias_t = (Tensor*) bias; + + int orig_type = input_t->cur_type; + + DEBUG("FP32 conversions \n"); + + convertToFP32(input_t); + + convertToFP32(filter_t); + convertToFP32(bias_t); + + DEBUG("DONE FP32 conversions \n"); + + + if(swing < 8){ + input = quantizeTensorPromise(input, i_min, i_max); + filter = quantizeTensorPromise(filter, w_min, w_max); + if(bias != NULL) + bias = quantizeTensorPromise(bias, b_min, b_max); + // Model the PROMISE aRead error + + input = addPromiseError(input, swing); + } + + + void* conv_out; + conv_out = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + + void* conv_add; + if(bias != NULL){ + conv_add = tensorAdd(conv_out, bias); + } + else{ + conv_add = conv_out; + } + + void* pool_out; + // NOTE: Skip pooling on negative pool sizes + if(pool_size > 0){ + //FIXME: Currently only using MaxPooling + pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size); + } + else{ + pool_out = conv_add; + } + + void* activation_out; + switch(activation_id){ + case -1: + activation_out = pool_out; + INFO("NO Activation Function \n"); + break; + case 0: + activation_out = tensorTanh(pool_out); + break; + case 1: + activation_out = tensorRelu(pool_out); + break; + case 2: + activation_out = tensorRelu2(pool_out, out_min, out_max); + break; + default: + ERROR("Activation id %d NOT supported \n", activation_id); + break; + } + + + if(swing < 8 && activation_id != -1){ + activation_out = quantizeTensorPromise(activation_out, out_min, out_max); + } + + + + //NOTE: Convert back to FP16 if original type + if (orig_type == half_type){ + convertToFP16((Tensor*) activation_out); + } + + + return activation_out; +} + + + +void*
PROMISE_FC(void* input, float i_min, float i_max, + void* weights, float w_min, float w_max, + void* bias, float b_min, float b_max, + int activation_id, + float out_min, float out_max, int swing){ + + + Tensor* input_t = (Tensor*) input; + Tensor* weights_t = (Tensor*) weights; + Tensor* bias_t = (Tensor*) bias; + + int orig_type = input_t->cur_type; + + convertToFP32(input_t); + convertToFP32(weights_t); + convertToFP32(bias_t); + + + if(swing < 8){ + input = quantizeTensorPromise(input, i_min, i_max); + weights = quantizeTensorPromise(weights, w_min, w_max); + if(bias != NULL) + bias = quantizeTensorPromise(bias, b_min, b_max); + + // NOTE: Modelling aRead error in PROMISE + input = addPromiseError(input, swing); + } + + + + void* gemm_out; + gemm_out = tensorGemmGPU(input, weights); + + + void* gemmbias_out; + if(bias != NULL){ + gemmbias_out = tensorAdd(gemm_out, bias); + } + else{ + gemmbias_out = gemm_out; + } + + void* activation_out; + switch(activation_id){ + + case -1: + activation_out = gemmbias_out; + INFO("No Activation Function \n"); + break; + case 0: + activation_out = tensorTanh(gemmbias_out); + break; + case 1: + activation_out = tensorRelu(gemmbias_out); + break; + case 2: + activation_out = tensorRelu2(gemmbias_out, out_min, out_max); + break; + default: + ERROR("Activation id %d NOT supported \n", activation_id); + break; + } + + + if(swing < 8 && activation_id != -1){ + activation_out = quantizeTensorPromise(activation_out, out_min, out_max); + } + + + //NOTE: Convert back to FP16 if original type + if (orig_type == half_type){ + convertToFP16((Tensor*) activation_out); + } + + + + return activation_out; +} + + + + + +// NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper +//#define OLD_MODEL + +#ifndef OLD_MODEL + + + +bool isPromise(int swing){ + + if(swing < 8) + return true; + else + return false; +} + + +bool isFullPrecision(int swing){ + + if(swing == 11) + return true; + else + return false; +} + + + +bool isHalfPrecision(int swing){ + + if(swing == 12) + return true; + else + return false; +} + + +bool isPerforation(int swing){ + + if(swing >= 21 && swing <= 29) + return true; + else + return false; +} + + +bool isSampling(int swing){ + + if(swing >= 31 && swing <= 39) + return true; + else + return false; +} + + +int getSwing(int swing){ + + #ifdef PROMISE_TUNER_ENABLED + + // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime + if(!approxhpvm_runtime_mode){ + + if(op_counter >= total_ops){ + ERROR("No accuracy flag found \n"); + } + + swing = op_accuracies[op_counter]; + op_counter++; + } + + #endif + + DEBUG("---- swing_value = %d \n", swing); + + return swing; +} + + + + + + +class PerfParams{ + + public: + int row; + int col; + int skip_offset; + + PerfParams(){ + row = 1; + col = 1; + skip_offset = 0; + } + + PerfParams(int row1, int col1, int skip_offset1){ + row = row1; + col = col1; + skip_offset = skip_offset1; + } + +}; + + + +PerfParams getPerfParams(int swing){ + + std::map<int, PerfParams> perf_knob_map; + + PerfParams params21(1, 2, 0); + perf_knob_map[21] = params21; + + PerfParams params22(1, 2, 1); + perf_knob_map[22] = params22; + + PerfParams params23(1, 3, 0); + perf_knob_map[23] = params23; + + PerfParams params24(1, 3, 1); + perf_knob_map[24] = params24; + + PerfParams params25(2, 1, 0); + perf_knob_map[25] = params25; + + PerfParams params26(2, 1, 1); + perf_knob_map[26] = params26; + + PerfParams params27(3, 1, 0); + perf_knob_map[27] = params27; + + PerfParams params28(3, 1, 1);
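+ // Knob-ID summary (inferred from the tables in this file and the knobs.txt
+ // files added in this patch): 11/12 select FP32/FP16, 21-24 perforate
+ // columns (col = 2 or 3, offset 0 or 1), 25-28 perforate rows (row = 2 or 3,
+ // offset 0 or 1), and 31-34 select filter sampling (see getSampParams below).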
+ perf_knob_map[28] = params28; + + + return perf_knob_map[swing]; + +} + + + + +class SampParams{ + + public: + int skip_rate; + int skip_offset; + + SampParams(){ + skip_rate = 1; + skip_offset = 0; + } + + SampParams(int skip_rate1, int skip_offset1){ + skip_rate = skip_rate1; + skip_offset = skip_offset1; + } + +}; + + + +SampParams getSampParams(int swing){ + + std::map<int, SampParams> samp_knob_map; + + SampParams params31(2, 0); + samp_knob_map[31] = params31; + + SampParams params32(2, 1); + samp_knob_map[32] = params32; + + SampParams params33(4, 0); + samp_knob_map[33] = params33; + + SampParams params34(4, 1); + samp_knob_map[34] = params34; + + return samp_knob_map[swing]; + +} + + + + + + +/***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */ + +void* ConvLayer_PROMISE(void* input, float i_min, float i_max, + void* filter, float w_min, float w_max, + void* bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing){ + + if(ONLINE_PROFILING){ + ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n"); + } + + + swing = getSwing(swing); + + if(isPromise(swing)){ + + return PROMISE_Conv(input, i_min, i_max, + filter, w_min, w_max, + bias, b_min, b_max, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + pool_id, pool_size, + activation_id, + out_min, out_max, swing); + } + + + + void* conv_out; + if(isPerforation(swing)){ + + PerfParams params = getPerfParams(swing); + DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n", + params.row, params.col, params.skip_offset); + + conv_out = tensorConvPerfCuda(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.row, params.col, params.skip_offset); + + } + + if(isSampling(swing)){ + + SampParams params = getSampParams(swing); + DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", + params.skip_rate, params.skip_offset); + + conv_out = tensorConvSampSim(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.skip_rate, params.skip_offset); + + } + + + if (isHalfPrecision(swing)){ + + conv_out = tensorHalfConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + } + + if (isFullPrecision(swing)){ + conv_out = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + } + + + void* conv_add; + if(bias != NULL){ + if( !isFullPrecision(swing) ){ + conv_add = tensorHalfAdd(conv_out, bias); + } + else{ + conv_add = tensorAdd(conv_out, bias); + } + } + else{ + conv_add = conv_out; + } + + void* pool_out; + if(pool_size > 0){ + //FIXME: Currently only using MaxPooling + pool_out = tensorHalfPooling(conv_add, 0, pool_size, pool_size, + 0, 0, pool_size, pool_size); + } + else{ + pool_out = conv_add; + } + + void* activation_out; + switch(activation_id){ + case -1: + activation_out = pool_out; + INFO("NO Activation Function \n"); + break; + case 0: + activation_out = tensorHalfTanh(pool_out); + break; + case 1: + activation_out = tensorHalfRelu(pool_out); + break; + case 2: + activation_out = tensorHalfRelu2(pool_out, out_min, out_max); + break; + default: + ERROR("Activation id %d NOT supported \n", activation_id); + break; + } + + + return activation_out; +} + + +void* FCLayer_PROMISE(void* input, float i_min, float i_max, + void* weights, float w_min, float w_max, + void* bias, float
b_min, float b_max, + int activation_id, + float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu' + + + swing = getSwing(swing); + + if(isPromise(swing)){ + + return PROMISE_FC(input, i_min, i_max, + weights, w_min, w_max, + bias, b_min, b_max, + activation_id, + out_min, out_max, swing); + } + + + + void* gemm_out; + if(!isFullPrecision(swing)){ + gemm_out = tensorHalfGemm(input, weights); + } + else{ + gemm_out = tensorGemmGPU(input, weights); + } + + + void* gemmbias_out; + if(bias != NULL){ + // Swing 11 corresponds to FP32 (see isFullPrecision); everything else uses the half-precision add + if(!isFullPrecision(swing)){ + gemmbias_out = tensorHalfAdd(gemm_out, bias); + } + else{ + gemmbias_out = tensorAdd(gemm_out, bias); + } + } + else{ + gemmbias_out = gemm_out; + } + + void* activation_out; + switch(activation_id){ + + case -1: + activation_out = gemmbias_out; + INFO("No Activation Function \n"); + break; + case 0: + activation_out = tensorHalfTanh(gemmbias_out); + break; + case 1: + activation_out = tensorHalfRelu(gemmbias_out); + break; + case 2: + activation_out = tensorHalfRelu2(gemmbias_out, out_min, out_max); + break; + default: + ERROR("Activation id %d NOT supported \n", activation_id); + break; + } + + + + return activation_out; +} + +#endif + + + +#ifdef OLD_MODEL + +#endif + +#endif + + + +/************* NOTE: Outdated PROMISE routines - Used for Comparison ****/ + + + + +/* + + + +void* ConvLayer_PROMISE(void* input, float i_min, float i_max, + void* filter, float w_min, float w_max, + void* bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing){ + + + DEBUG("\n\n**** NOTE: Conv OLD MODEL *** \n\n"); + + #ifdef PROMISE_TUNER_ENABLED + + // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime + if(!approxhpvm_runtime_mode){ + + if(op_counter >= total_ops){ + ERROR("No accuracy flag found \n"); + } + + swing = op_accuracies[op_counter]; + op_counter++; + } + + #endif + + + if (swing < 0 || swing > 20){ + ERROR("Incorrect swing value"); + } + + + + if(swing < 8){ + input = quantizeTensorPromise(input, i_min, i_max); + filter = quantizeTensorPromise(filter, w_min, w_max); + if(bias != NULL) + bias = quantizeTensorPromise(bias, b_min, b_max); + // aRead error + + input = addPromiseError(input, swing); + } + + + void* conv_out; + if(swing == 8 || (swing >= 12 && swing <= 15) ){ + //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, + // conv_stride_h, conv_stride_w, 1, 1, 1, 0); + + int rows = 2; + switch(swing){ + + case 12: rows = 5; break; + case 13: rows = 4; break; + case 14: rows = 3; break; + case 15: rows = 2; break; + + default: rows = 2; break; + } + + conv_out = tensorConvPerfSim(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, rows, 0); + + /*void* gold = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + + Norm_t* norms = calculateNormsTreeReduction((struct Tensor*) conv_out, (struct Tensor*) gold); + + DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); + */ + +/*------------- + } + else if(swing == 9 || (swing >= 16 && swing <= 19) ){ + //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, + // conv_stride_h, conv_stride_w, 1, 1, 0, 1); + + + int cols = 2; + switch(swing){ + + case 16: cols = 5; break; + case 17: cols = 4; break; + case 18: cols = 3; break; + case 19: cols = 2; break; + + default: cols =
2; break; + } + + + conv_out = tensorConvPerfSim(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 0, cols); + + + /*void* gold = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + + Norm_t* norms = calculateNormsTreeReduction((struct Tensor*)conv_out, (struct Tensor*) gold); + + DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); + */ + +/*----- + + } + else if(swing == 10){ + conv_out = tensorHalfConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + } + else{ + conv_out = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + } + + void* conv_add; + if(bias != NULL){ + if(swing >= 8){ + conv_add = tensorHalfAdd(conv_out, bias); + } + else{ + conv_add = tensorAdd(conv_out, bias); + } + } + else{ + conv_add = conv_out; + } + + void* pool_out; + // NOTE: Skip pooling on negative pool sizes + if(pool_size > 0){ + //FIXME: Currently only using MaxPooling + pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size); + } + else{ + pool_out = conv_add; + } + + void* activation_out; + switch(activation_id){ + case -1: + activation_out = pool_out; + INFO("NO Activation Function \n"); + break; + case 0: + activation_out = tensorTanh(pool_out); + break; + case 1: + activation_out = tensorRelu(pool_out); + break; + case 2: + activation_out = tensorHalfRelu2(pool_out, out_min, out_max); + break; + default: + ERROR("Activation id %d NOT supported \n", activation_out); + break; + } + + + if(swing < 8 && activation_id != -1){ + activation_out = quantizeTensorPromise(activation_out, out_min, out_max); + } + + return activation_out; +} + + +void* FCLayer_PROMISE(void* input, float i_min, float i_max, + void* weights, float w_min, float w_max, + void* bias, float b_min, float b_max, + int activation_id, + float out_min, float out_max, int swing){ + + + + #ifdef PROMISE_TUNER_ENABLED + + // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime + if(!approxhpvm_runtime_mode){ + + if(op_counter >= total_ops){ + ERROR("No accuracy flag found \n"); + } + + swing = op_accuracies[op_counter]; + op_counter++; + } + + #endif + + + if (swing < 0 || swing > 20){ + ERROR("Incorrect swing value"); + } + + if(swing < 8){ + input = quantizeTensorPromise(input, i_min, i_max); + weights = quantizeTensorPromise(weights, w_min, w_max); + if(bias != NULL) + bias = quantizeTensorPromise(bias, b_min, b_max); + + // NOTE: Modelling aRead error in PROMISE + input = addPromiseError(input, swing); + } + + + + void* gemm_out; + if(swing >= 8 && swing < 11){ + gemm_out = tensorHalfGemm(input, weights); + } + else{ + gemm_out = tensorGemmGPU(input, weights); + } + + + void* gemmbias_out; + if(bias != NULL){ + // Swing 8 corresponds to FP32 + if(swing >= 8 && swing < 20){ + gemmbias_out = tensorHalfAdd(gemm_out, bias); + } + else{ + gemmbias_out = tensorAdd(gemm_out, bias); + } + } + else{ + gemmbias_out = gemm_out; + } + + void* activation_out; + switch(activation_id){ + + case -1: + activation_out = gemmbias_out; + INFO("No Activation Function \n"); + break; + case 0: + activation_out = tensorTanh(gemmbias_out); + break; + case 1: + activation_out = tensorRelu(gemmbias_out); + break; + case 2: + activation_out = tensorRelu2(gemmbias_out, out_min, out_max); + break; + default: + ERROR("Activation id %d NOT supported \n", activation_out); + break; + } + + + if(swing < 8 && activation_id != -1){ + activation_out = 
quantizeTensorPromise(activation_out, out_min, out_max); + } + + return activation_out; +} + +#endif + + + + + + +#endif + + + + + + + + /*void* gold = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + + Norm_t* norms = calculateNormsTreeReduction((struct Tensor*) conv_out, (struct Tensor*) gold); + + DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); + */ + + + /*void* gold = tensorConvolution(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, + 1, 0); + + Norm_t* norms = calculateNormsTreeReduction((struct Tensor*)conv_out, (struct Tensor*) gold); + + DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); + */ + + + + + + + /*#ifdef PROMISE_TUNER_ENABLED + + // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime + if(!approxhpvm_runtime_mode){ + + if(op_counter >= total_ops){ + ERROR("No accuracy flag found \n"); + } + + swing = op_accuracies[op_counter]; + op_counter++; + } + + #endif + + */ + diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h index af0ed1e202017dde2cb96e9f8798aff1219c0695..9689c6fce91d3a4093d91b5006ef1beee969f8eb 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h @@ -819,7 +819,7 @@ __global__ void depthwise_conv4_half3(__half* const __restrict__ y, __half t1; - int total = C_dim * H_dim * W_dim; + //int total = C_dim * H_dim * W_dim; t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim + start_w + q - bstart_w]; @@ -920,7 +920,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups){ - llvm_hpvm_initTensorRt(0); INFO("*** TensorConvolution \n"); profileEvent("Conv"); @@ -935,7 +934,13 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, Tensor* output; - + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + convertToFP32(input); + convertToFP32(filter); + + if (conv_groups > 32) { // TODO: Support other cases; hostToDeviceCopy(input); @@ -949,7 +954,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device @@ -957,33 +962,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, // NOTE: Necessary to insert the above call for every output tensor - /* - if (c > 255) { - dim3 grid((n / 16), c); - dim3 block(h * w); - depthwise_conv << <grid, block >> > ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - - }*/ - - /* - dim3 grid((n / 12), c); - dim3 block(h * w); - depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, 
horizontal_pad, vertical_stride, horizontal_stride); - if(n % 12 > 0){ - dim3 grid2((n % 12), c); - dim3 block(h * w); - depthwise_conv <<<grid, block >>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12)); - } - */ int blockSize; blockSize = 64; @@ -994,7 +972,8 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, (float*)input->gpu_data, (float*)filter->gpu_data, input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); } else { @@ -1043,11 +1022,11 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); if (input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NHWC, n, h, w, c); } else @@ -1137,6 +1116,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, } +// FIXME: Need to properly fix the new HALF type conversion void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, @@ -1165,6 +1145,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); + convertToFP16(input); + convertToFP16(filter); + /***** CONVERSIONS from FP32 to FP16 - on the GPU */ size_t* input_dims = input->dims.dim_sizes; @@ -1209,7 +1192,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, + output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // FIXIT: more checks for data types needed output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, @@ -1797,7 +1780,7 @@ void* tensorConvPerf(void* input_ptr, void* filter_ptr, Tensor* new_output; if(input->data_format == CUDNN_TENSOR_NCHW) - new_output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); else if(input->data_format == CUDNN_TENSOR_NHWC){ DEBUG("* NHWC Format \n"); @@ -2078,3 +2061,32 @@ void* tensorConvolutionKernelSamp(void* input_ptr, void* filter_ptr, #endif return output; } + + + /* + if (c > 255) { + dim3 grid((n / 16), c); + dim3 block(h * w); + depthwise_conv << <grid, block >> > ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + + }*/ + + /* + dim3 grid((n / 12), c); + dim3 block(h * w); 
+ depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + if(n % 12 > 0){ + dim3 grid2((n % 12), c); + dim3 block(h * w); + depthwise_conv <<<grid, block >>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12)); + } + */ diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h new file mode 100644 index 0000000000000000000000000000000000000000..a81ffe296233178126555bbb53babdcd4192a7bf --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -0,0 +1,352 @@ + +#include "tensor_utils.cu" + + + +//This skips every xth row +//H_eff is the number of rows calculated exactly +__global__ +void convToGemmPerfRow(float * const __restrict__ output, + const float * const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, const int W_out, const int V_stride, + const int H_stride, const int x, const int start, const int H_eff){ + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_eff * W_out); //output image number + const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number + const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + int past_start = (h % (x - 1) >= (x - 1 - start)); + const int inH = (h / (x - 1) * x + h % (x-1) + + past_start) * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? 
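+ /* Layout sketch (inferred from the indexing below): the output buffer is an
+ im2col-style matrix of shape [N][C*KH*KW][H_eff][W_out], so each (filter
+ element, output pixel) pair becomes one GEMM operand entry and the rows
+ skipped by perforation are simply never materialized. */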
+ for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = 0; + + } + } + } + +} + + +//For use in tensorConvPerfCuda +//Interpolates every xth row starting from x - 1 - start +//N is total number of elements in final output array +__global__ +void approxInterpolateRow(int N, int old_h, int n, int c, int h, int w, + float *old_data, float *new_data, int x, int start){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < N; i += stride){ + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n = i / (c * h * w); + int past_start = ((row % x) >= (x - 1 - start)); + + if(row == h-1) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; + else if (row == 0) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + else if(row % x == x - 1 - start){ + int past_startO = ((row - 1) % x) > (x - 1 - start); + int oldIdx1 = n * (c * old_h * w) + ch * (old_h * w) + + ((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col; + + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[oldIdx1] + old_data[oldIdx1 + 1 * w]) / 2; + } + else + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + + ((x-1) * (row / x) + row % x - past_start ) * (w) + col]; + + + } + +} + + +//This skips every xth row +//W_eff is the number of cols calculated exactly +__global__ +void convToGemmPerfCol(float * const __restrict__ output, + const float * const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, const int W_out, const int V_stride, + const int H_stride, const int x, const int start, const int W_eff){ + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_eff); //output image number + const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number + const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) + const int w = tx % W_eff; //output width index (col number) + int past_start = (w % (x - 1)) >= (x - 1 - start); + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = (w / (x - 1) * x + w % (x-1) + + past_start) * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? 
+ for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; + + } + } + } + +} + + +//For use in tensorConvPerfCuda +//Interpolates every xth col starting from x - 1 - start +//N is total number of elements in final output array +__global__ +void approxInterpolateCol(int N, int old_w, int n, int c, int h, int w, + float *old_data, float *new_data, int x, int start){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < N; i += stride){ + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n = i / (c * h * w); + int past_start = ((col % x) >= (x - 1 - start)); + + if(col == w-1) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; + else if (col == 0) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + else if(col % x == x - 1 - start){ + int past_startO = ((col - 1) % x) > (x - 1 - start); + int oldIdx1 = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + + ((x-1) * ((col - 1) / x) + (col-1) % x - past_startO); + + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[oldIdx1] + old_data[oldIdx1 + 1]) / 2; + } + else + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + + ((x-1) * (col / x) + col % x - past_start)]; + + } + +} + + + +//start has to be less than row or less than col +//row and col have to be >= 0 +//row = col = 1 means no perforation +void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int start){ + + INFO("*** TensorConvolution (output perforation) \n"); + profileEvent("Conv"); + Tensor* input = (Tensor*)input_ptr; + Tensor* filter = (Tensor*)filter_ptr; + //FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { + conv_groups = 1; + } + + Tensor* output; + // TODO: Support other cases; + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + + convertToFP32(input); + convertToFP32(filter); + + + int n, c, h, w; // output dimensions + n = input->dims.dim_sizes[0]; + c = filter->dims.dim_sizes[0]; //number of filters + const int KH = filter->dims.dim_sizes[2]; + const int KW = filter->dims.dim_sizes[3]; + + h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + int h_eff = h - h / row; + if(h % row > row - 1 - start) + h_eff = h_eff - 1; + + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + int w_eff = w - w / col; + if(w % col > col - 1 - start) + w_eff = w_eff - 1; + + + Tensor *new_output; + if(row > 1){ + output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the 
above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + float * convData; + int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 128; + const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, + vertical_stride, horizontal_stride, row, start, h_eff); + + + checkCudaErrors(cudaDeviceSynchronize()); + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h_eff * w, c, num_filter_elem, + &alpha, + convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, + &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, + n)); + + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(new_output, DEVICE); + + //interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, + (float *) output->gpu_data, (float *) new_output->gpu_data, + row, start); + cudaDeviceSynchronize(); + + cudaFree(output); + cudaFree(convData); + } + else if(col > 1){ + + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + float * convData; + int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 128; + const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, + vertical_stride, horizontal_stride, col, start, w_eff); + + + checkCudaErrors(cudaDeviceSynchronize()); + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h * w_eff, c, num_filter_elem, + &alpha, + convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, + &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, + n)); + + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(new_output, DEVICE); + + //interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w, + (float *)output->gpu_data, (float *)new_output->gpu_data, + col, start); + cudaDeviceSynchronize(); + + cudaFree(output); + cudaFree(convData); + } + else{ + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // 
input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + float * convData; + int convDataSize = sizeof(float) * n * num_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 128; + const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + convToGemmApprox<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, + vertical_stride, horizontal_stride, num_filter_elem, c * h * w); + checkCudaErrors(cudaDeviceSynchronize()); + //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h * w, c, num_filter_elem, + &alpha, + convData, h * w, num_filter_elem * h * w, + (float *)filter->gpu_data, num_filter_elem, 0, + &beta, + (float *)output->gpu_data, h * w, c * h * w, + n)); + + new_output = output; + cudaFree(convData); + } + + + profileEvent("Conv_end", true); + + + return new_output; +} diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h index b482cef5377e0f879b43f06a7ebbfbe01b39be09..14dc8f20f2111e85e82630cdbcc0c695a39c5ecd 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h @@ -72,7 +72,9 @@ public: FP32, FP16, PERFORATION, -// INPUT_SAMPLING, + INPUT_SAMPLING, + REDUCTION_SAMPLING, +// ADDITIONAL_APPROXIMATION_METHOD APPROX_END }; @@ -91,6 +93,15 @@ public: POOL_MEAN, POOL_MIN, SOFTMAX, + FFT, + REDUCE, + PROJECTIVE_T, + MAP1, + MAP2, + MAP3, +// STENCIL, +// COSINE_T, +// ADDITIONAL_TENSOR_OPERATION TENSOR_OP_END }; @@ -269,6 +280,24 @@ void GPUNodeConfiguration::print() { case TENSOR_OP::SOFTMAX : DEBUG("softmax"); break; + case TENSOR_OP::FFT : + DEBUG("fft"); + break; + case TENSOR_OP::REDUCE : + DEBUG("reduce"); + break; + case TENSOR_OP::PROJECTIVE_T : + DEBUG("projectiveT"); + break; + case TENSOR_OP::MAP1 : + DEBUG("map1"); + break; + case TENSOR_OP::MAP2 : + DEBUG("map2"); + break; + case TENSOR_OP::MAP3 : + DEBUG("map3"); + break; default : ERROR("Unknown tensor operation."); break; @@ -288,6 +317,12 @@ void GPUNodeConfiguration::print() { case APPROX::PERFORATION : DEBUG("perf"); break; + case APPROX::INPUT_SAMPLING : + DEBUG("input_samp"); + break; + case APPROX::REDUCTION_SAMPLING : + DEBUG("red_samp"); + break; default: ERROR("Unknown approximation option"); break; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h index 9fd4a578318afe3d9f85097474396a351900354b..e2e78f1d10c048d73755df73d553b3932ab72d24 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h @@ -422,7 +422,7 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ norms->mean_l2 = mean_l2; norms->orig_inf_norm = 0.0; - // Relative metrics (relative to distribution) - suitable for PROMISE + // 
Relative metrics (relative to distribution) norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; norms->inf_norm = 0.0; @@ -797,11 +797,11 @@ void* addGaussianError(void* x_ptr, int error_scale){ Tensor* x = (Tensor*) x_ptr; size_t* dim_sizes = x->dims.dim_sizes; - Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format, + Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format, dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); - Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format, + Tensor* x_original = (Tensor*) create4DTensor(x->cur_type, x->data_format, dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); @@ -876,6 +876,7 @@ void initPromiseRandValues(Tensor* bias, int error_scale){ } +// NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16 // Routine for Adding PROMISE bitline swing error void* addPromiseError(void* x_ptr, int error_scale){ @@ -889,7 +890,7 @@ void* addPromiseError(void* x_ptr, int error_scale){ Tensor* x = (Tensor*) x_ptr; size_t* dim_sizes = x->dims.dim_sizes; - Tensor* bias = (Tensor*) create4DTensor(x->data_type, x->data_format, + Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format, dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); @@ -955,6 +956,7 @@ void* quantizeTensorPromise(void* input_ptr, float min, float max){ INFO("QuantizeTensorPROMISE \n"); Tensor* input = (Tensor*) input_ptr; + int quantize_range = 256; float input_range = max - min; @@ -967,8 +969,10 @@ void* quantizeTensorPromise(void* input_ptr, float min, float max){ hostToDeviceCopy(input); - quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data, input->num_elems, mul_factor, min, max); + quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data, + input->num_elems, mul_factor, min, max); + return input; } diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h index 252427c65379aa977237652eb4435e685dbc3403..4c2fbe806d1758118f6d55c079f9c75de42599d8 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/fp16_conversion.h @@ -31,6 +31,12 @@ // // It is recommended to use the more robust versions in production code. 
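+// A minimal round-trip sketch (assumes a float_to_half counterpart is defined
+// alongside half_to_float in this header; the value is illustrative only):
+//
+//   half h = float_to_half(0.1f);
+//   float back = half_to_float(h);  // equals 0.1f only up to FP16 precision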
+ +#ifndef FP16_CONV_HEADER +#define FP16_CONV_HEADER + + + typedef unsigned uint; union FP32 @@ -111,4 +117,8 @@ static float half_to_float(half hf) o.u |= (h.u & 0x8000) << 16; // sign bit return o.f; -} \ No newline at end of file +} + + + +#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h index cc1deccd058b02d1d2db6ef58c9be4ca48589231..230cb31f4de4740428737e52ad2834908566a07b 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h @@ -48,6 +48,8 @@ std::vector<void*> tensors_ptr; std::vector<void*> host_ptr; std::vector<void*> obj_ptr; +std::unordered_map<void*, int> tracked_tensors; + // Autotuning data std::unordered_map<int, int> skip_tensors; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h index f13e82f3aecf08c757341dda35d86a81b542180d..94e1a635b5a6baec9fec6c91509caee5cf287e01 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/half_precision_api.h @@ -1,4 +1,9 @@ + +#ifndef HALF_API_HEADER +#define HALF_API_HEADER + + #include <stdio.h> #include <stdarg.h> #include <cstdio> @@ -578,7 +583,6 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){ hostToDeviceCopy(x); hostToDeviceCopy(bias); - size_t* x_dims = x->dims.dim_sizes; //**** Data conversion from float to half profileEvent("F2H_start"); @@ -611,3 +615,4 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){ +#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h index 911f42b955a72cb756aadc1fc78231187ef3394e..21c6df7f1749e891dba257bbb1933c3beefb8c4f 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h @@ -735,6 +735,30 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG ("Found softmax operation\n"); NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX); idx++; + } else if (tokens[idx] == "fft") { + DEBUG ("Found fft operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT); + idx++; + } else if (tokens[idx] == "reduce") { + DEBUG ("Found reduce operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE); + idx++; + } else if (tokens[idx] == "projectiveT") { + DEBUG ("Found projectiveT operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T); + idx++; + } else if (tokens[idx] == "map1") { + DEBUG ("Found map1 operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1); + idx++; + } else if (tokens[idx] == "map2") { + DEBUG ("Found map2 operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2); + idx++; + } else if (tokens[idx] == "map3") { + DEBUG ("Found map3 operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3); + idx++; } else /*Not a new operation. 
This means an approximation option*/ if (tokens[idx] == "fp32") { DEBUG("Found fp32 option\n"); @@ -756,6 +780,18 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG("perf parameter: %d\n", perf); NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf); idx += 2; + } else if (tokens[idx] == "input_samp") { + DEBUG("Found input_samp option\n"); + int input_samp = std::stoi(tokens[idx+1]); + DEBUG("input_samp parameter: %d\n", input_samp); + NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, input_samp); + idx += 2; + } else if (tokens[idx] == "red_samp") { + DEBUG("Found red_samp option\n"); + int red_samp = std::stoi(tokens[idx+1]); + DEBUG("red_samp parameter: %d\n", red_samp); + NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp); + idx += 2; } // TODO: other approximation options handled here diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu index 9d4b7e0d03b766db8b4a0e0d5c1273bdd4ee74d8..282a0cbb68de4f033b46cdc5c4a8ad69aa1f20c0 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu @@ -84,6 +84,8 @@ void freeTensor(void* tensor_ptr){ cudaFree(tensor->gpu_data); + cudaFree(tensor->gpu_half_data); + tensor->gpu_data = NULL; free(tensor->host_data); tensor->host_data = NULL; @@ -137,6 +139,10 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){ } tensors_ptr.push_back(tensor->gpu_data); + tensors_ptr.push_back(tensor->gpu_half_data); + + tracked_tensors[tensor] = 1; // For FP16-FP32 data handling + host_ptr.push_back(tensor->host_data); obj_ptr.push_back(tensor); //host_ptr.push_back(tensor->host_data); @@ -323,12 +329,18 @@ extern "C"{ void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){ Tensor* tensor = (Tensor*) tensor_ptr; - - if(tensor->size_in_bytes != size_in_bytes){ + + size_t host_size_in_bytes = tensor->num_elems * 4; + //if(tensor->size_in_bytes != size_in_bytes){ + if(host_size_in_bytes != size_in_bytes){ ERROR("The destination and source sizes don't match"); } std::memcpy(tensor->host_data, data_ptr, size_in_bytes); + + changeTensorPlacement(tensor, HOST); + + tensor->cur_type = float_type; } @@ -422,10 +434,13 @@ extern "C"{ -bool ONLINE_PROFILING = false; +bool ONLINE_PROFILING = false; // true; void convertToFP16(struct Tensor* tensor){ + + if(tensor == NULL) + return; printf("**** cur_type = %d , half_type = %d \n", tensor->cur_type, half_type); @@ -443,7 +458,10 @@ void convertToFP16(struct Tensor* tensor){ if(tensor->gpu_half_data == NULL) checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU - tensors_ptr.push_back(tensor->gpu_half_data); + + // If Tensor is one of Tracked (has to free per batch) then track all data types + if(tracked_tensors.find(tensor) != tracked_tensors.end()) + tensors_ptr.push_back(tensor->gpu_half_data); f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data); @@ -454,6 +472,9 @@ void convertToFP16(struct Tensor* tensor){ void convertToFP32(struct Tensor* tensor){ + if(tensor == NULL) + return; + // Need this check for both offline and online profiling path if (tensor->cur_type == float_type) return; @@ -468,7 +489,12 @@ void convertToFP32(struct Tensor* tensor){ 
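+ /* Bookkeeping sketch (an assumption based on the tracked_tensors additions
+ in this patch): a converted buffer is pushed to tensors_ptr only when its
+ Tensor is still in tracked_tensors, presumably so long-lived weight tensors
+ created before startMemTracking() are not freed by the per-batch cleanup. */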
checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes); } - + + + // If this tensor is tracked (i.e., freed at the end of every batch), also register its FP32 buffer for the per-batch sweep + if(tracked_tensors.find(tensor) != tracked_tensors.end()) + tensors_ptr.push_back(tensor->gpu_data); + h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data); tensor->cur_type = float_type; @@ -479,6 +505,9 @@ void convertToFP32(struct Tensor* tensor){ void convertToFP32_offline(struct Tensor* tensor){ + if(tensor == NULL) + return; + if(ONLINE_PROFILING){ return; } @@ -493,7 +522,11 @@ void convertToFP32_offline(struct Tensor* tensor){ checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes); } - + + // If this tensor is tracked (i.e., freed at the end of every batch), also register its FP32 buffer for the per-batch sweep + if(tracked_tensors.find(tensor) != tracked_tensors.end()) + tensors_ptr.push_back(tensor->gpu_data); + h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data); tensor->cur_type = float_type; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index eb7433afc59a862fbd6e7e0d7d153eb8080f459b..9e58f36a402844c33a1cb665ae4113e6e6a8534f 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -134,6 +134,8 @@ void startMemTracking(){ tensors_ptr.clear(); host_ptr.clear(); obj_ptr.clear(); + + tracked_tensors.clear(); } @@ -1287,8 +1289,7 @@ void* FCLayer_GPU(void* input, /*********** PROMISE API **************/ - - +/* void* ConvLayer_PROMISE(void* input, float i_min, float i_max, void* filter, float w_min, float w_max, void* bias, float b_min, float b_max, @@ -1359,6 +1360,10 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); */ + + + + /* ----- } else if(swing == 9 || (swing >= 16 && swing <= 19) ){ //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, @@ -1390,7 +1395,8 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); */ - + + /*------ } else if(swing == 10){ conv_out = tensorHalfConvolution(input, filter, @@ -1549,7 +1555,7 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, return activation_out; } - +*****/ diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/soc_simulator/src/table_generator.py similarity index 100% rename from llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py rename to llvm/projects/soc_simulator/src/table_generator.py diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile index 4e762ea9894405bb375f518b65c209b4129d9f70..83b4dc9431ee84051def8a0f6850e7f2c194f033 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile @@ -1,5 +1,6 @@ DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks # NOTE: can configure build directory +#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/ HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT) CC = $(HPVM_BUILD_DIR)/bin/clang++ @@ -15,9 +16,10 @@ APP = alexnet TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
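The convertToFP16/convertToFP32 routines above implement a lazy, cached precision switch: each Tensor keeps both an FP32 buffer (gpu_data) and an FP16 buffer (gpu_half_data), converts only when cur_type disagrees with the requested type, and registers freshly (re)allocated buffers for per-batch freeing only when the tensor appears in tracked_tensors. A minimal host-side sketch of that bookkeeping; the names Tensor, tracked_tensors, convertToFP16 and freeBatchMemory mirror the runtime, std::malloc/std::free stand in for cudaMalloc/cudaFree, and the per-tensor sweep simplifies the runtime's tensors_ptr vector:

#include <cstddef>
#include <cstdlib>
#include <unordered_map>

enum DataType { float_type, half_type };

struct Tensor {
  DataType cur_type = float_type;
  std::size_t num_elems = 0;
  void* gpu_data = nullptr;       // FP32 buffer (cudaMalloc'd in the real runtime)
  void* gpu_half_data = nullptr;  // FP16 buffer
};

// Tensors registered here have their buffers released wholesale every batch.
static std::unordered_map<Tensor*, int> tracked_tensors;

// Lazy FP32 -> FP16 switch: allocate the half buffer on first use and convert
// only when the cached type disagrees with the requested one.
void convertToFP16(Tensor* t) {
  if (t == nullptr || t->cur_type == half_type) return;
  if (t->gpu_half_data == nullptr)
    t->gpu_half_data = std::malloc(t->num_elems * 2);  // 2 bytes per half
  // f2h(t->gpu_data, t->num_elems, t->gpu_half_data) would convert the payload here
  t->cur_type = half_type;
}

// Per-batch cleanup in the spirit of startMemTracking()/freeBatchMemory():
// free *both* precision buffers of every tracked tensor, then forget them.
void freeBatchMemory() {
  for (auto& entry : tracked_tensors) {
    Tensor* t = entry.first;
    std::free(t->gpu_data);      t->gpu_data = nullptr;
    std::free(t->gpu_half_data); t->gpu_half_data = nullptr;
  }
  tracked_tensors.clear();
}

Because startMemTracking() clears tracked_tensors, weights allocated before it is called drop out of the sweep, while tensors allocated afterwards (the per-batch activations) remain tracked and are released by freeBatchMemory() at the end of each iteration.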
TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a +PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a +SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a - CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL @@ -58,15 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc - $(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc - $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop.bc + #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_linked.bc - $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_loop_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_linked $(LINKER_FLAGS) - #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) + #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/tuner_confs_base.txt new file mode 100644 index 
0000000000000000000000000000000000000000..c3bc2335227cf06169b1f3d105314fdc9647d97d --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/tuner_confs_base.txt @@ -0,0 +1,20 @@ ++++++ +conf1 1 0 79.9 0 +1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 1 add fp32 1 tanh fp32 1 +4 gpu conv fp32 1 add fp32 1 tanh fp32 1 +5 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +6 gpu mul fp32 1 add fp32 1 +7 gpu softmax fp32 1 +----- ++++++ +conf2 1.5 0 79.9 0 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 1 add fp16 1 tanh fp16 1 +4 gpu conv fp16 1 add fp16 1 tanh fp16 1 +5 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 1 add fp16 1 +7 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp index ee07bdd8f9901f1582d5f7642a2a86c099397a14..d92bc0c45d1115620d529aea4636ece8d3d62127 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp @@ -9,8 +9,10 @@ #include <tensorTypes.h> #include <tensorUtils.h> + + void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 5, 5, 1, 1); @@ -18,7 +20,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -26,7 +28,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_2_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -34,7 +36,7 @@ void var_2_node(void* t1, size_t bytes_t1) { } void var_3_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -42,7 +44,7 @@ void var_3_node(void* t1, size_t bytes_t1) { } void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -50,7 +52,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -58,7 +60,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_6_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -66,7 +68,7 @@ void var_6_node(void* t1, size_t bytes_t1) { } void var_7_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); 
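An aside on the tuner_confs_base.txt format introduced above: after the header line (e.g. "conf2 1.5 0 79.9 0", presumably the configuration name followed by speedup, energy, accuracy and accuracy-loss figures), each line names a layer, a device, and a sequence of tensor operations, each followed by (approximation, knob) pairs; this is exactly the token stream readConfigurationFile() walks earlier in this patch. A small stand-alone parser for one layer line under those assumptions (OpChoice and parseLayerLine are illustrative names, not runtime API):

#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

// One tensor op plus its approximation knobs, e.g. "conv fp16 1".
struct OpChoice {
  std::string op;                                   // conv, add, tanh, pool_max, ...
  std::vector<std::pair<std::string, int>> approx;  // (fp32|fp16|perf|input_samp|red_samp, knob)
};

// Parses a layer line such as "1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1".
std::vector<OpChoice> parseLayerLine(const std::string& line) {
  std::istringstream in(line);
  int layer_id = 0;
  std::string device;
  in >> layer_id >> device;  // e.g. 1 and "gpu"

  std::vector<OpChoice> ops;
  std::string tok;
  while (in >> tok) {
    if (tok == "fp32" || tok == "fp16" || tok == "perf" ||
        tok == "input_samp" || tok == "red_samp") {  // approximation option + knob
      int knob = 0;
      in >> knob;
      if (!ops.empty()) ops.back().approx.push_back({tok, knob});
    } else {                                         // start of a new tensor operation
      ops.push_back({tok, {}});
    }
  }
  return ops;
}

int main() {
  for (const OpChoice& c :
       parseLayerLine("1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1"))
    std::cout << c.op << ": " << c.approx.size() << " knob(s)\n";
}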
void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -74,7 +76,7 @@ void var_7_node(void* t1, size_t bytes_t1) { } void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -82,7 +84,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -90,7 +92,7 @@ void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_10_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -98,7 +100,7 @@ void var_10_node(void* t1, size_t bytes_t1) { } void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -106,7 +108,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -114,7 +116,7 @@ void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_13_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -122,7 +124,7 @@ void var_13_node(void* t1, size_t bytes_t1) { } void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -130,7 +132,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -138,7 +140,7 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_16_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -146,7 +148,7 @@ void var_16_node(void* t1, size_t bytes_t1) { } void var_17_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -154,7 +156,7 @@ void var_17_node(void* t1, size_t bytes_t1) { } void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_mul(t1, t2); @@ -162,7 +164,7 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); 
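// (As throughout this file, flipping the hint from visc::CUDNN_TARGET to
// visc::PROMISE_TARGET takes the node off the fixed fp32 cuDNN path and lets the
// runtime configuration map it to a PROMISE/approximate implementation instead.)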
__visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -177,6 +179,8 @@ void var_20_node(void* t1, size_t bytes_t1) { __visc__return(2, r, (size_t) 0); } + + void root(void* input, size_t input_bytes, void* conv2d_1_w, size_t conv2d_1_w_bytes, void* conv2d_1_b, size_t conv2d_1_b_bytes, @@ -371,9 +375,10 @@ int main(){ std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/alexnet_cifar10_test/"); - + std::string input_path = dir_prefix + std::string("input.bin"); + //void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32); std::string labels_path = dir_prefix + std::string("labels32.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(),10000); + uint8_t* labels = readLabels(labels_path.c_str(),5000); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); @@ -404,6 +409,8 @@ int main(){ __visc__init(); RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); + //args->input = input; + //args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -429,48 +436,38 @@ int main(){ args->dense_1_b = dense_1_b; args->dense_1_b_bytes = 0; - int batch_size = 500; - int test_input_size = 10000; - int batch_count = test_input_size / batch_size; - - std::string input_path = dir_prefix + std::string("input.bin"); + int batch_size = 500; + int test_input_size = 10000; + int batch_count = test_input_size / batch_size; + void* input = create4DTensor(0,nchw,batch_size,3,32,32); - startMemTracking(); - for (int i = 0; i < batch_count; i++){ + startProfiling(); - int start = i * batch_size; - int end = (i + 1) * batch_size; + for (int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; copyInputBatch(input_path.c_str(),start,end,3,32,32, input); - - args->input = input; + + args->input = input; args->input_bytes = 0; - - //void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* dfg = __visc__launch(0, root, (void*) args); + + void* dfg = __visc__launch(0, root, (void*) args); __visc__wait(dfg); + + void *result = static_cast<RootIn*>(args)->input; + hpvm_request_tensor(result, 0); + + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - void *result = static_cast<RootIn*>(args)->input; - hpvm_request_tensor(result, 0); - - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - computeAccuracy3(labels, result); - - llvm_hpvm_invokeRtControl2(result, labels); - freeBatchMemory(); } - - - __visc__cleanup(); - - + stopProfiling(); + __visc__cleanup(); + return 0; - -} +} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/tuner_confs_base.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b4d8bcd26563a1f398df34800ad2b70f24a670 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/tuner_confs_base.txt @@ -0,0 +1,16 @@ ++++++ +conf1 1 0 98.9 0 +1 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 +2 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 +3 gpu mul fp32 1 add fp32 1 tanh fp32 1 +4 gpu mul fp32 1 add fp32 1 tanh fp32 1 +5 gpu softmax fp32 1 +----- ++++++ +conf2 1.5 0 98.9 0 +1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv fp16 1 add fp16 1 
pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/quant_ranges_rt.txt new file mode 100644 index 0000000000000000000000000000000000000000..75211f858c1cc9eb6a186dc7f90c143ea820ef67 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/quant_ranges_rt.txt @@ -0,0 +1,15 @@ +1 -1.9892114 2.126797 -2.19630692005 1.34758170414 0.0 0.0 -60.892750473 51.9925691605 +2 0.0 5.71354155397 -0.931772116065 1.07742589378 0.0 0.0 -6.51858950329 6.81084251881 +3 0.0 4.93213940287 -0.531654466152 0.57537904036 0.0 0.0 -4.48263123512 3.96730119753 +4 0.0 4.10326339769 -0.362340988219 0.407691390038 0.0 0.0 -4.04261828327 3.8867793293 +5 0.0 5.38322130251 -0.313120054901 0.293576799393 0.0 0.0 -5.92146921539 4.33867932415 +6 0.0 4.31673815441 -0.232992478013 0.258029025793 0.0 0.0 -4.20778994751 3.93243697071 +7 0.0 5.8304081068 -0.202337772191 0.189983081758 0.0 0.0 -6.29828691578 4.84813511753 +8 0.0 4.44641780996 -0.174427356511 0.176958308667 0.0 0.0 -4.34791088581 3.61443646955 +9 0.0 4.5180956049 -0.145467961878 0.15256431669 0.0 0.0 -3.02877027559 2.94873657799 +10 0.0 6.34857563496 -0.130258745223 0.135582433432 0.0 0.0 -4.22931008053 3.53150463724 +11 0.0 5.22100311041 -0.119001727596 0.125363747835 0.0 0.0 -4.03820378017 4.00400940704 +12 0.0 5.73249834776 -0.108397216856 0.116256686077 0.0 0.0 -3.31110151148 4.46293323326 +13 0.0 7.24049821186 -0.0862374496162 0.0885944995135 0.0 0.0 -4.17543139458 6.2043294754 +14 0.0 7.81395883465 -0.0681302513927 0.0700202777982 0.0 0.0 -10.9205664234 2.64429125786 +15 0.0 2.86920666504 -0.223010196954 0.14426593782 -0.1654396 0.23336112 -12.2459499588 23.8053251343 diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/tuner_confs_base.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b5c1727ad0dc9e24310e4c86e116894051c84b3 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/tuner_confs_base.txt @@ -0,0 +1,174 @@ ++++++ +conf1 1 0 84.8 0 +1 gpu conv fp32 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 +7 gpu conv fp32 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 +13 gpu conv fp32 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 +25 gpu conv fp32 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 +31 gpu conv fp32 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 +37 gpu conv fp32 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu group_conv fp32 1 +41 gpu batchnorm fp32 1 +42 gpu relu fp32 1 +43 gpu conv fp32 1 +44 gpu batchnorm fp32 1 +45 gpu relu fp32 1 +46 gpu group_conv fp32 1 +47 gpu batchnorm fp32 1 +48 gpu relu fp32 1 +49 gpu conv fp32 1 +50 gpu batchnorm fp32 1 +51 gpu relu fp32 1 +52 gpu group_conv fp32 1 +53 gpu batchnorm fp32 1 +54 gpu relu fp32 1 +55 gpu conv fp32 1 +56 gpu batchnorm 
fp32 1 +57 gpu relu fp32 1 +58 gpu group_conv fp32 1 +59 gpu batchnorm fp32 1 +60 gpu relu fp32 1 +61 gpu conv fp32 1 +62 gpu batchnorm fp32 1 +63 gpu relu fp32 1 +64 gpu group_conv fp32 1 +65 gpu batchnorm fp32 1 +66 gpu relu fp32 1 +67 gpu conv fp32 1 +68 gpu batchnorm fp32 1 +69 gpu relu fp32 1 +70 gpu group_conv fp32 1 +71 gpu batchnorm fp32 1 +72 gpu relu fp32 1 +73 gpu conv fp32 1 +74 gpu batchnorm fp32 1 +75 gpu relu fp32 1 +76 gpu group_conv fp32 1 +77 gpu batchnorm fp32 1 +78 gpu relu fp32 1 +79 gpu conv fp32 1 +80 gpu batchnorm fp32 1 +81 gpu relu fp32 1 +82 gpu pool_mean fp32 1 +83 gpu mul fp32 1 add fp32 1 +84 gpu softmax fp32 1 +----- ++++++ +conf2 1.5 0 84.8 0 +1 gpu conv fp16 1 +2 gpu batchnorm fp16 1 +3 gpu relu fp16 1 +4 gpu group_conv fp16 1 +5 gpu batchnorm fp16 1 +6 gpu relu fp16 1 +7 gpu conv fp16 1 +8 gpu batchnorm fp16 1 +9 gpu relu fp16 1 +10 gpu group_conv fp16 1 +11 gpu batchnorm fp16 1 +12 gpu relu fp16 1 +13 gpu conv fp16 1 +14 gpu batchnorm fp16 1 +15 gpu relu fp16 1 +16 gpu group_conv fp16 1 +17 gpu batchnorm fp16 1 +18 gpu relu fp16 1 +19 gpu conv fp16 1 +20 gpu batchnorm fp16 1 +21 gpu relu fp16 1 +22 gpu group_conv fp16 1 +23 gpu batchnorm fp16 1 +24 gpu relu fp16 1 +25 gpu conv fp16 1 +26 gpu batchnorm fp16 1 +27 gpu relu fp16 1 +28 gpu group_conv fp16 1 +29 gpu batchnorm fp16 1 +30 gpu relu fp16 1 +31 gpu conv fp16 1 +32 gpu batchnorm fp16 1 +33 gpu relu fp16 1 +34 gpu group_conv fp16 1 +35 gpu batchnorm fp16 1 +36 gpu relu fp16 1 +37 gpu conv fp16 1 +38 gpu batchnorm fp16 1 +39 gpu relu fp16 1 +40 gpu group_conv fp16 1 +41 gpu batchnorm fp16 1 +42 gpu relu fp16 1 +43 gpu conv fp16 1 +44 gpu batchnorm fp16 1 +45 gpu relu fp16 1 +46 gpu group_conv fp16 1 +47 gpu batchnorm fp16 1 +48 gpu relu fp16 1 +49 gpu conv fp16 1 +50 gpu batchnorm fp16 1 +51 gpu relu fp16 1 +52 gpu group_conv fp16 1 +53 gpu batchnorm fp16 1 +54 gpu relu fp16 1 +55 gpu conv fp16 1 +56 gpu batchnorm fp16 1 +57 gpu relu fp16 1 +58 gpu group_conv fp16 1 +59 gpu batchnorm fp16 1 +60 gpu relu fp16 1 +61 gpu conv fp16 1 +62 gpu batchnorm fp16 1 +63 gpu relu fp16 1 +64 gpu group_conv fp16 1 +65 gpu batchnorm fp16 1 +66 gpu relu fp16 1 +67 gpu conv fp16 1 +68 gpu batchnorm fp16 1 +69 gpu relu fp16 1 +70 gpu group_conv fp16 1 +71 gpu batchnorm fp16 1 +72 gpu relu fp16 1 +73 gpu conv fp16 1 +74 gpu batchnorm fp16 1 +75 gpu relu fp16 1 +76 gpu group_conv fp16 1 +77 gpu batchnorm fp16 1 +78 gpu relu fp16 1 +79 gpu conv fp16 1 +80 gpu batchnorm fp16 1 +81 gpu relu fp16 1 +82 gpu pool_mean fp16 1 +83 gpu mul fp16 1 add fp16 1 +84 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b3c537c5fbe845dbf9c97e24e8841e45ed3084f --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges.txt @@ -0,0 +1,8 @@ +-1.9892114 2.126797 -1.51646211648 1.64720817745 -9.86898064232 10.5609560184 +0.0 6.82138112736 -1.18343908739 1.27315966272 -9.87599849701 7.51305247974 +0.0 4.82606745577 -0.599876856983 0.681207345724 -5.63328983307 5.17789223576 +0.0 4.02646304417 -0.455596786201 0.494261391461 -5.31680394173 4.60585025024 +0.0 4.53264906311 -0.356576155901 0.338216508806 -6.1012511816 4.36305006886 +0.0 3.98747043872 -0.285027833283 0.286046403348 -4.24385170364 3.48625040674 +0.0 6.56306590176 -0.189464023232 0.190123907179 -4.93811571312 3.53836347675 +0.0 
1.89083880007 -0.351403944016 0.422872786462 -0.23878151 0.26507422 -14.6308162231 27.2725212326 diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges_rt.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a83ec095ec99c762d5ff05e2749db13db47909a --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/quant_ranges_rt.txt @@ -0,0 +1,8 @@ +1 -1.9892114 2.126797 -1.51646211648 1.64720817745 -9.86898064232 10.5609560184 +2 0.0 6.82138112736 -1.18343908739 1.27315966272 -9.87599849701 7.51305247974 +3 0.0 4.82606745577 -0.599876856983 0.681207345724 -5.63328983307 5.17789223576 +4 0.0 4.02646304417 -0.455596786201 0.494261391461 -5.31680394173 4.60585025024 +5 0.0 4.53264906311 -0.356576155901 0.338216508806 -6.1012511816 4.36305006886 +6 0.0 3.98747043872 -0.285027833283 0.286046403348 -4.24385170364 3.48625040674 +7 0.0 6.56306590176 -0.189464023232 0.190123907179 -4.93811571312 3.53836347675 +8 0.0 1.89083880007 -0.351403944016 0.422872786462 -0.23878151 0.26507422 -14.6308162231 27.2725212326 diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/tuner_confs_base.txt new file mode 100644 index 0000000000000000000000000000000000000000..501dfcc5e76d637d4e4136ac1c2486b6b4cbe639 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/data/tuner_confs_base.txt @@ -0,0 +1,90 @@ ++++++ +conf1 1 0 87.59 0 +1 gpu conv fp32 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 +7 gpu conv fp32 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 +13 gpu conv fp32 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 +25 gpu conv fp32 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 +31 gpu conv fp32 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 +37 gpu conv fp32 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu pool_mean fp32 1 +41 gpu mul fp32 1 add fp32 1 +42 gpu softmax fp32 1 +----- ++++++ +conf2 1.5 0 87.59 0 +1 gpu conv fp16 1 +2 gpu batchnorm fp16 1 +3 gpu relu fp16 1 +4 gpu group_conv fp16 1 +5 gpu batchnorm fp16 1 +6 gpu relu fp16 1 +7 gpu conv fp16 1 +8 gpu batchnorm fp16 1 +9 gpu relu fp16 1 +10 gpu group_conv fp16 1 +11 gpu batchnorm fp16 1 +12 gpu relu fp16 1 +13 gpu conv fp16 1 +14 gpu batchnorm fp16 1 +15 gpu relu fp16 1 +16 gpu group_conv fp16 1 +17 gpu batchnorm fp16 1 +18 gpu relu fp16 1 +19 gpu conv fp16 1 +20 gpu batchnorm fp16 1 +21 gpu relu fp16 1 +22 gpu group_conv fp16 1 +23 gpu batchnorm fp16 1 +24 gpu relu fp16 1 +25 gpu conv fp16 1 +26 gpu batchnorm fp16 1 +27 gpu relu fp16 1 +28 gpu group_conv fp16 1 +29 gpu batchnorm fp16 1 +30 gpu relu fp16 1 +31 gpu conv fp16 1 +32 gpu batchnorm fp16 1 +33 gpu relu fp16 1 +34 gpu group_conv fp16 1 +35 gpu batchnorm fp16 1 +36 gpu relu fp16 1 +37 gpu conv fp16 1 +38 gpu batchnorm fp16 1 +39 gpu relu fp16 1 +40 gpu pool_mean fp16 1 +41 gpu mul fp16 1 add fp16 1 +42 gpu softmax fp32 
1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/src/mobilenet_shallow_promise.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/src/mobilenet_shallow_promise.cpp new file mode 100644 index 0000000000000000000000000000000000000000..361fa0c1c44151cbefc98b6c983d17303d254eef --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet_shallow/src/mobilenet_shallow_promise.cpp @@ -0,0 +1,1225 @@ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <cstring> +#include <visc.h> +#include <tensorTypes.h> +#include <tensorUtils.h> + +void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_2_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_3_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 1, 1, 1, 32); + __visc__return(2, r, (size_t) 0); +} + +void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_5_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_6_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_8_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 2, 2, 1, 64); + __visc__return(2, r, (size_t) 0); +} + +void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = 
__visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_11_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_14_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 1, 1, 1, 128); + __visc__return(2, r, (size_t) 0); +} + +void var_16_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_17_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_20_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 2, 2, 1, 128); + __visc__return(2, r, (size_t) 0); +} + +void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_23_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_24_node(void* t1, size_t bytes_t1, void* t2, size_t 
bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_26_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_27_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 1, 1, 1, 256); + __visc__return(2, r, (size_t) 0); +} + +void var_28_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_29_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_30_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_31_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_32_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_33_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_group_convolution(t1, t2, 1, 1, 2, 2, 1, 256); + __visc__return(2, r, (size_t) 0); +} + +void var_34_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_35_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_36_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_37_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { + 
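// Batch-norm node: t1 is the input tensor and t2..t5 carry gamma, beta, the
// running mean and the running variance (the batch_normalization_* weights bound
// to ports 2-9 in root() below); the trailing 0.001 passed to
// __visc__tensor_batchnorm is the epsilon added to the variance.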
__visc__hint(visc::CUDNN_TARGET); + __visc__attributes(5, t1, t2, t3, t4, t5, 0); + + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); + __visc__return(2, r, (size_t) 0); +} + +void var_38_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_39_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_pool_mean(t1, 2, 2, 0, 0, 2, 2); + __visc__return(2, r, (size_t) 0); +} + +void var_40_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_mul(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_41_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::PROMISE_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_42_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_softmax(t1); + __visc__return(2, r, (size_t) 0); +} + +void root(void* input, size_t input_bytes, + void* conv2d_1_w, size_t conv2d_1_w_bytes, + void* batch_normalization_1_gamma, size_t batch_normalization_1_gamma_bytes, + void* batch_normalization_1_beta, size_t batch_normalization_1_beta_bytes, + void* batch_normalization_1_mean, size_t batch_normalization_1_mean_bytes, + void* batch_normalization_1_variance, size_t batch_normalization_1_variance_bytes, + void* depthwise_conv2d_1_w, size_t depthwise_conv2d_1_w_bytes, + void* batch_normalization_2_gamma, size_t batch_normalization_2_gamma_bytes, + void* batch_normalization_2_beta, size_t batch_normalization_2_beta_bytes, + void* batch_normalization_2_mean, size_t batch_normalization_2_mean_bytes, + void* batch_normalization_2_variance, size_t batch_normalization_2_variance_bytes, + void* conv2d_2_w, size_t conv2d_2_w_bytes, + void* batch_normalization_3_gamma, size_t batch_normalization_3_gamma_bytes, + void* batch_normalization_3_beta, size_t batch_normalization_3_beta_bytes, + void* batch_normalization_3_mean, size_t batch_normalization_3_mean_bytes, + void* batch_normalization_3_variance, size_t batch_normalization_3_variance_bytes, + void* depthwise_conv2d_2_w, size_t depthwise_conv2d_2_w_bytes, + void* batch_normalization_4_gamma, size_t batch_normalization_4_gamma_bytes, + void* batch_normalization_4_beta, size_t batch_normalization_4_beta_bytes, + void* batch_normalization_4_mean, size_t batch_normalization_4_mean_bytes, + void* batch_normalization_4_variance, size_t batch_normalization_4_variance_bytes, + void* conv2d_3_w, size_t conv2d_3_w_bytes, + void* batch_normalization_5_gamma, size_t batch_normalization_5_gamma_bytes, + void* batch_normalization_5_beta, size_t batch_normalization_5_beta_bytes, + void* batch_normalization_5_mean, size_t batch_normalization_5_mean_bytes, + void* batch_normalization_5_variance, size_t batch_normalization_5_variance_bytes, + void* depthwise_conv2d_3_w, size_t depthwise_conv2d_3_w_bytes, + void* batch_normalization_6_gamma, size_t batch_normalization_6_gamma_bytes, + void* batch_normalization_6_beta, size_t batch_normalization_6_beta_bytes, + void* batch_normalization_6_mean, size_t batch_normalization_6_mean_bytes, + void* batch_normalization_6_variance, size_t batch_normalization_6_variance_bytes, + void* 
conv2d_4_w, size_t conv2d_4_w_bytes, + void* batch_normalization_7_gamma, size_t batch_normalization_7_gamma_bytes, + void* batch_normalization_7_beta, size_t batch_normalization_7_beta_bytes, + void* batch_normalization_7_mean, size_t batch_normalization_7_mean_bytes, + void* batch_normalization_7_variance, size_t batch_normalization_7_variance_bytes, + void* depthwise_conv2d_4_w, size_t depthwise_conv2d_4_w_bytes, + void* batch_normalization_8_gamma, size_t batch_normalization_8_gamma_bytes, + void* batch_normalization_8_beta, size_t batch_normalization_8_beta_bytes, + void* batch_normalization_8_mean, size_t batch_normalization_8_mean_bytes, + void* batch_normalization_8_variance, size_t batch_normalization_8_variance_bytes, + void* conv2d_5_w, size_t conv2d_5_w_bytes, + void* batch_normalization_9_gamma, size_t batch_normalization_9_gamma_bytes, + void* batch_normalization_9_beta, size_t batch_normalization_9_beta_bytes, + void* batch_normalization_9_mean, size_t batch_normalization_9_mean_bytes, + void* batch_normalization_9_variance, size_t batch_normalization_9_variance_bytes, + void* depthwise_conv2d_5_w, size_t depthwise_conv2d_5_w_bytes, + void* batch_normalization_10_gamma, size_t batch_normalization_10_gamma_bytes, + void* batch_normalization_10_beta, size_t batch_normalization_10_beta_bytes, + void* batch_normalization_10_mean, size_t batch_normalization_10_mean_bytes, + void* batch_normalization_10_variance, size_t batch_normalization_10_variance_bytes, + void* conv2d_6_w, size_t conv2d_6_w_bytes, + void* batch_normalization_11_gamma, size_t batch_normalization_11_gamma_bytes, + void* batch_normalization_11_beta, size_t batch_normalization_11_beta_bytes, + void* batch_normalization_11_mean, size_t batch_normalization_11_mean_bytes, + void* batch_normalization_11_variance, size_t batch_normalization_11_variance_bytes, + void* depthwise_conv2d_6_w, size_t depthwise_conv2d_6_w_bytes, + void* batch_normalization_12_gamma, size_t batch_normalization_12_gamma_bytes, + void* batch_normalization_12_beta, size_t batch_normalization_12_beta_bytes, + void* batch_normalization_12_mean, size_t batch_normalization_12_mean_bytes, + void* batch_normalization_12_variance, size_t batch_normalization_12_variance_bytes, + void* conv2d_7_w, size_t conv2d_7_w_bytes, + void* batch_normalization_13_gamma, size_t batch_normalization_13_gamma_bytes, + void* batch_normalization_13_beta, size_t batch_normalization_13_beta_bytes, + void* batch_normalization_13_mean, size_t batch_normalization_13_mean_bytes, + void* batch_normalization_13_variance, size_t batch_normalization_13_variance_bytes, + void* dense_1_w, size_t dense_1_w_bytes, + void* dense_1_b, size_t dense_1_b_bytes){ + + + __visc__hint(visc::CPU_TARGET); + __visc__attributes(68, input, conv2d_1_w, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, depthwise_conv2d_1_w, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, conv2d_2_w, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, depthwise_conv2d_2_w, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, conv2d_3_w, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, depthwise_conv2d_3_w, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, 
batch_normalization_6_variance, conv2d_4_w, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, depthwise_conv2d_4_w, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, conv2d_5_w, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, depthwise_conv2d_5_w, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, conv2d_6_w, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, depthwise_conv2d_6_w, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, conv2d_7_w, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, dense_1_w, dense_1_b, 0); + + + void* var_0 = __visc__createNodeND(0, var_0_node); + + __visc__bindIn(var_0, 0, 0, 0); + __visc__bindIn(var_0, 1, 1, 0); + __visc__bindIn(var_0, 2, 2, 0); + __visc__bindIn(var_0, 3, 3, 0); + + void* var_1 = __visc__createNodeND(0, var_1_node); + + __visc__edge(var_0, var_1, 1, 0, 0, 0); + __visc__edge(var_0, var_1, 1, 1, 1, 0); + __visc__bindIn(var_1, 4, 2, 0); + __visc__bindIn(var_1, 5, 3, 0); + __visc__bindIn(var_1, 6, 4, 0); + __visc__bindIn(var_1, 7, 5, 0); + __visc__bindIn(var_1, 8, 6, 0); + __visc__bindIn(var_1, 9, 7, 0); + __visc__bindIn(var_1, 10, 8, 0); + __visc__bindIn(var_1, 11, 9, 0); + + void* var_2 = __visc__createNodeND(0, var_2_node); + + __visc__edge(var_1, var_2, 1, 0, 0, 0); + __visc__edge(var_1, var_2, 1, 1, 1, 0); + + void* var_3 = __visc__createNodeND(0, var_3_node); + + __visc__edge(var_2, var_3, 1, 0, 0, 0); + __visc__edge(var_2, var_3, 1, 1, 1, 0); + __visc__bindIn(var_3, 12, 2, 0); + __visc__bindIn(var_3, 13, 3, 0); + + void* var_4 = __visc__createNodeND(0, var_4_node); + + __visc__edge(var_3, var_4, 1, 0, 0, 0); + __visc__edge(var_3, var_4, 1, 1, 1, 0); + __visc__bindIn(var_4, 14, 2, 0); + __visc__bindIn(var_4, 15, 3, 0); + __visc__bindIn(var_4, 16, 4, 0); + __visc__bindIn(var_4, 17, 5, 0); + __visc__bindIn(var_4, 18, 6, 0); + __visc__bindIn(var_4, 19, 7, 0); + __visc__bindIn(var_4, 20, 8, 0); + __visc__bindIn(var_4, 21, 9, 0); + + void* var_5 = __visc__createNodeND(0, var_5_node); + + __visc__edge(var_4, var_5, 1, 0, 0, 0); + __visc__edge(var_4, var_5, 1, 1, 1, 0); + + void* var_6 = __visc__createNodeND(0, var_6_node); + + __visc__edge(var_5, var_6, 1, 0, 0, 0); + __visc__edge(var_5, var_6, 1, 1, 1, 0); + __visc__bindIn(var_6, 22, 2, 0); + __visc__bindIn(var_6, 23, 3, 0); + + void* var_7 = __visc__createNodeND(0, var_7_node); + + __visc__edge(var_6, var_7, 1, 0, 0, 0); + __visc__edge(var_6, var_7, 1, 1, 1, 0); + __visc__bindIn(var_7, 24, 2, 0); + __visc__bindIn(var_7, 25, 3, 0); + __visc__bindIn(var_7, 26, 4, 0); + __visc__bindIn(var_7, 27, 5, 0); + __visc__bindIn(var_7, 28, 6, 0); + __visc__bindIn(var_7, 29, 7, 0); + __visc__bindIn(var_7, 30, 8, 0); + __visc__bindIn(var_7, 31, 9, 0); + + void* var_8 = __visc__createNodeND(0, var_8_node); + + __visc__edge(var_7, var_8, 1, 0, 0, 0); + __visc__edge(var_7, var_8, 1, 1, 1, 0); + + void* var_9 = __visc__createNodeND(0, var_9_node); + + __visc__edge(var_8, var_9, 1, 0, 0, 0); + __visc__edge(var_8, var_9, 1, 1, 1, 0); + __visc__bindIn(var_9, 32, 2, 0); + __visc__bindIn(var_9, 33, 3, 0); + + void* var_10 = 
__visc__createNodeND(0, var_10_node); + + __visc__edge(var_9, var_10, 1, 0, 0, 0); + __visc__edge(var_9, var_10, 1, 1, 1, 0); + __visc__bindIn(var_10, 34, 2, 0); + __visc__bindIn(var_10, 35, 3, 0); + __visc__bindIn(var_10, 36, 4, 0); + __visc__bindIn(var_10, 37, 5, 0); + __visc__bindIn(var_10, 38, 6, 0); + __visc__bindIn(var_10, 39, 7, 0); + __visc__bindIn(var_10, 40, 8, 0); + __visc__bindIn(var_10, 41, 9, 0); + + void* var_11 = __visc__createNodeND(0, var_11_node); + + __visc__edge(var_10, var_11, 1, 0, 0, 0); + __visc__edge(var_10, var_11, 1, 1, 1, 0); + + void* var_12 = __visc__createNodeND(0, var_12_node); + + __visc__edge(var_11, var_12, 1, 0, 0, 0); + __visc__edge(var_11, var_12, 1, 1, 1, 0); + __visc__bindIn(var_12, 42, 2, 0); + __visc__bindIn(var_12, 43, 3, 0); + + void* var_13 = __visc__createNodeND(0, var_13_node); + + __visc__edge(var_12, var_13, 1, 0, 0, 0); + __visc__edge(var_12, var_13, 1, 1, 1, 0); + __visc__bindIn(var_13, 44, 2, 0); + __visc__bindIn(var_13, 45, 3, 0); + __visc__bindIn(var_13, 46, 4, 0); + __visc__bindIn(var_13, 47, 5, 0); + __visc__bindIn(var_13, 48, 6, 0); + __visc__bindIn(var_13, 49, 7, 0); + __visc__bindIn(var_13, 50, 8, 0); + __visc__bindIn(var_13, 51, 9, 0); + + void* var_14 = __visc__createNodeND(0, var_14_node); + + __visc__edge(var_13, var_14, 1, 0, 0, 0); + __visc__edge(var_13, var_14, 1, 1, 1, 0); + + void* var_15 = __visc__createNodeND(0, var_15_node); + + __visc__edge(var_14, var_15, 1, 0, 0, 0); + __visc__edge(var_14, var_15, 1, 1, 1, 0); + __visc__bindIn(var_15, 52, 2, 0); + __visc__bindIn(var_15, 53, 3, 0); + + void* var_16 = __visc__createNodeND(0, var_16_node); + + __visc__edge(var_15, var_16, 1, 0, 0, 0); + __visc__edge(var_15, var_16, 1, 1, 1, 0); + __visc__bindIn(var_16, 54, 2, 0); + __visc__bindIn(var_16, 55, 3, 0); + __visc__bindIn(var_16, 56, 4, 0); + __visc__bindIn(var_16, 57, 5, 0); + __visc__bindIn(var_16, 58, 6, 0); + __visc__bindIn(var_16, 59, 7, 0); + __visc__bindIn(var_16, 60, 8, 0); + __visc__bindIn(var_16, 61, 9, 0); + + void* var_17 = __visc__createNodeND(0, var_17_node); + + __visc__edge(var_16, var_17, 1, 0, 0, 0); + __visc__edge(var_16, var_17, 1, 1, 1, 0); + + void* var_18 = __visc__createNodeND(0, var_18_node); + + __visc__edge(var_17, var_18, 1, 0, 0, 0); + __visc__edge(var_17, var_18, 1, 1, 1, 0); + __visc__bindIn(var_18, 62, 2, 0); + __visc__bindIn(var_18, 63, 3, 0); + + void* var_19 = __visc__createNodeND(0, var_19_node); + + __visc__edge(var_18, var_19, 1, 0, 0, 0); + __visc__edge(var_18, var_19, 1, 1, 1, 0); + __visc__bindIn(var_19, 64, 2, 0); + __visc__bindIn(var_19, 65, 3, 0); + __visc__bindIn(var_19, 66, 4, 0); + __visc__bindIn(var_19, 67, 5, 0); + __visc__bindIn(var_19, 68, 6, 0); + __visc__bindIn(var_19, 69, 7, 0); + __visc__bindIn(var_19, 70, 8, 0); + __visc__bindIn(var_19, 71, 9, 0); + + void* var_20 = __visc__createNodeND(0, var_20_node); + + __visc__edge(var_19, var_20, 1, 0, 0, 0); + __visc__edge(var_19, var_20, 1, 1, 1, 0); + + void* var_21 = __visc__createNodeND(0, var_21_node); + + __visc__edge(var_20, var_21, 1, 0, 0, 0); + __visc__edge(var_20, var_21, 1, 1, 1, 0); + __visc__bindIn(var_21, 72, 2, 0); + __visc__bindIn(var_21, 73, 3, 0); + + void* var_22 = __visc__createNodeND(0, var_22_node); + + __visc__edge(var_21, var_22, 1, 0, 0, 0); + __visc__edge(var_21, var_22, 1, 1, 1, 0); + __visc__bindIn(var_22, 74, 2, 0); + __visc__bindIn(var_22, 75, 3, 0); + __visc__bindIn(var_22, 76, 4, 0); + __visc__bindIn(var_22, 77, 5, 0); + __visc__bindIn(var_22, 78, 6, 0); + __visc__bindIn(var_22, 79, 7, 0); + 
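// Root-argument bookkeeping: parameter k of root() occupies two consecutive input
// slots, the pointer at slot 2k and its byte-size at slot 2k+1, and
// __visc__bindIn(node, root_slot, node_port, 0) forwards one slot to one node port.
// Slots 74-81 here are batch_normalization_8's gamma/beta/mean/variance
// (pointer, size) pairs, wired to input ports 2-9 of this batch-norm node.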
__visc__bindIn(var_22, 80, 8, 0); + __visc__bindIn(var_22, 81, 9, 0); + + void* var_23 = __visc__createNodeND(0, var_23_node); + + __visc__edge(var_22, var_23, 1, 0, 0, 0); + __visc__edge(var_22, var_23, 1, 1, 1, 0); + + void* var_24 = __visc__createNodeND(0, var_24_node); + + __visc__edge(var_23, var_24, 1, 0, 0, 0); + __visc__edge(var_23, var_24, 1, 1, 1, 0); + __visc__bindIn(var_24, 82, 2, 0); + __visc__bindIn(var_24, 83, 3, 0); + + void* var_25 = __visc__createNodeND(0, var_25_node); + + __visc__edge(var_24, var_25, 1, 0, 0, 0); + __visc__edge(var_24, var_25, 1, 1, 1, 0); + __visc__bindIn(var_25, 84, 2, 0); + __visc__bindIn(var_25, 85, 3, 0); + __visc__bindIn(var_25, 86, 4, 0); + __visc__bindIn(var_25, 87, 5, 0); + __visc__bindIn(var_25, 88, 6, 0); + __visc__bindIn(var_25, 89, 7, 0); + __visc__bindIn(var_25, 90, 8, 0); + __visc__bindIn(var_25, 91, 9, 0); + + void* var_26 = __visc__createNodeND(0, var_26_node); + + __visc__edge(var_25, var_26, 1, 0, 0, 0); + __visc__edge(var_25, var_26, 1, 1, 1, 0); + + void* var_27 = __visc__createNodeND(0, var_27_node); + + __visc__edge(var_26, var_27, 1, 0, 0, 0); + __visc__edge(var_26, var_27, 1, 1, 1, 0); + __visc__bindIn(var_27, 92, 2, 0); + __visc__bindIn(var_27, 93, 3, 0); + + void* var_28 = __visc__createNodeND(0, var_28_node); + + __visc__edge(var_27, var_28, 1, 0, 0, 0); + __visc__edge(var_27, var_28, 1, 1, 1, 0); + __visc__bindIn(var_28, 94, 2, 0); + __visc__bindIn(var_28, 95, 3, 0); + __visc__bindIn(var_28, 96, 4, 0); + __visc__bindIn(var_28, 97, 5, 0); + __visc__bindIn(var_28, 98, 6, 0); + __visc__bindIn(var_28, 99, 7, 0); + __visc__bindIn(var_28, 100, 8, 0); + __visc__bindIn(var_28, 101, 9, 0); + + void* var_29 = __visc__createNodeND(0, var_29_node); + + __visc__edge(var_28, var_29, 1, 0, 0, 0); + __visc__edge(var_28, var_29, 1, 1, 1, 0); + + void* var_30 = __visc__createNodeND(0, var_30_node); + + __visc__edge(var_29, var_30, 1, 0, 0, 0); + __visc__edge(var_29, var_30, 1, 1, 1, 0); + __visc__bindIn(var_30, 102, 2, 0); + __visc__bindIn(var_30, 103, 3, 0); + + void* var_31 = __visc__createNodeND(0, var_31_node); + + __visc__edge(var_30, var_31, 1, 0, 0, 0); + __visc__edge(var_30, var_31, 1, 1, 1, 0); + __visc__bindIn(var_31, 104, 2, 0); + __visc__bindIn(var_31, 105, 3, 0); + __visc__bindIn(var_31, 106, 4, 0); + __visc__bindIn(var_31, 107, 5, 0); + __visc__bindIn(var_31, 108, 6, 0); + __visc__bindIn(var_31, 109, 7, 0); + __visc__bindIn(var_31, 110, 8, 0); + __visc__bindIn(var_31, 111, 9, 0); + + void* var_32 = __visc__createNodeND(0, var_32_node); + + __visc__edge(var_31, var_32, 1, 0, 0, 0); + __visc__edge(var_31, var_32, 1, 1, 1, 0); + + void* var_33 = __visc__createNodeND(0, var_33_node); + + __visc__edge(var_32, var_33, 1, 0, 0, 0); + __visc__edge(var_32, var_33, 1, 1, 1, 0); + __visc__bindIn(var_33, 112, 2, 0); + __visc__bindIn(var_33, 113, 3, 0); + + void* var_34 = __visc__createNodeND(0, var_34_node); + + __visc__edge(var_33, var_34, 1, 0, 0, 0); + __visc__edge(var_33, var_34, 1, 1, 1, 0); + __visc__bindIn(var_34, 114, 2, 0); + __visc__bindIn(var_34, 115, 3, 0); + __visc__bindIn(var_34, 116, 4, 0); + __visc__bindIn(var_34, 117, 5, 0); + __visc__bindIn(var_34, 118, 6, 0); + __visc__bindIn(var_34, 119, 7, 0); + __visc__bindIn(var_34, 120, 8, 0); + __visc__bindIn(var_34, 121, 9, 0); + + void* var_35 = __visc__createNodeND(0, var_35_node); + + __visc__edge(var_34, var_35, 1, 0, 0, 0); + __visc__edge(var_34, var_35, 1, 1, 1, 0); + + void* var_36 = __visc__createNodeND(0, var_36_node); + + __visc__edge(var_35, var_36, 1, 0, 0, 0); + 
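// Every node returns a (tensor, byte-size) pair via __visc__return(2, r, (size_t) 0),
// which is why each dataflow edge is declared twice: the first __visc__edge forwards
// the tensor from output port 0 to input port 0, and the second forwards its size
// from port 1 to port 1.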
__visc__edge(var_35, var_36, 1, 1, 1, 0); + __visc__bindIn(var_36, 122, 2, 0); + __visc__bindIn(var_36, 123, 3, 0); + + void* var_37 = __visc__createNodeND(0, var_37_node); + + __visc__edge(var_36, var_37, 1, 0, 0, 0); + __visc__edge(var_36, var_37, 1, 1, 1, 0); + __visc__bindIn(var_37, 124, 2, 0); + __visc__bindIn(var_37, 125, 3, 0); + __visc__bindIn(var_37, 126, 4, 0); + __visc__bindIn(var_37, 127, 5, 0); + __visc__bindIn(var_37, 128, 6, 0); + __visc__bindIn(var_37, 129, 7, 0); + __visc__bindIn(var_37, 130, 8, 0); + __visc__bindIn(var_37, 131, 9, 0); + + void* var_38 = __visc__createNodeND(0, var_38_node); + + __visc__edge(var_37, var_38, 1, 0, 0, 0); + __visc__edge(var_37, var_38, 1, 1, 1, 0); + + void* var_39 = __visc__createNodeND(0, var_39_node); + + __visc__edge(var_38, var_39, 1, 0, 0, 0); + __visc__edge(var_38, var_39, 1, 1, 1, 0); + + void* var_40 = __visc__createNodeND(0, var_40_node); + + __visc__edge(var_39, var_40, 1, 0, 0, 0); + __visc__edge(var_39, var_40, 1, 1, 1, 0); + __visc__bindIn(var_40, 132, 2, 0); + __visc__bindIn(var_40, 133, 3, 0); + + void* var_41 = __visc__createNodeND(0, var_41_node); + + __visc__edge(var_40, var_41, 1, 0, 0, 0); + __visc__edge(var_40, var_41, 1, 1, 1, 0); + __visc__bindIn(var_41, 134, 2, 0); + __visc__bindIn(var_41, 135, 3, 0); + + void* var_42 = __visc__createNodeND(0, var_42_node); + + __visc__edge(var_41, var_42, 1, 0, 0, 0); + __visc__edge(var_41, var_42, 1, 1, 1, 0); + + __visc__bindOut(var_42, 0, 0, 0); + __visc__bindOut(var_42, 1, 1, 0); + +} + +struct ret_t { + void* tensor; + size_t bytes; +}; + +typedef struct __attribute__((__packed__)) { + void* input; + size_t input_bytes; + void* conv2d_1_w; + size_t conv2d_1_w_bytes; + void* batch_normalization_1_gamma; + size_t batch_normalization_1_gamma_bytes; + void* batch_normalization_1_beta; + size_t batch_normalization_1_beta_bytes; + void* batch_normalization_1_mean; + size_t batch_normalization_1_mean_bytes; + void* batch_normalization_1_variance; + size_t batch_normalization_1_variance_bytes; + void* depthwise_conv2d_1_w; + size_t depthwise_conv2d_1_w_bytes; + void* batch_normalization_2_gamma; + size_t batch_normalization_2_gamma_bytes; + void* batch_normalization_2_beta; + size_t batch_normalization_2_beta_bytes; + void* batch_normalization_2_mean; + size_t batch_normalization_2_mean_bytes; + void* batch_normalization_2_variance; + size_t batch_normalization_2_variance_bytes; + void* conv2d_2_w; + size_t conv2d_2_w_bytes; + void* batch_normalization_3_gamma; + size_t batch_normalization_3_gamma_bytes; + void* batch_normalization_3_beta; + size_t batch_normalization_3_beta_bytes; + void* batch_normalization_3_mean; + size_t batch_normalization_3_mean_bytes; + void* batch_normalization_3_variance; + size_t batch_normalization_3_variance_bytes; + void* depthwise_conv2d_2_w; + size_t depthwise_conv2d_2_w_bytes; + void* batch_normalization_4_gamma; + size_t batch_normalization_4_gamma_bytes; + void* batch_normalization_4_beta; + size_t batch_normalization_4_beta_bytes; + void* batch_normalization_4_mean; + size_t batch_normalization_4_mean_bytes; + void* batch_normalization_4_variance; + size_t batch_normalization_4_variance_bytes; + void* conv2d_3_w; + size_t conv2d_3_w_bytes; + void* batch_normalization_5_gamma; + size_t batch_normalization_5_gamma_bytes; + void* batch_normalization_5_beta; + size_t batch_normalization_5_beta_bytes; + void* batch_normalization_5_mean; + size_t batch_normalization_5_mean_bytes; + void* batch_normalization_5_variance; + size_t 
+typedef struct __attribute__((__packed__)) {
+  void* input;
+  size_t input_bytes;
+  void* conv2d_1_w;
+  size_t conv2d_1_w_bytes;
+  void* batch_normalization_1_gamma;
+  size_t batch_normalization_1_gamma_bytes;
+  void* batch_normalization_1_beta;
+  size_t batch_normalization_1_beta_bytes;
+  void* batch_normalization_1_mean;
+  size_t batch_normalization_1_mean_bytes;
+  void* batch_normalization_1_variance;
+  size_t batch_normalization_1_variance_bytes;
+  void* depthwise_conv2d_1_w;
+  size_t depthwise_conv2d_1_w_bytes;
+  void* batch_normalization_2_gamma;
+  size_t batch_normalization_2_gamma_bytes;
+  void* batch_normalization_2_beta;
+  size_t batch_normalization_2_beta_bytes;
+  void* batch_normalization_2_mean;
+  size_t batch_normalization_2_mean_bytes;
+  void* batch_normalization_2_variance;
+  size_t batch_normalization_2_variance_bytes;
+  void* conv2d_2_w;
+  size_t conv2d_2_w_bytes;
+  void* batch_normalization_3_gamma;
+  size_t batch_normalization_3_gamma_bytes;
+  void* batch_normalization_3_beta;
+  size_t batch_normalization_3_beta_bytes;
+  void* batch_normalization_3_mean;
+  size_t batch_normalization_3_mean_bytes;
+  void* batch_normalization_3_variance;
+  size_t batch_normalization_3_variance_bytes;
+  void* depthwise_conv2d_2_w;
+  size_t depthwise_conv2d_2_w_bytes;
+  void* batch_normalization_4_gamma;
+  size_t batch_normalization_4_gamma_bytes;
+  void* batch_normalization_4_beta;
+  size_t batch_normalization_4_beta_bytes;
+  void* batch_normalization_4_mean;
+  size_t batch_normalization_4_mean_bytes;
+  void* batch_normalization_4_variance;
+  size_t batch_normalization_4_variance_bytes;
+  void* conv2d_3_w;
+  size_t conv2d_3_w_bytes;
+  void* batch_normalization_5_gamma;
+  size_t batch_normalization_5_gamma_bytes;
+  void* batch_normalization_5_beta;
+  size_t batch_normalization_5_beta_bytes;
+  void* batch_normalization_5_mean;
+  size_t batch_normalization_5_mean_bytes;
+  void* batch_normalization_5_variance;
+  size_t batch_normalization_5_variance_bytes;
+  void* depthwise_conv2d_3_w;
+  size_t depthwise_conv2d_3_w_bytes;
+  void* batch_normalization_6_gamma;
+  size_t batch_normalization_6_gamma_bytes;
+  void* batch_normalization_6_beta;
+  size_t batch_normalization_6_beta_bytes;
+  void* batch_normalization_6_mean;
+  size_t batch_normalization_6_mean_bytes;
+  void* batch_normalization_6_variance;
+  size_t batch_normalization_6_variance_bytes;
+  void* conv2d_4_w;
+  size_t conv2d_4_w_bytes;
+  void* batch_normalization_7_gamma;
+  size_t batch_normalization_7_gamma_bytes;
+  void* batch_normalization_7_beta;
+  size_t batch_normalization_7_beta_bytes;
+  void* batch_normalization_7_mean;
+  size_t batch_normalization_7_mean_bytes;
+  void* batch_normalization_7_variance;
+  size_t batch_normalization_7_variance_bytes;
+  void* depthwise_conv2d_4_w;
+  size_t depthwise_conv2d_4_w_bytes;
+  void* batch_normalization_8_gamma;
+  size_t batch_normalization_8_gamma_bytes;
+  void* batch_normalization_8_beta;
+  size_t batch_normalization_8_beta_bytes;
+  void* batch_normalization_8_mean;
+  size_t batch_normalization_8_mean_bytes;
+  void* batch_normalization_8_variance;
+  size_t batch_normalization_8_variance_bytes;
+  void* conv2d_5_w;
+  size_t conv2d_5_w_bytes;
+  void* batch_normalization_9_gamma;
+  size_t batch_normalization_9_gamma_bytes;
+  void* batch_normalization_9_beta;
+  size_t batch_normalization_9_beta_bytes;
+  void* batch_normalization_9_mean;
+  size_t batch_normalization_9_mean_bytes;
+  void* batch_normalization_9_variance;
+  size_t batch_normalization_9_variance_bytes;
+  void* depthwise_conv2d_5_w;
+  size_t depthwise_conv2d_5_w_bytes;
+  void* batch_normalization_10_gamma;
+  size_t batch_normalization_10_gamma_bytes;
+  void* batch_normalization_10_beta;
+  size_t batch_normalization_10_beta_bytes;
+  void* batch_normalization_10_mean;
+  size_t batch_normalization_10_mean_bytes;
+  void* batch_normalization_10_variance;
+  size_t batch_normalization_10_variance_bytes;
+  void* conv2d_6_w;
+  size_t conv2d_6_w_bytes;
+  void* batch_normalization_11_gamma;
+  size_t batch_normalization_11_gamma_bytes;
+  void* batch_normalization_11_beta;
+  size_t batch_normalization_11_beta_bytes;
+  void* batch_normalization_11_mean;
+  size_t batch_normalization_11_mean_bytes;
+  void* batch_normalization_11_variance;
+  size_t batch_normalization_11_variance_bytes;
+  void* depthwise_conv2d_6_w;
+  size_t depthwise_conv2d_6_w_bytes;
+  void* batch_normalization_12_gamma;
+  size_t batch_normalization_12_gamma_bytes;
+  void* batch_normalization_12_beta;
+  size_t batch_normalization_12_beta_bytes;
+  void* batch_normalization_12_mean;
+  size_t batch_normalization_12_mean_bytes;
+  void* batch_normalization_12_variance;
+  size_t batch_normalization_12_variance_bytes;
+  void* conv2d_7_w;
+  size_t conv2d_7_w_bytes;
+  void* batch_normalization_13_gamma;
+  size_t batch_normalization_13_gamma_bytes;
+  void* batch_normalization_13_beta;
+  size_t batch_normalization_13_beta_bytes;
+  void* batch_normalization_13_mean;
+  size_t batch_normalization_13_mean_bytes;
+  void* batch_normalization_13_variance;
+  size_t batch_normalization_13_variance_bytes;
+  void* dense_1_w;
+  size_t dense_1_w_bytes;
+  void* dense_1_b;
+  size_t dense_1_b_bytes;
+
+  struct ret_t r;
+}
+RootIn;
+
+int main(){
+
+  std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/mobilenet_shallow/");
+
+  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string labels_path = dir_prefix + std::string("labels.bin");
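+  // Editor's note (assumption): the four trailing integers passed to
+  // readTrainedWeights give the tensor shape in NCHW order, e.g.
+  // 32x3x3x3 for the first convolution filter and 1xCx1x1 for the
+  // per-channel batch-normalization parameters loaded below.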
+  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
+  void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3);
+  std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin");
+  void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1);
+  std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin");
+  void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1);
+  std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin");
+  void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1);
+  std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin");
+  void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1);
+  std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin");
+  void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3);
+  std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin");
+  void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1);
+  std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin");
+  void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1);
+  std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin");
+  void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1);
+  std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin");
+  void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1);
+  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
+  void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1);
+  std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin");
+  void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1);
+  std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin");
+  void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1);
+  std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin");
+  void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1);
+  std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin");
+  void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1);
+  std::string depthwise_conv2d_2_w_path = dir_prefix + std::string("depthwise_conv2d_2_w.bin");
+  void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3);
+  std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin");
+  void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1);
+  std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin");
+  void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1);
+  std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin");
+  void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1);
+  std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin");
+  void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1);
+  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
+  void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1);
+  std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin");
+  void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin");
+  void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin");
+  void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin");
+  void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1);
+  std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin");
+  void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3);
+  std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin");
+  void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin");
+  void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin");
+  void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin");
+  void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1);
+  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
+  void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1);
+  std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin");
+  void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1);
+  std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin");
+  void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1);
std::string("batch_normalization_7_mean.bin"); + void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); + void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); + std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void* depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); + std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); + void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); + void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); + std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); + void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); + std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); + void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); + void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); + void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); + std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); + std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); + void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); + void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); + std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); + void* 
+  void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1);
+  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
+  void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1);
+  std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin");
+  void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1);
+  std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin");
+  void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1);
+  std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin");
+  void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1);
+  std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin");
+  void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1);
+  std::string depthwise_conv2d_6_w_path = dir_prefix + std::string("depthwise_conv2d_6_w.bin");
+  void* depthwise_conv2d_6_w = readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3);
+  std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin");
+  void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1);
+  std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin");
+  void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1);
+  std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin");
+  void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1);
+  std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin");
+  void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1);
+  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
+  void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1);
+  std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin");
+  void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1);
+  std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin");
+  void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1);
+  std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin");
+  void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1);
+  std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin");
+  void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1);
+  std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin");
+  void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10);
std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + void* input = readTrainedWeights(input_path.c_str(), 0, 5000,3,32,32); + uint8_t* labels = readLabels(labels_path.c_str(), 5000); + + __visc__init(); + RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); + + args->input = input; + args->input_bytes = 0; + args->conv2d_1_w = conv2d_1_w; + args->conv2d_1_w_bytes = 0; + args->batch_normalization_1_gamma = batch_normalization_1_gamma; + args->batch_normalization_1_gamma_bytes = 0; + args->batch_normalization_1_beta = batch_normalization_1_beta; + args->batch_normalization_1_beta_bytes = 0; + args->batch_normalization_1_mean = batch_normalization_1_mean; + args->batch_normalization_1_mean_bytes = 0; + args->batch_normalization_1_variance = batch_normalization_1_variance; + args->batch_normalization_1_variance_bytes = 0; + args->depthwise_conv2d_1_w = depthwise_conv2d_1_w; + args->depthwise_conv2d_1_w_bytes = 0; + args->batch_normalization_2_gamma = batch_normalization_2_gamma; + args->batch_normalization_2_gamma_bytes = 0; + args->batch_normalization_2_beta = batch_normalization_2_beta; + args->batch_normalization_2_beta_bytes = 0; + args->batch_normalization_2_mean = batch_normalization_2_mean; + args->batch_normalization_2_mean_bytes = 0; + args->batch_normalization_2_variance = batch_normalization_2_variance; + args->batch_normalization_2_variance_bytes = 0; + args->conv2d_2_w = conv2d_2_w; + args->conv2d_2_w_bytes = 0; + args->batch_normalization_3_gamma = batch_normalization_3_gamma; + args->batch_normalization_3_gamma_bytes = 0; + args->batch_normalization_3_beta = batch_normalization_3_beta; + args->batch_normalization_3_beta_bytes = 0; + args->batch_normalization_3_mean = batch_normalization_3_mean; + args->batch_normalization_3_mean_bytes = 0; + args->batch_normalization_3_variance = batch_normalization_3_variance; + args->batch_normalization_3_variance_bytes = 0; + args->depthwise_conv2d_2_w = depthwise_conv2d_2_w; + args->depthwise_conv2d_2_w_bytes = 0; + args->batch_normalization_4_gamma = batch_normalization_4_gamma; + args->batch_normalization_4_gamma_bytes = 0; + args->batch_normalization_4_beta = batch_normalization_4_beta; + args->batch_normalization_4_beta_bytes = 0; + args->batch_normalization_4_mean = batch_normalization_4_mean; + args->batch_normalization_4_mean_bytes = 0; + args->batch_normalization_4_variance = batch_normalization_4_variance; + args->batch_normalization_4_variance_bytes = 0; + args->conv2d_3_w = conv2d_3_w; + args->conv2d_3_w_bytes = 0; + args->batch_normalization_5_gamma = batch_normalization_5_gamma; + args->batch_normalization_5_gamma_bytes = 0; + args->batch_normalization_5_beta = batch_normalization_5_beta; + args->batch_normalization_5_beta_bytes = 0; + args->batch_normalization_5_mean = batch_normalization_5_mean; + args->batch_normalization_5_mean_bytes = 0; + args->batch_normalization_5_variance = batch_normalization_5_variance; + args->batch_normalization_5_variance_bytes = 0; + args->depthwise_conv2d_3_w = depthwise_conv2d_3_w; + args->depthwise_conv2d_3_w_bytes = 0; + args->batch_normalization_6_gamma = batch_normalization_6_gamma; + args->batch_normalization_6_gamma_bytes = 0; + args->batch_normalization_6_beta = batch_normalization_6_beta; + args->batch_normalization_6_beta_bytes = 0; + args->batch_normalization_6_mean = batch_normalization_6_mean; + args->batch_normalization_6_mean_bytes = 0; + args->batch_normalization_6_variance = batch_normalization_6_variance; + 
+  args->input = input;
+  args->input_bytes = 0;
+  args->conv2d_1_w = conv2d_1_w;
+  args->conv2d_1_w_bytes = 0;
+  args->batch_normalization_1_gamma = batch_normalization_1_gamma;
+  args->batch_normalization_1_gamma_bytes = 0;
+  args->batch_normalization_1_beta = batch_normalization_1_beta;
+  args->batch_normalization_1_beta_bytes = 0;
+  args->batch_normalization_1_mean = batch_normalization_1_mean;
+  args->batch_normalization_1_mean_bytes = 0;
+  args->batch_normalization_1_variance = batch_normalization_1_variance;
+  args->batch_normalization_1_variance_bytes = 0;
+  args->depthwise_conv2d_1_w = depthwise_conv2d_1_w;
+  args->depthwise_conv2d_1_w_bytes = 0;
+  args->batch_normalization_2_gamma = batch_normalization_2_gamma;
+  args->batch_normalization_2_gamma_bytes = 0;
+  args->batch_normalization_2_beta = batch_normalization_2_beta;
+  args->batch_normalization_2_beta_bytes = 0;
+  args->batch_normalization_2_mean = batch_normalization_2_mean;
+  args->batch_normalization_2_mean_bytes = 0;
+  args->batch_normalization_2_variance = batch_normalization_2_variance;
+  args->batch_normalization_2_variance_bytes = 0;
+  args->conv2d_2_w = conv2d_2_w;
+  args->conv2d_2_w_bytes = 0;
+  args->batch_normalization_3_gamma = batch_normalization_3_gamma;
+  args->batch_normalization_3_gamma_bytes = 0;
+  args->batch_normalization_3_beta = batch_normalization_3_beta;
+  args->batch_normalization_3_beta_bytes = 0;
+  args->batch_normalization_3_mean = batch_normalization_3_mean;
+  args->batch_normalization_3_mean_bytes = 0;
+  args->batch_normalization_3_variance = batch_normalization_3_variance;
+  args->batch_normalization_3_variance_bytes = 0;
+  args->depthwise_conv2d_2_w = depthwise_conv2d_2_w;
+  args->depthwise_conv2d_2_w_bytes = 0;
+  args->batch_normalization_4_gamma = batch_normalization_4_gamma;
+  args->batch_normalization_4_gamma_bytes = 0;
+  args->batch_normalization_4_beta = batch_normalization_4_beta;
+  args->batch_normalization_4_beta_bytes = 0;
+  args->batch_normalization_4_mean = batch_normalization_4_mean;
+  args->batch_normalization_4_mean_bytes = 0;
+  args->batch_normalization_4_variance = batch_normalization_4_variance;
+  args->batch_normalization_4_variance_bytes = 0;
+  args->conv2d_3_w = conv2d_3_w;
+  args->conv2d_3_w_bytes = 0;
+  args->batch_normalization_5_gamma = batch_normalization_5_gamma;
+  args->batch_normalization_5_gamma_bytes = 0;
+  args->batch_normalization_5_beta = batch_normalization_5_beta;
+  args->batch_normalization_5_beta_bytes = 0;
+  args->batch_normalization_5_mean = batch_normalization_5_mean;
+  args->batch_normalization_5_mean_bytes = 0;
+  args->batch_normalization_5_variance = batch_normalization_5_variance;
+  args->batch_normalization_5_variance_bytes = 0;
+  args->depthwise_conv2d_3_w = depthwise_conv2d_3_w;
+  args->depthwise_conv2d_3_w_bytes = 0;
+  args->batch_normalization_6_gamma = batch_normalization_6_gamma;
+  args->batch_normalization_6_gamma_bytes = 0;
+  args->batch_normalization_6_beta = batch_normalization_6_beta;
+  args->batch_normalization_6_beta_bytes = 0;
+  args->batch_normalization_6_mean = batch_normalization_6_mean;
+  args->batch_normalization_6_mean_bytes = 0;
+  args->batch_normalization_6_variance = batch_normalization_6_variance;
+  args->batch_normalization_6_variance_bytes = 0;
+  args->conv2d_4_w = conv2d_4_w;
+  args->conv2d_4_w_bytes = 0;
+  args->batch_normalization_7_gamma = batch_normalization_7_gamma;
+  args->batch_normalization_7_gamma_bytes = 0;
+  args->batch_normalization_7_beta = batch_normalization_7_beta;
+  args->batch_normalization_7_beta_bytes = 0;
+  args->batch_normalization_7_mean = batch_normalization_7_mean;
+  args->batch_normalization_7_mean_bytes = 0;
+  args->batch_normalization_7_variance = batch_normalization_7_variance;
+  args->batch_normalization_7_variance_bytes = 0;
+  args->depthwise_conv2d_4_w = depthwise_conv2d_4_w;
+  args->depthwise_conv2d_4_w_bytes = 0;
+  args->batch_normalization_8_gamma = batch_normalization_8_gamma;
+  args->batch_normalization_8_gamma_bytes = 0;
+  args->batch_normalization_8_beta = batch_normalization_8_beta;
+  args->batch_normalization_8_beta_bytes = 0;
+  args->batch_normalization_8_mean = batch_normalization_8_mean;
+  args->batch_normalization_8_mean_bytes = 0;
+  args->batch_normalization_8_variance = batch_normalization_8_variance;
+  args->batch_normalization_8_variance_bytes = 0;
+  args->conv2d_5_w = conv2d_5_w;
+  args->conv2d_5_w_bytes = 0;
+  args->batch_normalization_9_gamma = batch_normalization_9_gamma;
+  args->batch_normalization_9_gamma_bytes = 0;
+  args->batch_normalization_9_beta = batch_normalization_9_beta;
+  args->batch_normalization_9_beta_bytes = 0;
+  args->batch_normalization_9_mean = batch_normalization_9_mean;
+  args->batch_normalization_9_mean_bytes = 0;
+  args->batch_normalization_9_variance = batch_normalization_9_variance;
+  args->batch_normalization_9_variance_bytes = 0;
+  args->depthwise_conv2d_5_w = depthwise_conv2d_5_w;
+  args->depthwise_conv2d_5_w_bytes = 0;
+  args->batch_normalization_10_gamma = batch_normalization_10_gamma;
+  args->batch_normalization_10_gamma_bytes = 0;
+  args->batch_normalization_10_beta = batch_normalization_10_beta;
+  args->batch_normalization_10_beta_bytes = 0;
+  args->batch_normalization_10_mean = batch_normalization_10_mean;
+  args->batch_normalization_10_mean_bytes = 0;
+  args->batch_normalization_10_variance = batch_normalization_10_variance;
+  args->batch_normalization_10_variance_bytes = 0;
+  args->conv2d_6_w = conv2d_6_w;
+  args->conv2d_6_w_bytes = 0;
+  args->batch_normalization_11_gamma = batch_normalization_11_gamma;
+  args->batch_normalization_11_gamma_bytes = 0;
+  args->batch_normalization_11_beta = batch_normalization_11_beta;
+  args->batch_normalization_11_beta_bytes = 0;
+  args->batch_normalization_11_mean = batch_normalization_11_mean;
+  args->batch_normalization_11_mean_bytes = 0;
+  args->batch_normalization_11_variance = batch_normalization_11_variance;
+  args->batch_normalization_11_variance_bytes = 0;
+  args->depthwise_conv2d_6_w = depthwise_conv2d_6_w;
+  args->depthwise_conv2d_6_w_bytes = 0;
+  args->batch_normalization_12_gamma = batch_normalization_12_gamma;
+  args->batch_normalization_12_gamma_bytes = 0;
+  args->batch_normalization_12_beta = batch_normalization_12_beta;
+  args->batch_normalization_12_beta_bytes = 0;
+  args->batch_normalization_12_mean = batch_normalization_12_mean;
+  args->batch_normalization_12_mean_bytes = 0;
+  args->batch_normalization_12_variance = batch_normalization_12_variance;
+  args->batch_normalization_12_variance_bytes = 0;
+  args->conv2d_7_w = conv2d_7_w;
+  args->conv2d_7_w_bytes = 0;
+  args->batch_normalization_13_gamma = batch_normalization_13_gamma;
+  args->batch_normalization_13_gamma_bytes = 0;
+  args->batch_normalization_13_beta = batch_normalization_13_beta;
+  args->batch_normalization_13_beta_bytes = 0;
+  args->batch_normalization_13_mean = batch_normalization_13_mean;
+  args->batch_normalization_13_mean_bytes = 0;
+  args->batch_normalization_13_variance = batch_normalization_13_variance;
+  args->batch_normalization_13_variance_bytes = 0;
+  args->dense_1_w = dense_1_w;
+  args->dense_1_w_bytes = 0;
+  args->dense_1_b = dense_1_b;
+  args->dense_1_b_bytes = 0;
+
+  void* dfg = __visc__launch(0, root, (void*) args);
+
+  __visc__wait(dfg);
+
+  void *result = static_cast<RootIn*>(args)->input;
+  hpvm_request_tensor(result, 0);
+
+  __visc__cleanup();
+  computeAccuracy2(labels, 5000, result);
+  return 0;
+
+}
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/quant_ranges_rt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a7b14d7348f424556ba5e7bb52b6fdf9bbbd89c
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/quant_ranges_rt.txt
@@ -0,0 +1,22 @@
+1 -0.5500815 0.60786617 -1.0248864 1.2929907 -0.36291853 0.2533059 0.0 0.753551840782
+2 0.0 0.753551840782 -0.69884616 0.71849966 -0.2781147 0.45571187 0.0 1.01057458043
+3 0.0 1.01057458043 -0.59568167 0.7714691 -0.8602873 0.19743633 -1.84771883726 1.87930787086
+4 0.0 2.33981014252 -0.41976976 0.43748936 -0.7021962 0.3033103 0.0 1.04317724705
+5 0.0 1.04317724705 -0.46757826 0.4635873 -0.20662616 0.1778044 -0.829483509064 0.786805033684
+6 0.0 2.49733686686 -0.64404047 0.45383143 -0.819547 0.38550296 0.0 0.897360802293
+7 0.0 0.897360802293 -0.41986948 0.33654243 -0.3563013 0.22371122 -0.957150224447 0.54919362247
+8 0.0 2.37362146616 -0.4805263 0.50655717 -0.296758 0.7742441 0.0 3.01592136621
+9 0.0 3.01592136621 -0.52083415 0.45517674 -0.20242067 0.8236838 -5.2759475708 5.79733039856
+10 0.0 2.37362146616 -0.5338656 1.3395424 -0.20242067 0.8236838 -0.738995380998 2.33600783587
+11 0.0 7.07933432579 -0.34429058 0.43629733 -1.0744808 0.056708273 0.0 1.58645607233
+12 0.0 1.58645607233 -0.30342352 0.39493486 -0.44630566 0.6492069 -1.49672914267 1.29970229745
+13 0.0 7.11914063454 -0.38351893 0.45775774 -1.4733055 -0.014426912 0.0 1.52876508832
+14 0.0 1.52876508832 -0.25695276 0.45372736 -0.5259744 0.26591402 -1.59576894164 1.08074297309
+15 0.0 6.94405080318 -0.55299705 0.5443531 -0.71790683 1.2730768 0.0 10.3651468277
+16 0.0 10.3651468277 -0.4203967 0.48641303 -0.90653443 1.3546854 -22.372925148 17.2033731079
+17 0.0 6.94405080318 -0.4365755 0.84913826 -0.90653443 1.3546851 -3.66810325861 4.87814051151
+18 0.0 18.8401451111 -0.38657624 0.5228989 -1.2083547 0.76361173 0.0 19.1229192352
+19 0.0 19.1229192352 -0.40857902 0.575035 -1.8731614 1.0960501 -31.3229312897 14.8234729958
+20 0.0 23.7382488823 -0.33079496 0.5893278 -1.0234511 1.0016295 0.0 19.5892774963
+21 0.0 19.5892774963 -0.27897888 0.38280907 -2.2086356 1.0066502 -34.4416886902 20.9890329933
+22 0.0 10.8541981602 -1.5092047 1.0279838 -0.49379802 0.61032647 -40.9121678543 25.7082381058
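Editor's note: each row of these quant_ranges_rt.txt files appears to carry a layer index followed by four (min, max) pairs, covering the layer's input, weights, bias, and output ranges; this reading is inferred from the value pattern rather than from a documented format. A minimal, hypothetical C++ reader under that assumption (all names here are illustrative, not part of the runtime):

#include <fstream>
#include <sstream>
#include <string>
#include <vector>

// Assumed row layout: layer index, then (min, max) pairs for the layer's
// input, weights, bias, and output -- an inference, not a documented format.
struct QuantRange {
  int layer;
  float in_min, in_max, w_min, w_max, b_min, b_max, out_min, out_max;
};

// Parse quant_ranges_rt.txt into one QuantRange per layer (sketch only).
std::vector<QuantRange> readQuantRanges(const std::string& path) {
  std::vector<QuantRange> ranges;
  std::ifstream in(path);
  std::string line;
  while (std::getline(in, line)) {
    std::istringstream row(line);
    QuantRange q;
    if (row >> q.layer >> q.in_min >> q.in_max >> q.w_min >> q.w_max
            >> q.b_min >> q.b_max >> q.out_min >> q.out_max)
      ranges.push_back(q);
  }
  return ranges;
}

Rows that fail to parse (for example, stray blank lines) are skipped rather than treated as errors, which matches how loosely formatted these data files are.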
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6307de9ab85096d6934a2772507d802859b5ceb9
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/tuner_confs_base.txt
@@ -0,0 +1,90 @@
++++++
+conf1 1 0 89.59 0
+1 gpu conv fp32 1 add fp32 1 relu fp32 1
+2 gpu conv fp32 1 add fp32 1 relu fp32 1
+3 gpu conv fp32 1 add fp32 1
+4 gpu add fp32 1
+5 gpu relu fp32 1
+6 gpu conv fp32 1 add fp32 1 relu fp32 1
+7 gpu conv fp32 1 add fp32 1
+8 gpu add fp32 1
+9 gpu relu fp32 1
+10 gpu conv fp32 1 add fp32 1 relu fp32 1
+11 gpu conv fp32 1 add fp32 1
+12 gpu add fp32 1
+13 gpu relu fp32 1
+14 gpu conv fp32 1 add fp32 1 relu fp32 1
+15 gpu conv fp32 1 add fp32 1
+16 gpu conv fp32 1 add fp32 1
+17 gpu add fp32 1
+18 gpu relu fp32 1
+19 gpu conv fp32 1 add fp32 1 relu fp32 1
+20 gpu conv fp32 1 add fp32 1
+21 gpu add fp32 1
+22 gpu relu fp32 1
+23 gpu conv fp32 1 add fp32 1 relu fp32 1
+24 gpu conv fp32 1 add fp32 1
+25 gpu add fp32 1
+26 gpu relu fp32 1
+27 gpu conv fp32 1 add fp32 1 relu fp32 1
+28 gpu conv fp32 1 add fp32 1
+29 gpu conv fp32 1 add fp32 1
+30 gpu add fp32 1
+31 gpu relu fp32 1
+32 gpu conv fp32 1 add fp32 1 relu fp32 1
+33 gpu conv fp32 1 add fp32 1
+34 gpu add fp32 1
+35 gpu relu fp32 1
+36 gpu conv fp32 1 add fp32 1 relu fp32 1
+37 gpu conv fp32 1 add fp32 1
+38 gpu add fp32 1
+39 gpu relu fp32 1
+40 gpu pool_mean fp32 1
+41 gpu mul fp32 1 add fp32 1
+42 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 89.59 0
+1 gpu conv fp16 1 add fp16 1 relu fp16 1
+2 gpu conv fp16 1 add fp16 1 relu fp16 1
+3 gpu conv fp16 1 add fp16 1
+4 gpu add fp16 1
+5 gpu relu fp16 1
+6 gpu conv fp16 1 add fp16 1 relu fp16 1
+7 gpu conv fp16 1 add fp16 1
+8 gpu add fp16 1
+9 gpu relu fp16 1
+10 gpu conv fp16 1 add fp16 1 relu fp16 1
+11 gpu conv fp16 1 add fp16 1
+12 gpu add fp16 1
+13 gpu relu fp16 1
+14 gpu conv fp16 1 add fp16 1 relu fp16 1
+15 gpu conv fp16 1 add fp16 1
+16 gpu conv fp16 1 add fp16 1
+17 gpu add fp16 1
+18 gpu relu fp16 1
+19 gpu conv fp16 1 add fp16 1 relu fp16 1
+20 gpu conv fp16 1 add fp16 1
+21 gpu add fp16 1
+22 gpu relu fp16 1
+23 gpu conv fp16 1 add fp16 1 relu fp16 1
+24 gpu conv fp16 1 add fp16 1
+25 gpu add fp16 1
+26 gpu relu fp16 1
+27 gpu conv fp16 1 add fp16 1 relu fp16 1
+28 gpu conv fp16 1 add fp16 1
+29 gpu conv fp16 1 add fp16 1
+30 gpu add fp16 1
+31 gpu relu fp16 1
+32 gpu conv fp16 1 add fp16 1 relu fp16 1
+33 gpu conv fp16 1 add fp16 1
+34 gpu add fp16 1
+35 gpu relu fp16 1
+36 gpu conv fp16 1 add fp16 1 relu fp16 1
+37 gpu conv fp16 1 add fp16 1
+38 gpu add fp16 1
+39 gpu relu fp16 1
+40 gpu pool_mean fp16 1
+41 gpu mul fp16 1 add fp16 1
+42 gpu softmax fp32 1
+-----
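Editor's note: in these tuner configuration files, a `+++++` line opens a configuration block and `-----` closes it. The header line appears to list the configuration name followed by speedup, energy, accuracy, and accuracy-loss figures (so conf2 above targets a 1.5x speedup over the fp32 baseline at the same 89.59 accuracy), and each numbered row names one layer's operations together with the precision selected for each; this reading of the header fields is an assumption, not a documented format.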
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/quant_ranges_rt.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/quant_ranges_rt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19f5523523f3b9fc7b8f81c69112630003d5597e
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/quant_ranges_rt.txt
@@ -0,0 +1,15 @@
+1 -1.8816367 2.0934217 -0.53275156 0.49437004 -0.6403629 0.2490165 0.0 1.35908746719
+2 0.0 1.35908746719 -0.2688396 0.20639156 -0.7745511 0.82006615 0.0 2.52123117924
+3 0.0 2.52123117924 -0.16776876 0.14878987 -0.35283303 0.5154362 0.0 1.20119857848
+4 0.0 1.20119857848 -0.088948585 0.114222586 -0.30250227 0.36856708 0.0 1.03598809302
+5 0.0 1.03598809302 -0.07739562 0.10973293 -0.15568458 0.17634983 0.0 0.300495595038
+6 0.0 0.300495595038 -0.051649556 0.05435231 -0.07395447 0.07996062 0.0 0.11490475405
+7 0.0 0.11490475405 -0.043513633 0.07577866 -0.06921874 0.02660573 0.0 0.16232508488
+8 0.0 0.16232508488 -0.033842053 0.045218028 -0.022827804 0.023845317 0.0 0.124249965735
+9 0.0 0.124249965735 -0.02211613 0.032084666 -0.02699063 0.03773564 0.0 0.174634486511
+10 0.0 0.174634486511 -0.01979376 0.034854397 -0.036107242 0.07056531 0.0 0.575175762177
+11 0.0 0.575175762177 -0.03452098 0.046055835 -0.051925894 0.07039055 0.0 0.771875114441
+12 0.0 0.771875114441 -0.025946895 0.040090334 -0.06049362 0.12658806 0.0 1.17285169065
+13 0.0 1.17285169065 -0.021766115 0.03315237 -0.20705001 0.117947325 0.0 2.00157693863
+14 0.0 2.00157693863 -0.042597745 0.046707444 -0.21937433 0.2545502 0.0 2.00236111879
+15 0.0 2.00236111879 -0.32550547 0.30829763 -1.1787822 1.2378151 -18.2514705467 24.1736344528
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/tuner_confs_base.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/tuner_confs_base.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c9a6612a5df150f58c69e1a7faeaf83ed5c7d605
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/tuner_confs_base.txt
@@ -0,0 +1,38 @@
++++++
+conf1 1 0 90.19 0
+1 gpu conv fp32 1 add fp32 1 relu fp32 1
+2 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
+3 gpu conv fp32 1 add fp32 1 relu fp32 1
+4 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
+5 gpu conv fp32 1 add fp32 1 relu fp32 1
+6 gpu conv fp32 1 add fp32 1 relu fp32 1
+7 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
+8 gpu conv fp32 1 add fp32 1 relu fp32 1
+9 gpu conv fp32 1 add fp32 1 relu fp32 1
+10 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
+11 gpu conv fp32 1 add fp32 1 relu fp32 1
+12 gpu conv fp32 1 add fp32 1 relu fp32 1
+13 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
+14 gpu mul fp32 1 add fp32 1 relu fp32 1
+15 gpu mul fp32 1 add fp32 1
+16 gpu softmax fp32 1
+-----
++++++
+conf2 1.5 0 90.19 0
+1 gpu conv fp16 1 add fp16 1 relu fp16 1
+2 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv fp16 1 add fp16 1 relu fp16 1
+4 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 1 add fp16 1 relu fp16 1
+6 gpu conv fp16 1 add fp16 1 relu fp16 1
+7 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv fp16 1 add fp16 1 relu fp16 1
+9 gpu conv fp16 1 add fp16 1 relu fp16 1
+10 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv fp16 1 add fp16 1 relu fp16 1
+12 gpu conv fp16 1 add fp16 1 relu fp16 1
+13 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 1 add fp16 1 relu fp16 1
+15 gpu mul fp16 1 add fp16 1
+16 gpu softmax fp32 1
+-----
diff --git a/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h b/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
index 095b3d6f89a670b16467db4d0e695adb2fe207d6..55f16e4d8d176e1709e2c6525c3cd47b2bb8da1c 100644
--- a/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
+++ b/llvm/test/VISC/DNN_Benchmarks/common/include/visc.h
@@ -96,6 +96,7 @@ void* __visc__tensor_add(void*, void*);
 void* __visc__tensor_mul(void*, void*);
 void* __visc__tensor_convolution(void*, void*, int, int, int, int);
 void* __visc__tensor_group_convolution(void*, void*, int, int, int, int, int, int);
+void* __visc__tensor_batchnorm(void*, void*, void*, void*, void*, double);
 void* __visc__tensor_pool_max(void*, int, int, int, int, int, int);
 void* __visc__tensor_pool_mean(void*, int, int, int, int, int, int);
 void* __visc__tensor_relu(void*);