diff --git a/llvm/projects/hpvm-tensor-rt/README.md b/llvm/projects/hpvm-tensor-rt/README.md index c243c6dd1662083021a642b9a088fa55f5d1ed3d..5b3e5f99d39cf5c697051fa2580eb74f207bb031 100644 --- a/llvm/projects/hpvm-tensor-rt/README.md +++ b/llvm/projects/hpvm-tensor-rt/README.md @@ -6,10 +6,25 @@ * CUDA-9.0 or above * CUBLAS-9.0 or above - often included with cuda-toolkit +## Dependent Library Builds + +```shell +cd ../gpu_profiler +mkdir lib +cmake ../ +make + +cd ../soc_simulator +mkdir lib +cmake ../ +make +``` + + ## BUILD ```shell -source bin/setup_runtime_paths.sh +source bin/setup_cuda_llvm_paths.sh mkdir build cd build cmake ../ diff --git a/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh b/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh new file mode 100644 index 0000000000000000000000000000000000000000..3548f182f198724600aee855b66169a1bdf12a3a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/bin/setup_cuda_llvm_paths.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# CUDNN Path setup +module load cuda-toolkit/9.1 +export CUDA_INCLUDE_PATH=/software/cuda-9.1/include +export CUDNN_PATH=/software/cuda-9.1/lib64/ +export LIBRARY_PATH=/software/cuda-9.1/lib64/:$LIBRARY_PATH +export LD_LIBRARY_PATH=/software/cuda-9.1/lib64/:$LD_LIBRARY_PATH + +# HPVM Path setup +export CPATH=$CPATH:/home/hsharif3/anaconda2/include/ +export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH +export LLVM_BUILD_ROOT=/home/hsharif3/Gitlab/hpvm/build/ +export LLVM_SRC_ROOT=/home/hsharif3/Gitlab/hpvm/llvm/ diff --git a/llvm/projects/hpvm-tensor-rt/bin/setup_jetson.sh b/llvm/projects/hpvm-tensor-rt/bin/setup_jetson.sh index 2ad4d5bc765e23841b87ebdcb778295456c2b380..b288ccfe43c577f9ad14c4eb16284539ae5682ea 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/setup_jetson.sh +++ b/llvm/projects/hpvm-tensor-rt/bin/setup_jetson.sh @@ -1,3 +1,8 @@ export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda-9.0/targets/aarch64-linux/lib/ export 
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-9.0/targets/aarch64-linux/lib/ +export CUDNN_PATH=/usr/local/cuda-9.0/ +export CUDA_INCLUDE_PATH=${CUDNN_PATH}/include + +export LLVM_BUILD_ROOT=/home/nvidia/Gitlab/hpvm/build/ +export LLVM_SRC_ROOT=/home/nvidia/Gitlab/hpvm/llvm/ diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py index 2c57eaf5be7c09a05859221535a7aff709330fcf..e3b94082f5be7b83a1598625afd5ef05a0472506 100644 --- a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py +++ b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py @@ -65,7 +65,7 @@ class TableGenerator: 3. Writes the internal table to <network_name>_tensors.txt file and uses the <network_name>_ops.txt file as a guideline in terms of row order ''' - #self.__run_inputted_binaries() + self.__run_inputted_binaries() self.__build_internal_table() self.__output_table_to_file() diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/measure_confidence2.py b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/measure_confidence2.py index b38efa9c82a1da4440fe4653b72b1beb89032a5f..9ff74128f4e3a21545c9b7658638d4e44b758cbd 100644 --- a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/measure_confidence2.py +++ b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/measure_confidence2.py @@ -83,7 +83,23 @@ def getConfidence(accuracy_outfile, acc_threshold): return conf, avg_acc - + + +def getMinAccuracy(accuracy_outfile): + + f = open(accuracy_outfile, "r") + + total_acc = 0.0 + failed = 0 + it = 0 + + acc_list = [] + for x in f: + acc = float(x.strip()) + acc_list.append(acc) + + return min(acc_list) + # NOTE: invokes the binary with the number of runs def do_multiple_runs2(binary_name, accuracy_threshold, confidence_threshold): diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/pareto_curve.py b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/pareto_curve.py index 
0fda8f742cc0ef75e4b84232f397872b04554dd6..db8233994b855317095c94331fba869d9ad79d16 100644 --- a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/pareto_curve.py +++ b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/pareto_curve.py @@ -5,6 +5,9 @@ import shutil from measure_confidence2 import getConfigCost +AL_THRESHOLD = 0.1 + + class Config: def __init__(self): self.avg_accuracy = 0 @@ -69,10 +72,6 @@ def loadConfigData(result_dir, layer_costs, baseline_accuracy): -AL_THRESHOLD = 0.1 -SPEEDUP_BAND_SIZE = 0.3 -ENERGY_BAND_SIZE = 10 - class Configuration: def __init__(self, name, speedup, energy, accuracy, accuracy_loss): @@ -223,11 +222,17 @@ def findParetoConfigs(base_dir, layer_costs, accuracy): config = Configuration(config.fname , config.speedup, 100, config.avg_accuracy, config.avg_loss) config_list.append(config) + + SPEEDUP_BAND_SIZE = 1.0 + ENERGY_BAND_SIZE = 10 - if len(config_list) < 30: - SPEEDUP_BAND_SIZE = 1.2 - + # No Pareto Selection if list is < 50 configurations + if len(config_list) < 50: + SPEEDUP_BAND_SIZE = 100 # Include all in Pareto Frontier + + print ("*SPEEDUP_BAND_SIZE = ", SPEEDUP_BAND_SIZE) + ASC, AEC = compute_pareto_points_with_margin(config_list, SPEEDUP_BAND_SIZE, ENERGY_BAND_SIZE) diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/promise_tuner3.py b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/promise_tuner3.py index 87ed35bbc4bcac6288c30454ba1d650956dd9118..04ce0d6158819d5cb014411456e1a985fb17b354 100644 --- a/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/promise_tuner3.py +++ b/llvm/projects/hpvm-tensor-rt/opentuner/autotuner/promise_tuner3.py @@ -22,6 +22,7 @@ import threading import psutil from measure_confidence2 import dump_promise_confidence_files3 +from measure_confidence2 import getConfidence, getMinAccuracy from select_top_results import select_top_results from time import sleep from pareto_curve import findParetoConfigs @@ -169,25 +170,36 @@ class ClangFlagsTuner(MeasurementInterface): 
createFlagsFile("promise_flags", cfg) run_cmd = binary_name - print "binary_name = ", run_cmd + print "\nbinary_name = ", run_cmd #run_result_call_program = self.call_program(run_cmd) - #print "returned \n\n" + + total_runs = 2 FNULL = open(os.devnull, 'wb') - p = subprocess.Popen(run_cmd, stdout = FNULL) + #p = subprocess.Popen(run_cmd, stdout = FNULL) + p = subprocess.Popen([run_cmd, str(total_runs)], stdout = FNULL) p.wait() accuracy = getAccuracy("final_accuracy") + + # Get Confidence for multiple runs + conf, avg_acc = getConfidence("run_accuracies.txt", accuracy_threshold) + # getConfigCost returns the cost associated with the selected configuration total_comps = getConfigCost(cfg) Result = opentuner.resultsdb.models.Result() Result.time = total_comps - Result.accuracy = accuracy - - if accuracy > accuracy_threshold: + #Result.accuracy = accuracy + min_accuracy = getMinAccuracy("run_accuracies.txt") + print ("min_accuracy = ", min_accuracy) + Result.accuracy = min_accuracy + + # Only pass conf if conf == 100 + if min_accuracy > accuracy_threshold and conf == 100: + print ("conf = ", conf, " avg_acc = ", avg_acc) #if accuracy not in evaluated_configs: config_tuple = (total_comps, accuracy, cfg) self.configs_list.append(config_tuple) @@ -199,8 +211,6 @@ class ClangFlagsTuner(MeasurementInterface): f_acc.close() - print "done with one run" - test_id += 1 return Result diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc index 1b4dd03b25352290198178fba7bd35590d5fe0cc..e7784cb18e7e7c766f0ae27e6588d4851a2f2812 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc @@ -7,6 +7,9 @@ void dummyFunction(){ void* initRT = (void*) &llvm_hpvm_initTensorRt; void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt; + void* initApproxRT = (void*) 
from collections import defaultdict
import os
import subprocess
import sys


class Driver:
    """Estimate total time and energy of layer configurations.

    The driver reads three input files:

    * a layer file (comma separated, one layer per line) describing the
      dimensions of each ``Conv*`` / ``FC*`` / ``NML*`` layer,
    * a tensor table (space separated) holding per-operation time/energy
      columns for each layer,
    * a configuration file listing, per configuration, where and how each
      layer runs (``promise`` or ``gpu`` with per-op approximation).

    For every configuration it sums time and energy over all layers and
    writes a per-attribute table (total and improvement over configuration
    0) to the results file.
    """

    fp16_swing = 8

    class ApproxTypes:
        FP16 = 0
        FP32 = 1
        PROMISE = 2
        PERF = 3

    results_time_key = "Time"
    results_energy_key = "Energy"

    def driver(self):
        """Run the whole pipeline: parse inputs, simulate, write results."""
        self.__parse_tensor_layer_file()
        self.__parse_tensor_table()
        self.__run_simulations()
        self.__display_results()

    def __init__(self, layer_filename, table_filename, config_filename,
                 results_filename):
        """Store the input/output paths and create empty result containers.

        No file is touched here; all I/O happens in ``driver()``.
        """
        self.__layer_filename = layer_filename
        self.__table_filename = table_filename
        self.__config_filename = config_filename
        self.__results_filename = results_filename

        # NOTE: Use an OrderedDict if we want to search by operation name.
        # Using a list because the order the data is read in corresponds to
        # the order of the layers in the configuration file.
        self.__tensor_layers = []

        # [layer_name] -> list of per-operation dicts, in table order.
        # (The previous factory, list(defaultdict(str)), also evaluated to
        # an empty list; defaultdict(list) states that directly.)
        self.__tensor_table = defaultdict(list)

        # [Time/Energy][index of the configuration in read order] = total
        self.__aggregate_results = defaultdict(lambda: defaultdict(float))
        self.__config_count = 0

    @staticmethod
    def is_conv(operation_name):
        """Return True if the name starts with ``Conv``."""
        return operation_name.startswith("Conv")

    @staticmethod
    def is_nml(operation_name):
        """Return True if the name starts with ``NML``."""
        return operation_name.startswith("NML")

    @staticmethod
    def is_fc(operation_name):
        """Return True if the name starts with ``FC``."""
        return operation_name.startswith("FC")

    def __parse_tensor_layer_file(self):
        """Read the layer file into ``self.__tensor_layers``.

        Exits with status 1 if the file is missing or a layer name is not
        a Conv/FC/NML name. Blank lines are skipped.
        """
        if not os.path.isfile(self.__layer_filename):
            print("ERROR: %s was not found." % self.__layer_filename)
            sys.exit(1)

        # (dictionary key, column index) pairs; column 6 of a Conv line is
        # not read.
        conv_fields = (("N", 1), ("Cin", 2), ("H", 3), ("W", 4), ("Cout", 5),
                       ("Kh", 7), ("Kw", 8), ("Sh", 9), ("Sw", 10))
        fc_fields = (("RA", 1), ("CA", 2), ("RB", 3), ("CB", 4))

        with open(self.__layer_filename, "r") as layer_file:
            for raw_line in layer_file:
                stripped = raw_line.strip()
                if not stripped:
                    continue  # tolerate blank / trailing empty lines

                layer_data = stripped.split(',')
                layer_name = layer_data[0]

                tensor_layer = defaultdict(str)
                tensor_layer["Name"] = layer_name

                if Driver.is_conv(layer_name):
                    fields = conv_fields
                elif Driver.is_fc(layer_name):
                    fields = fc_fields
                elif Driver.is_nml(layer_name):
                    fields = ()  # TODO should we store data for NMLs?
                else:
                    print("ERROR: Invalid layer name %s" % layer_name)
                    sys.exit(1)

                for key, column in fields:
                    tensor_layer[key] = float(layer_data[column])

                self.__tensor_layers.append(tensor_layer)

    def __parse_tensor_table(self):
        """Read the tensor table into ``self.__tensor_table``.

        Each layer starts with a header line
        ``** <layer_name> <num_ops> <col> <col> ...`` followed by
        ``num_ops`` lines ``<op_name> <value> <value> ...``. Parsing stops
        at the first empty line or at end of file.
        """
        if not os.path.isfile(self.__table_filename):
            print("ERROR: %s was not found." % self.__table_filename)
            sys.exit(1)

        with open(self.__table_filename, "r") as table_file:
            line = table_file.readline().strip()

            while line:
                # Line here MUST be a header or there's a bug
                assert line.startswith("**"), \
                    "expected a '**' layer header, got: %r" % line

                header_contents = line.split(' ')[1:]
                layer_name = header_contents[0]
                num_ops = int(header_contents[1])
                col_names = header_contents[2:]

                layer_operations = []
                for _ in range(num_ops):
                    op_data = table_file.readline().strip().split(' ')

                    # Number of data items must match the number of columns
                    assert len(op_data) - 1 == len(col_names), \
                        "layer %s: expected %d values, got %d" % (
                            layer_name, len(col_names), len(op_data) - 1)

                    operation_data = defaultdict(str)
                    operation_data["Name"] = op_data[0]
                    for col_name, raw_value in zip(col_names, op_data[1:]):
                        operation_data[col_name] = float(raw_value)

                    layer_operations.append(operation_data)

                self.__tensor_table[layer_name] = layer_operations
                line = table_file.readline().strip()

    @staticmethod
    def is_promise(config_layer):
        """Return True if the leading number of the string is below fp16_swing."""
        return float(config_layer.split(' ')[0]) < Driver.fp16_swing

    def __quantize(self, curr_layer, prev_layer, h2f_f2h_operation_ind,
                   layer_data):
        """Return the (time, energy) cost of converting between precisions.

        The cost is zero when the approximation type did not change or when
        either side is PROMISE. Otherwise the ``h2f_*`` columns are used when
        switching to FP32 and the ``f2h_*`` columns when switching to FP16,
        taken from the table row at ``h2f_f2h_operation_ind`` of the layer.
        """
        if curr_layer == prev_layer \
                or curr_layer == Driver.ApproxTypes.PROMISE \
                or prev_layer == Driver.ApproxTypes.PROMISE:
            return 0.0, 0.0  # No quantization needed

        if curr_layer == Driver.ApproxTypes.FP32:
            prefix = "h2f"
        elif curr_layer == Driver.ApproxTypes.FP16:
            prefix = "f2h"
        else:
            # No conversion columns are read for other approximation types
            # (e.g. PERF). Previously this path fell through with `time` and
            # `energy` unbound and crashed; charge no conversion cost.
            return 0.0, 0.0

        layer_name = layer_data["Name"]

        # NOTE: Ignoring logic where curr == promise or prev == promise bc
        # smartDMA is always true so we'd return near the beginning of the
        # method.
        print(layer_name, self.__tensor_table[layer_name])
        tensor_op_row = self.__tensor_table[layer_name][h2f_f2h_operation_ind]
        time = tensor_op_row["%s_time" % prefix]
        energy = tensor_op_row["%s_energy" % prefix]

        print("Quantization: (%f, %f)" % (time, energy))
        return (time, energy)

    def __run_promise_simulation(self, swing, layer_data):
        """Run the external ``./ptm`` simulator for one Conv/FC layer.

        Returns (time, energy) parsed from the program's stdout, which must
        be a single ``<time>,<energy>`` pair. Exits with status 1 for any
        other layer kind.
        """
        layer_name = layer_data["Name"]
        patch_factor = 1

        if Driver.is_conv(layer_name):
            rows_a = layer_data["N"] * layer_data["H"] * layer_data["W"] \
                / (layer_data["Sh"] * layer_data["Sw"])
            cols_a = layer_data["Cin"] * layer_data["Kh"] * layer_data["Kw"]
            rows_b = cols_a
            cols_b = layer_data["Cout"]
            patch_factor = layer_data["Kh"] * layer_data["Kw"]
        elif Driver.is_fc(layer_name):
            rows_a = layer_data["RA"]
            cols_a = layer_data["CA"]
            rows_b = cols_a
            cols_b = layer_data["CB"]
        else:
            print("PROMISE can't run whatever this layer is.")
            sys.exit(1)

        # Run promise simulator
        # TODO need to print time and energy in the ptm runner so we can
        # pipe it
        command = ["./ptm", str(rows_a), str(cols_a), str(rows_b),
                   str(cols_b), str(patch_factor), str(swing)]
        # universal_newlines=True makes stdout a text string, so the
        # split(',') below also works where pipes yield bytes by default.
        output = subprocess.Popen(command, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  universal_newlines=True).communicate()[0]
        total_time_energy = output.strip().split(',')

        assert len(total_time_energy) == 2, \
            "unexpected ptm output: %r" % output
        print("PROMISE: (%s, %s)" % (total_time_energy[0], total_time_energy[1]))
        return float(total_time_energy[0]), float(total_time_energy[1])

    def __run_simulations(self):
        """Accumulate time/energy totals for every configuration.

        A configuration is a ``+++++`` line, a name line, one line per layer
        and a terminating ``-----`` line. Layer lines are space separated:
        ``<n> promise <..> <swing>`` or
        ``<n> gpu <op> <approx> <num> [<op> <approx> <num> ...]``.
        Layer lines are matched to the parsed layers by position.

        Raises:
            ValueError: if a configuration is not terminated by ``-----`` or
                lists more layers than the layer file.
        """
        if not os.path.isfile(self.__config_filename):
            print("ERROR: %s was not found" % self.__config_filename)
            sys.exit(1)

        approx_by_name = {
            "fp16": Driver.ApproxTypes.FP16,
            "fp32": Driver.ApproxTypes.FP32,
            "perf": Driver.ApproxTypes.PERF,
        }
        time_totals = self.__aggregate_results[Driver.results_time_key]
        energy_totals = self.__aggregate_results[Driver.results_energy_key]

        with open(self.__config_filename, "r") as config_file:
            line = config_file.readline().strip()

            while line:
                assert line.startswith("+++++"), \
                    "expected a '+++++' configuration header, got: %r" % line
                config_file.readline()  # Next line = configuration name (unused)
                print("CONFIGURATION")

                config_ind = self.__config_count
                line = config_file.readline().strip()
                layer_ind = 0  # NOTE can also use the leftmost number in the line

                # Every configuration starts out in FP32.
                prev_layer = Driver.ApproxTypes.FP32

                while not line.startswith("-----"):
                    if not line:
                        raise ValueError(
                            "configuration %d in %s is not terminated by "
                            "'-----'" % (config_ind, self.__config_filename))
                    if layer_ind >= len(self.__tensor_layers):
                        raise ValueError(
                            "configuration %d lists more layers than %s"
                            % (config_ind, self.__layer_filename))

                    layer_info = line.split(' ')
                    layer_data = self.__tensor_layers[layer_ind]
                    layer_name = layer_data["Name"]

                    if layer_info[1] == "promise":
                        print("Running layer %s on PROMISE" % layer_name)
                        swing = int(layer_info[3])
                        time, energy = self.__run_promise_simulation(
                            swing, layer_data)
                        print(time, energy)
                        time_totals[config_ind] += time
                        energy_totals[config_ind] += energy
                        prev_layer = Driver.ApproxTypes.PROMISE

                    elif layer_info[1] == "gpu":
                        # Parse each individual tensor operation
                        # TODO not portable bc there can be multiple numbers
                        # after each approx later on
                        total_time = 0
                        total_energy = 0

                        # Tokens come in (op, approx type, approx number)
                        # triples starting at index 2.
                        triple_starts = range(2, len(layer_info), 3)
                        for tensor_ind, start in enumerate(triple_starts):
                            approx_type = layer_info[start + 1]
                            approx_num = layer_info[start + 2]  # only matters if perf

                            assert approx_type in approx_by_name, \
                                "unknown approximation type %r" % approx_type
                            curr_layer = approx_by_name[approx_type]

                            quant_time, quant_energy = self.__quantize(
                                curr_layer, prev_layer, tensor_ind, layer_data)
                            op_time, op_energy = self.__run_gpu_simulation(
                                curr_layer, layer_name, tensor_ind, approx_num)

                            # The conversion cost belongs to the operation
                            # that triggered it.
                            total_time += quant_time + op_time
                            total_energy += quant_energy + op_energy

                            # The next op is compared against this one.
                            prev_layer = curr_layer

                        time_totals[config_ind] += total_time
                        energy_totals[config_ind] += total_energy

                    layer_ind += 1
                    line = config_file.readline().strip()

                self.__config_count += 1
                line = config_file.readline().strip()

    def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind,
                             approx_num):
        """Look up the (time, energy) of one GPU tensor operation.

        Reads ``fp32_*``, ``fp16_*`` or ``perf<approx_num>_*`` columns of the
        table row ``tensor_ind`` of ``layer_name``.

        Raises:
            ValueError: if ``curr_layer`` is not FP32, FP16 or PERF.
        """
        tensor_info = self.__tensor_table[layer_name][tensor_ind]

        if curr_layer == Driver.ApproxTypes.FP32:
            prefix = "fp32"
        elif curr_layer == Driver.ApproxTypes.FP16:
            prefix = "fp16"
        elif curr_layer == Driver.ApproxTypes.PERF:
            prefix = "perf%s" % approx_num
        else:
            raise ValueError(
                "approximation type %r cannot run on the GPU" % (curr_layer,))

        # Time comes from the *_time column and energy from *_energy (the
        # PERF time used to be read from the energy column by mistake).
        time = tensor_info["%s_time" % prefix]
        energy = tensor_info["%s_energy" % prefix]

        print("GPU: (%f, %f)" % (time, energy))
        return time, energy

    def __display_results(self):
        """Write the per-configuration totals to the results file.

        For each of Time and Energy: a ``Configuration,Total,Improvement``
        table (improvement = value of configuration 0 divided by the
        configuration's value plus 0.0001), followed by the configuration
        with the smallest total. The first one wins ties.
        """
        attributes_to_print = [Driver.results_time_key,
                               Driver.results_energy_key]

        with open(self.__results_filename, "w") as results_file:
            for attribute in attributes_to_print:
                results = self.__aggregate_results[attribute]

                results_file.write("%s\n" % attribute)
                results_file.write("Configuration,Total,Improvement\n")

                baseline_val = results[0]
                print(baseline_val)

                for config_ind in range(self.__config_count):
                    value = results[config_ind]
                    # Using repr to keep all decimal digits when writing
                    # 0.0001 keeps the division defined for a zero total
                    results_file.write("c%d,%s,%s\n" % (
                        config_ind, repr(value),
                        repr(baseline_val / (value + 0.0001))))

                if self.__config_count == 0:
                    # Nothing was simulated, so there is no best
                    # configuration to report.
                    results_file.write("\n")
                    continue

                # min() returns the first minimal index, and a total of 0.0
                # is a valid best result.
                best_config = min(range(self.__config_count),
                                  key=lambda ind: results[ind])
                results_file.write("\nc%d,%s\n\n" % (
                    best_config, repr(results[best_config])))


if __name__ == "__main__":
    if len(sys.argv) != 5:
        print("Usage: python driver.py <layer info> <tensor info> <configurations> <results file>")
        sys.exit(1)
    Driver(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]).driver()