diff --git a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp index 7ea0c1dce23cf94385df3089c499338bec281b64..c54dd9ef3bbf6fbd8e75cbb4bc488cb215e580ce 100644 --- a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp @@ -857,7 +857,7 @@ errs() << "TensorII: " << *TensorII << "\n"; Args.push_back(TensorII->getOperand(7)); // Create wrapper API runtime function call - Constant* wrapper_tensorGroupConvolution; + Constant* wrapper_tensorGroupConvolution = M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"), RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType()); CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution, @@ -895,9 +895,9 @@ errs() << "TensorII: " << *TensorII << "\n"; Args.push_back(TensorII->getOperand(3)); Args.push_back(TensorII->getOperand(4)); Args.push_back(TensorII->getOperand(5)); - + // Create wrapper API runtime function call - Constant* wrapper_tensorBatchNorm; + Constant* wrapper_tensorBatchNorm = M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"), RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType()); CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm, diff --git a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh index cde03bd6d0ffa9969c785e17fe2f708c75396158..33a54cd0de626113e5cf11e2f6a6928d4fa384eb 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh +++ b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh @@ -3,11 +3,9 @@ export HPVM_TENSOR_RT_HOME=/home/hsharif3/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/ export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH -clang++ -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc +clang++ -I/software/cuda-9.1/include -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc llvm-dis --version llvm-dis ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc -cp ${HPVM_TENSOR_RT_HOME}/build/libtensor_runtime.a ${HPVM_TENSOR_RT_HOME}/lib/libtensor_runtime.a -cp ${HPVM_TENSOR_RT_HOME}/build_autotuner/libtensor_runtime.a ${HPVM_TENSOR_RT_HOME}/lib/libtensor_autotuner.a diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py deleted file mode 100644 index e3b94082f5be7b83a1598625afd5ef05a0472506..0000000000000000000000000000000000000000 --- a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py +++ /dev/null @@ -1,304 +0,0 @@ -import glob -import os -import subprocess -import shutil -import sys - -from collections import defaultdict - -''' -FORMAT - -** LayerName NumOpsInLayer <cols> -OpName Col1Val Col2Val ... - -** Conv1 1 h2f_time h2f_energy fp32_time fp32_energy f2h_time f2h_energy fp16_perf_time fp16_perf_energy fp16_time fp16_energy -Conv1 51.8808 97.2844 319.582 601.966 12.81 18.758 388.092 650.649 340.037 590.664 - -''' - -class TableGenerator: - - __ops_header_delimiter = "#" - __table_header_delimter = "**" - __time_col_name = "time" - __energy_col_name = "energy" - - ''' - Stores all precision conversions used. - ''' - precision_conversions = frozenset(["h2f", "f2h"]) - - def __init__(self, dir_path, iters, profiler_binary_name): - ''' - Args: - dir_path: Path of directory containing network binaries - iters: Number of iterations to run each binary for - profiler_binary_name: Name of offline profiler binary to run - ''' - self.__dir_path = dir_path - - # Name of the actual directory - self.__network_name = os.path.split(dir_path)[-1] - - self.__iters = iters - self.__profiler_binary_name = profiler_binary_name - - # Path to results directory - self.__results_dir_path = "%s_results" % self.__dir_path - - # Outputted table file - self.__table_filename = "%s_tensors.txt" % self.__network_name - - # Nested default dictionary of default dicts - self.__table = self.__build_nested_default_dict() - - - def generate_table(self): - ''' - Generates a table file called <network_name>_tensors.txt in the following - steps: - 1. Runs the offline profiler against the inputted binaries to generate - results files - 2. Builds an internal table storing all data from the parsed results files - the offline profiler generated - 3. Writes the internal table to <network_name>_tensors.txt file and uses the - <network_name>_ops.txt file as a guideline in terms of row order - ''' - self.__run_inputted_binaries() - self.__build_internal_table() - self.__output_table_to_file() - - - def __run_inputted_binaries(self): - ''' - Invokes the profiler to run all appropriate binaries (must start with the network - name) in the inputted directory. Result files generated by the profiler are - stored in the results file directory and are named <binary_name>.txt. These results - files are then parsed in a later step to generate the table - ''' - if not os.path.isdir(self.__dir_path): - print("ERROR: Directory %s not found" % self.__dir_path) - exit(1) - - try: - os.mkdir(self.__results_dir_path) - except OSError: - if os.path.isdir(self.__results_dir_path): - print("Directory already exists. Clearing directory.") - for old_file in glob.glob(os.path.join(self.__results_dir_path, "*")): - os.remove(old_file) - else: - print("ERROR: Directory doesn't exist but failed to create dir") - - for binary_name in os.listdir(self.__dir_path): - binary_path = os.path.join(self.__dir_path, binary_name) - - if not self.__should_execute_file(binary_path): - continue - - output_file = os.path.join(self.__results_dir_path, binary_name + ".txt") - # No stdout/stderr piping needed for now - subprocess.Popen([profiler_binary_name, binary_path, str(self.__iters), \ - output_file]).communicate() - - - def __build_internal_table(self): - ''' - Iterates through each results file generated by the runs of the offline - profiler and stores the data in a dictionary in the following format: - [operation name][approximation type OR conversion type][time/energy] - ''' - for results_file_name in os.listdir(self.__results_dir_path): - # Ignore if it's not a results file - if results_file_name == self.__table_filename or \ - not results_file_name.startswith(self.__network_name): - continue - - approx_type = self.__get_approximation_type(results_file_name) - results_file = open(os.path.join(self.__results_dir_path, results_file_name), "r") - - for line in results_file: - line = line.strip() - op_name, total_time, total_energy = self.__parse_tensor_operation_line(line) - - # If the current operation is f2h or h2f - if any(op_name.endswith(prec_conv) for prec_conv in TableGenerator.precision_conversions): - # Get the original operation name (without the f2h/h2f) and the conversion type - orig_op_name, conversion_type = self.__get_original_operation_name(op_name) - - if orig_op_name not in self.__table: - print("ERROR: Conversion found but original %s is not in the table" % orig_op_name) - exit(1) - - # Store f2h and h2f as columns in the row belonging to the original operation - self.__table[orig_op_name][conversion_type][TableGenerator.__time_col_name] = total_time - self.__table[orig_op_name][conversion_type][TableGenerator.__energy_col_name] = total_energy - - # Create a new row in the dictionary - else: - self.__table[op_name][approx_type][TableGenerator.__time_col_name] = total_time - self.__table[op_name][approx_type][TableGenerator.__energy_col_name] = total_energy - results_file.close() - - - def __output_table_to_file(self): - ''' - Outputs the internally stored table to a file using the <network_name>_ops.txt file as - a guideline in the following steps: - 1. Opens the ops file and the file to output the table to - 2. Reads a line from the ops file (guaranteed to be the layers/NML header) - 3. For each operation in the layer (or 1 operation if the "layer" is a NML), we store the - time and the energy - ''' - table_file_path = os.path.join(self.__results_dir_path, self.__table_filename) - soc_operations_file_name = os.path.join("/", "home", "nvidia", "soc_simulator", \ - "%s_cifar10" % self.__network_name, "%s_ops.txt" % self.__network_name) - - soc_operations_file = open(soc_operations_file_name, "r") - table_file = open(table_file_path, "w") - - curr_line = soc_operations_file.readline().strip() - - while curr_line: - # First line is always the layers line (#layer_name,num_ops) - layer_name, num_ops = self.__parse_layer_info_line(curr_line) - - # List of strings, where each string is a row corresponding to an operation - # in the layer - ops_in_layer = [] - - # Stores a list of elements in the header, which will be joined into a string - # The header is only generated for the first operation in the layer - # CRITICAL ASSUMPTION: All operations within a layer have the same # columns - # or everything breaks bc the header is per layer, not per operation - header = [TableGenerator.__table_header_delimter, layer_name, str(num_ops)] - - # Iterate through all operations within the layer - for op_in_layer_count in range(num_ops): - # Contains the operation name - curr_line = soc_operations_file.readline().strip() - - # Stores a list of elements that will be joined to make up a row - curr_op = [curr_line] - operation_data = self.__table[curr_line] - - # Iterate through time/energy data for each approximation type corresponding - # to the current operation - for approx_type in operation_data: - op_time = operation_data[approx_type][TableGenerator.__time_col_name] - op_energy = operation_data[approx_type][TableGenerator.__energy_col_name] - - curr_op.append(op_time) - curr_op.append(op_energy) - - if op_in_layer_count == 0: - header.append("%s_time" % approx_type) - header.append("%s_energy" % approx_type) - - ops_in_layer.append(' '.join(curr_op)) - - # Getting all operation rows and then writing everything because - # calls to write() are slow (memory vs time tradeoff) - table_file.write("%s\n%s\n" % (' '.join(header), '\n'.join(ops_in_layer))) - - curr_line = soc_operations_file.readline().strip() - - - def __should_execute_file(self, file_path): - ''' - Checks if the file at the given file path is a binary that should be run - by the profiler. Must exist, be a binary, and must start with the network - name as per our naming standards. - - Args: - file_path: Path of the file to check - ''' - return os.path.isfile(file_path) and os.access(file_path, os.X_OK) and \ - file_path.find(self.__network_name) != -1 - - - def __get_approximation_type(self, results_filename): - ''' - Parses a given results filename for the approximation type. - Format assumption: <network_name>_<approx_type>.txt - - Args: - results_filename: Name of results file - - Returns: - the approximation technique (ex: fp16) - ''' - approx_type_start_ind = results_filename.find(self.__network_name) \ - + len(self.__network_name) + 1 # + 1 to account for _ delimiter - approx_type_end_ind = results_filename.find(".txt") - return results_filename[approx_type_start_ind : approx_type_end_ind] - - - def __parse_tensor_operation_line(self, tensor_op_line): - ''' - Parses a tensor operation line (within a output file from the offline - profiler for the operation name, the total time used, and the total - energy used - - Args: - tensor_op_line: Tensor operation line from output file - - Returns: - operation name - total time used - total energy used - ''' - line_as_list = tensor_op_line.split(",") - return line_as_list[0], line_as_list[1], line_as_list[2] - - - def __build_nested_default_dict(self): - ''' - Builds a nested default dictionary with an arbitrary number of levels - ''' - return defaultdict(self.__build_nested_default_dict) - - def __get_original_operation_name(self, op_name): - ''' - Parses an operation name containing _<conversion type> for the original - operation name. - Format assumption: <original_op_name>_<conversion type> - - Args: - op_name: Name of the operation - - Returns: - the original operation name - ''' - underscore_ind = op_name.find("_") - return op_name[ : underscore_ind], op_name[underscore_ind + 1 : ] - - - def __parse_layer_info_line(self, layer_info_line): #layer_name,num_ops - ''' - Parses a layer header (from the original ops.txt file) into the layer name - and the number of operations - Assumed format: #layer_name,num_ops - - Args: - layer_info_line: Line at the beginning of each layer in the ops file - - Returns: - layer name - number of ops in the layer - ''' - comma_ind = layer_info_line.find(",") - return layer_info_line[layer_info_line.find(TableGenerator.__ops_header_delimiter) + 1 : comma_ind], \ - int(layer_info_line[comma_ind + 1 : ]) - - -if __name__ == "__main__": - if len(sys.argv) != 4: - print("python table_generator.py <binary dir path> <num itrs> <profiler bin path>") - exit(1) - binary_dir_path = sys.argv[1] - num_iters = int(sys.argv[2]) - profiler_binary_name = sys.argv[3] - table_gen = TableGenerator(binary_dir_path, num_iters, profiler_binary_name) - table_gen.generate_table() diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py index 5f5c28032d721dcf1e77ab52407a165c0251deb2..f1f00f4e285fbf487fee03bfee72dbe1a84ea55a 100644 --- a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py @@ -27,17 +27,37 @@ def parse_binary_output(proc_output): return avg_time -# Input: a list of tuples of benchmark names -# Can change to input a file containing benchmarks to run -def run_benchmarks(builds_dir, output_filename, should_print_bin_output = True): - output_file = open(output_filename, "w") +def get_sorted_binaries(builds_dir): + # dict of network names to lists of binaries + # list of binaries should be in sorted order (can do that when we run the benchmarks) + network_bins = defaultdict(list) for bin_name in os.listdir(builds_dir): if bin_name.find("profiling") == -1: continue - output_file.write("%s: %s\n" % (bin_name, \ + network_name = bin_name[ : bin_name.rfind("_")] + network_bins[network_name].append(bin_name) + return network_bins + +# Input: a list of tuples of benchmark names +# Can change to input a file containing benchmarks to run +def run_benchmarks(sorted_bins, builds_dir, output_filename, should_print_bin_output = False): + def get_knob_id(bin_name): + return int(bin_name[bin_name.rfind("_") + 1 : ]) + + output_file = open(output_filename, "w", buffering = 0) + for network_name in sorted_bins: + # Sort the binaries in order by knob id + sorted_bins[network_name].sort(key = get_knob_id) + print("--------------------------------------") + print(network_name) + # Go through all binaries + for bin_name in sorted_bins[network_name]: + print(bin_name) + output_file.write("%s results\n" % bin_name) + output_file.write("%s: %s\n" % (bin_name, \ parse_binary_output(run_benchmark(os.path.join(builds_dir, bin_name), \ should_print_bin_output)))) - print(bin_name) + print("--------------------------------------\n") output_file.close() @@ -48,4 +68,5 @@ if __name__ == "__main__": print("Usage: python online_benchmark_testing_automator.py <builds dir> <outputs_file_name>") exit(1) print("Output file name: %s" % sys.argv[2]) - run_benchmarks(sys.argv[1], sys.argv[2]) + sorted_bins = get_sorted_binaries(sys.argv[1]) + run_benchmarks(sorted_bins, sys.argv[1], sys.argv[2]) diff --git a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll index 3e48a094b89ac506cf50f712a0d60b1bac95f75d..89c8da90f8ab740062bd84cdd365baa67311a7a4 100644 --- a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll +++ b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll @@ -8,8 +8,8 @@ define void @_Z13dummyFunctionv() #0 { entry: %initRT = alloca i8*, align 8 %cleanRT = alloca i8*, align 8 - %initApproxhpvmRT = alloca i8*, align 8 - %cleaApproxhpvmRT = alloca i8*, align 8 + %initApproxRT = alloca i8*, align 8 + %cleanApproxRT = alloca i8*, align 8 %initRTController = alloca i8*, align 8 %cleanRTController = alloca i8*, align 8 %request_tensorPtr = alloca i8*, align 8 @@ -44,17 +44,18 @@ entry: %ConvLayer = alloca i8*, align 8 %FCLayer = alloca i8*, align 8 %ConvLayer2 = alloca i8*, align 8 + %ConvLayer3 = alloca i8*, align 8 %FCLayer2 = alloca i8*, align 8 %AddWrapper = alloca i8*, align 8 %ReluWrapper = alloca i8*, align 8 %TanhWrapper = alloca i8*, align 8 %BatchNormWrapper = alloca i8*, align 8 %PoolingWrapper = alloca i8*, align 8 - %SoftmaxWrapper = alloca i8*, align 8 + %softmaxWrapper = alloca i8*, align 8 store i8* bitcast (void (i32)* @llvm_hpvm_initTensorRt to i8*), i8** %initRT, align 8 store i8* bitcast (void ()* @llvm_hpvm_cleanupTensorRt to i8*), i8** %cleanRT, align 8 - store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxhpvmRT, align 8 - store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleaApproxhpvmRT, align 8 + store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxRT, align 8 + store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleanApproxRT, align 8 store i8* bitcast (void (i8*, i8*)* @llvm_hpvm_initializeRuntimeController to i8*), i8** %initRTController, align 8 store i8* bitcast (void ()* @llvm_hpvm_clearRuntimeController to i8*), i8** %cleanRTController, align 8 store i8* bitcast (void (i8*, i32)* @hpvm_request_tensor to i8*), i8** %request_tensorPtr, align 8 @@ -89,13 +90,14 @@ entry: store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, i32, i32, i32, i32, i32, i32, float, float, i32)* @ConvLayer_PROMISE to i8*), i8** %ConvLayer, align 8 store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, float, float, i32)* @FCLayer_PROMISE to i8*), i8** %FCLayer, align 8 store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float)* @wrapper_ConvLayer to i8*), i8** %ConvLayer2, align 8 + store i8* bitcast (i8* (i8*, i8*, i8*, i32, i32, i32, i32, i32, i32)* @wrapper_tensorGroupConvolution to i8*), i8** %ConvLayer3, align 8 store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, float, float)* @wrapper_FCLayer to i8*), i8** %FCLayer2, align 8 store i8* bitcast (i8* (i8*, i8*, i8*)* @wrapper_tensorAdd to i8*), i8** %AddWrapper, align 8 store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorRelu to i8*), i8** %ReluWrapper, align 8 store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorTanh to i8*), i8** %TanhWrapper, align 8 store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i8*, i8*, double)* @wrapper_tensorBatchNorm to i8*), i8** %BatchNormWrapper, align 8 store i8* bitcast (i8* (i8*, i8*, i32, i32, i32, i32, i32, i32, i32)* @wrapper_tensorPooling to i8*), i8** %PoolingWrapper, align 8 - store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %SoftmaxWrapper, align 8 + store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %softmaxWrapper, align 8 ret void } @@ -175,6 +177,8 @@ declare i8* @FCLayer_PROMISE(i8*, float, float, i8*, float, float, i8*, float, f declare i8* @wrapper_ConvLayer(i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float) #1 +declare i8* @wrapper_tensorGroupConvolution(i8*, i8*, i8*, i32, i32, i32, i32, i32, i32) #1 + declare i8* @wrapper_FCLayer(i8*, i8*, i8*, i8*, i32, float, float) #1 declare i8* @wrapper_tensorAdd(i8*, i8*, i8*) #1 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..04336fca2708d5e5d78849e1c12014f5ddbd1ad7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt @@ -0,0 +1,6 @@ +11894784.000000 +39321600.000000 +21233664.000000 +28311552.000000 +18874368.000000 +20480.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a5722f202dde469dca94c71dd9c5fc1cd7aa32b --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt @@ -0,0 +1,7 @@ +88473.601562 +943718.375000 +471859.187500 +943718.375000 +471859.187500 +943718.375000 +2048.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..74b1b668e2f27f3ddb77dcac7fff9890c70a6f02 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt @@ -0,0 +1,4 @@ +62720.000000 +1003520.000000 +321126.406250 +1024.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..673e704b7e37e19c090e98799189a4411bad9f7c --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt @@ -0,0 +1,28 @@ +88473.601562 +29491.199219 +209715.203125 +14745.599609 +209715.203125 +29491.199219 +419430.406250 +7372.799805 +209715.203125 +14745.599609 +419430.406250 +3686.399902 +209715.203125 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +7372.799805 +419430.406250 +1843.199951 +209715.203125 +3686.399902 +419430.406250 +1024.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..7266441905a08c1ef1796dec8ee6c05660998378 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt @@ -0,0 +1,8 @@ +265420.812500 +629145.625000 +629145.625000 +1258291.250000 +629145.625000 +1258291.250000 +629145.625000 +6144.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdba070cfc5eac559c8384306993fb52a1eb2e04 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt @@ -0,0 +1,22 @@ +44236.800781 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +117964.796875 +235929.593750 +13107.200195 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +117964.796875 +235929.593750 +13107.200195 +235929.593750 +235929.593750 +235929.593750 +235929.593750 +64.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f58ebcc043915d28cf874a1f67e5b2637db1dfc --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt @@ -0,0 +1,15 @@ +88473.601562 +1887436.750000 +943718.375000 +1887436.750000 +943718.375000 +1887436.750000 +1887436.750000 +943718.375000 +1887436.750000 +1887436.750000 +471859.187500 +471859.187500 +471859.187500 +13107.200195 +256.000000 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6daad2e2902e3ac821d99ebbe12e21b6428cc7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt @@ -0,0 +1,15 @@ +884736.000000 +18874368.000000 +9437184.000000 +18874368.000000 +9437184.000000 +18874368.000000 +18874368.000000 +9437184.000000 +18874368.000000 +18874368.000000 +4718592.000000 +4718592.000000 +4718592.000000 +131072.000000 +25600.000000 diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h index b482cef5377e0f879b43f06a7ebbfbe01b39be09..14dc8f20f2111e85e82630cdbcc0c695a39c5ecd 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h @@ -72,7 +72,9 @@ public: FP32, FP16, PERFORATION, -// INPUT_SAMPLING, + INPUT_SAMPLING, + REDUCTION_SAMPLING, +// ADDITIONAL_APPROXIMATION_METHOD APPROX_END }; @@ -91,6 +93,15 @@ public: POOL_MEAN, POOL_MIN, SOFTMAX, + FFT, + REDUCE, + PROJECTIVE_T, + MAP1, + MAP2, + MAP3, +// STENCIL, +// COSINE_T, +// ADDITIONAL_TENSOR_OPERATION TENSOR_OP_END }; @@ -269,6 +280,24 @@ void GPUNodeConfiguration::print() { case TENSOR_OP::SOFTMAX : DEBUG("softmax"); break; + case TENSOR_OP::FFT : + DEBUG("fft"); + break; + case TENSOR_OP::REDUCE : + DEBUG("reduce"); + break; + case TENSOR_OP::PROJECTIVE_T : + DEBUG("projectiveT"); + break; + case TENSOR_OP::MAP1 : + DEBUG("map1"); + break; + case TENSOR_OP::MAP2 : + DEBUG("map2"); + break; + case TENSOR_OP::MAP3 : + DEBUG("map3"); + break; default : ERROR("Unknown tensor operation."); break; @@ -288,6 +317,12 @@ void GPUNodeConfiguration::print() { case APPROX::PERFORATION : DEBUG("perf"); break; + case APPROX::INPUT_SAMPLING : + DEBUG("input_samp"); + break; + case APPROX::REDUCTION_SAMPLING : + DEBUG("red_samp"); + break; default: ERROR("Unknown approximation option"); break; diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h index 911f42b955a72cb756aadc1fc78231187ef3394e..21c6df7f1749e891dba257bbb1933c3beefb8c4f 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h @@ -735,6 +735,30 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG ("Found softmax operation\n"); NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX); idx++; + } else if (tokens[idx] == "fft") { + DEBUG ("Found fft operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT); + idx++; + } else if (tokens[idx] == "reduce") { + DEBUG ("Found reduce operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE); + idx++; + } else if (tokens[idx] == "projectiveT") { + DEBUG ("Found projectiveT operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T); + idx++; + } else if (tokens[idx] == "map1") { + DEBUG ("Found map1 operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1); + idx++; + } else if (tokens[idx] == "map2") { + DEBUG ("Found map2 operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2); + idx++; + } else if (tokens[idx] == "map3") { + DEBUG ("Found map3 operation\n"); + NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3); + idx++; } else /*Not a new operation. This means an approximation option*/ if (tokens[idx] == "fp32") { DEBUG("Found fp32 option\n"); @@ -756,6 +780,18 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG("perf parameter: %d\n", perf); NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf); idx += 2; + } else if (tokens[idx] == "input_samp") { + DEBUG("Found input_samp option\n"); + int input_samp = std::stoi(tokens[idx+1]); + DEBUG("input_samp parameter: %d\n", input_samp); + NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, input_samp); + idx += 2; + } else if (tokens[idx] == "red_samp") { + DEBUG("Found red_samp option\n"); + int red_samp = std::stoi(tokens[idx+1]); + DEBUG("red_samp parameter: %d\n", red_samp); + NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp); + idx += 2; } // TODO: other approximation options handled here diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu index 13ef262bac301e35d7c3e6306a3706dfe79a68a2..282a0cbb68de4f033b46cdc5c4a8ad69aa1f20c0 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu @@ -329,12 +329,18 @@ extern "C"{ void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){ Tensor* tensor = (Tensor*) tensor_ptr; - - if(tensor->size_in_bytes != size_in_bytes){ + + size_t host_size_in_bytes = tensor->num_elems * 4; + //if(tensor->size_in_bytes != size_in_bytes){ + if(host_size_in_bytes != size_in_bytes){ ERROR("The destination and source sizes don't match"); } std::memcpy(tensor->host_data, data_ptr, size_in_bytes); + + changeTensorPlacement(tensor, HOST); + + tensor->cur_type = float_type; } @@ -428,7 +434,7 @@ extern "C"{ -bool ONLINE_PROFILING = false; +bool ONLINE_PROFILING = false; // true; void convertToFP16(struct Tensor* tensor){ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile index 4e762ea9894405bb375f518b65c209b4129d9f70..83b4dc9431ee84051def8a0f6850e7f2c194f033 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile @@ -1,5 +1,6 @@ DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks # NOTE: can configure build directory +#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/ HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT) CC = $(HPVM_BUILD_DIR)/bin/clang++ @@ -15,9 +16,10 @@ APP = alexnet TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a +PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a +SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a - CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL @@ -58,15 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc - $(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc - $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop.bc + #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_linked.bc - $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_loop_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_linked $(LINKER_FLAGS) - #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) + #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp index ee07bdd8f9901f1582d5f7642a2a86c099397a14..d92bc0c45d1115620d529aea4636ece8d3d62127 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp @@ -9,8 +9,10 @@ #include <tensorTypes.h> #include <tensorUtils.h> + + void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 5, 5, 1, 1); @@ -18,7 +20,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -26,7 +28,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_2_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -34,7 +36,7 @@ void var_2_node(void* t1, size_t bytes_t1) { } void var_3_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -42,7 +44,7 @@ void var_3_node(void* t1, size_t bytes_t1) { } void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -50,7 +52,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -58,7 +60,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_6_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -66,7 +68,7 @@ void var_6_node(void* t1, size_t bytes_t1) { } void var_7_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -74,7 +76,7 @@ void var_7_node(void* t1, size_t bytes_t1) { } void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -82,7 +84,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -90,7 +92,7 @@ void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_10_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -98,7 +100,7 @@ void var_10_node(void* t1, size_t bytes_t1) { } void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -106,7 +108,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -114,7 +116,7 @@ void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_13_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -122,7 +124,7 @@ void var_13_node(void* t1, size_t bytes_t1) { } void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -130,7 +132,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -138,7 +140,7 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_16_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -146,7 +148,7 @@ void var_16_node(void* t1, size_t bytes_t1) { } void var_17_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -154,7 +156,7 @@ void var_17_node(void* t1, size_t bytes_t1) { } void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_mul(t1, t2); @@ -162,7 +164,7 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -177,6 +179,8 @@ void var_20_node(void* t1, size_t bytes_t1) { __visc__return(2, r, (size_t) 0); } + + void root(void* input, size_t input_bytes, void* conv2d_1_w, size_t conv2d_1_w_bytes, void* conv2d_1_b, size_t conv2d_1_b_bytes, @@ -371,9 +375,10 @@ int main(){ std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/alexnet_cifar10_test/"); - + std::string input_path = dir_prefix + std::string("input.bin"); + //void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32); std::string labels_path = dir_prefix + std::string("labels32.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(),10000); + uint8_t* labels = readLabels(labels_path.c_str(),5000); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); @@ -404,6 +409,8 @@ int main(){ __visc__init(); RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); + //args->input = input; + //args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -429,48 +436,38 @@ int main(){ args->dense_1_b = dense_1_b; args->dense_1_b_bytes = 0; - int batch_size = 500; - int test_input_size = 10000; - int batch_count = test_input_size / batch_size; - - std::string input_path = dir_prefix + std::string("input.bin"); + int batch_size = 500; + int test_input_size = 10000; + int batch_count = test_input_size / batch_size; + void* input = create4DTensor(0,nchw,batch_size,3,32,32); - startMemTracking(); - for (int i = 0; i < batch_count; i++){ + startProfiling(); - int start = i * batch_size; - int end = (i + 1) * batch_size; + for (int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; copyInputBatch(input_path.c_str(),start,end,3,32,32, input); - - args->input = input; + + args->input = input; args->input_bytes = 0; - - //void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* dfg = __visc__launch(0, root, (void*) args); + + void* dfg = __visc__launch(0, root, (void*) args); __visc__wait(dfg); + + void *result = static_cast<RootIn*>(args)->input; + hpvm_request_tensor(result, 0); + + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - void *result = static_cast<RootIn*>(args)->input; - hpvm_request_tensor(result, 0); - - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - computeAccuracy3(labels, result); - - llvm_hpvm_invokeRtControl2(result, labels); - freeBatchMemory(); } - - - __visc__cleanup(); - - + stopProfiling(); + __visc__cleanup(); + return 0; - -} +}