diff --git a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp index 7ea0c1dce23cf94385df3089c499338bec281b64..c54dd9ef3bbf6fbd8e75cbb4bc488cb215e580ce 100644 --- a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp @@ -857,7 +857,7 @@ errs() << "TensorII: " << *TensorII << "\n"; Args.push_back(TensorII->getOperand(7)); // Create wrapper API runtime function call - Constant* wrapper_tensorGroupConvolution; + Constant* wrapper_tensorGroupConvolution = M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"), RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType()); CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution, @@ -895,9 +895,9 @@ errs() << "TensorII: " << *TensorII << "\n"; Args.push_back(TensorII->getOperand(3)); Args.push_back(TensorII->getOperand(4)); Args.push_back(TensorII->getOperand(5)); - + // Create wrapper API runtime function call - Constant* wrapper_tensorBatchNorm; + Constant* wrapper_tensorBatchNorm = M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"), RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType()); CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm, diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py index 5f5c28032d721dcf1e77ab52407a165c0251deb2..48320ca197a497f44d164a1366128fbeff2b7352 100644 --- a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py @@ -27,17 +27,37 @@ def parse_binary_output(proc_output): return avg_time -# Input: a list of tuples of benchmark names -# Can change to input a file containing benchmarks to run -def run_benchmarks(builds_dir, output_filename, should_print_bin_output = True): - output_file = open(output_filename, "w") +def get_sorted_binaries(builds_dir): + # dict of network names to lists of binaries + # list of binaries should be in sorted order (can do that when we run the benchmarks) + network_bins = defaultdict(list) for bin_name in os.listdir(builds_dir): if bin_name.find("profiling") == -1: continue - output_file.write("%s: %s\n" % (bin_name, \ + network_name = bin_name[ : bin_name.rfind("_")] + network_bins[network_name].append(bin_name) + return network_bins + +# Input: a list of tuples of benchmark names +# Can change to input a file containing benchmarks to run +def run_benchmarks(sorted_bins, builds_dir, output_filename, should_print_bin_output = False): + def get_knob_id(bin_name): + return int(bin_name[bin_name.rfind("_") + 1 : ]) + + output_file = open(output_filename, "w", buffering = 0) + for network_name in sorted_bins: + # Sort the binaries in order by knob id + sorted_bins[network_name].sort(key = get_knob_id) + print("--------------------------------------") + print(network_name) + # Go through all binaries + for bin_name in sorted_bins[network_name]: + print(bin_name) + output_file.write("%s results\n" % bin_name) + '''output_file.write("%s: %s\n" % (bin_name, \ parse_binary_output(run_benchmark(os.path.join(builds_dir, bin_name), \ - should_print_bin_output)))) - print(bin_name) + should_print_bin_output))))''' + print("--------------------------------------\n") output_file.close() @@ -48,4 +68,5 @@ if __name__ == "__main__": print("Usage: python online_benchmark_testing_automator.py <builds dir> <outputs_file_name>") exit(1) print("Output file name: %s" % sys.argv[2]) - run_benchmarks(sys.argv[1], sys.argv[2]) + sorted_bins = get_sorted_binaries(sys.argv[1]) + run_benchmarks(sorted_bins, sys.argv[1], sys.argv[2]) diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile index 4e762ea9894405bb375f518b65c209b4129d9f70..83b4dc9431ee84051def8a0f6850e7f2c194f033 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile @@ -1,5 +1,6 @@ DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks # NOTE: can configure build directory +#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/ HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT) CC = $(HPVM_BUILD_DIR)/bin/clang++ @@ -15,9 +16,10 @@ APP = alexnet TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a +PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a +SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a - CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL @@ -58,15 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc - $(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc - $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop.bc + #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_linked.bc - $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_loop_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_linked $(LINKER_FLAGS) - #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) + #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp index ee07bdd8f9901f1582d5f7642a2a86c099397a14..d92bc0c45d1115620d529aea4636ece8d3d62127 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp @@ -9,8 +9,10 @@ #include <tensorTypes.h> #include <tensorUtils.h> + + void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 5, 5, 1, 1); @@ -18,7 +20,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -26,7 +28,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_2_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -34,7 +36,7 @@ void var_2_node(void* t1, size_t bytes_t1) { } void var_3_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -42,7 +44,7 @@ void var_3_node(void* t1, size_t bytes_t1) { } void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -50,7 +52,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -58,7 +60,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_6_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -66,7 +68,7 @@ void var_6_node(void* t1, size_t bytes_t1) { } void var_7_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -74,7 +76,7 @@ void var_7_node(void* t1, size_t bytes_t1) { } void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -82,7 +84,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -90,7 +92,7 @@ void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_10_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -98,7 +100,7 @@ void var_10_node(void* t1, size_t bytes_t1) { } void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -106,7 +108,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -114,7 +116,7 @@ void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_13_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -122,7 +124,7 @@ void var_13_node(void* t1, size_t bytes_t1) { } void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -130,7 +132,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -138,7 +140,7 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_16_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_tanh(t1); @@ -146,7 +148,7 @@ void var_16_node(void* t1, size_t bytes_t1) { } void var_17_node(void* t1, size_t bytes_t1) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -154,7 +156,7 @@ void var_17_node(void* t1, size_t bytes_t1) { } void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_mul(t1, t2); @@ -162,7 +164,7 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { } void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { - __visc__hint(visc::CUDNN_TARGET); + __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); void *r = __visc__tensor_add(t1, t2); @@ -177,6 +179,8 @@ void var_20_node(void* t1, size_t bytes_t1) { __visc__return(2, r, (size_t) 0); } + + void root(void* input, size_t input_bytes, void* conv2d_1_w, size_t conv2d_1_w_bytes, void* conv2d_1_b, size_t conv2d_1_b_bytes, @@ -371,9 +375,10 @@ int main(){ std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/alexnet_cifar10_test/"); - + std::string input_path = dir_prefix + std::string("input.bin"); + //void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32); std::string labels_path = dir_prefix + std::string("labels32.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(),10000); + uint8_t* labels = readLabels(labels_path.c_str(),5000); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); @@ -404,6 +409,8 @@ int main(){ __visc__init(); RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); + //args->input = input; + //args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -429,48 +436,38 @@ int main(){ args->dense_1_b = dense_1_b; args->dense_1_b_bytes = 0; - int batch_size = 500; - int test_input_size = 10000; - int batch_count = test_input_size / batch_size; - - std::string input_path = dir_prefix + std::string("input.bin"); + int batch_size = 500; + int test_input_size = 10000; + int batch_count = test_input_size / batch_size; + void* input = create4DTensor(0,nchw,batch_size,3,32,32); - startMemTracking(); - for (int i = 0; i < batch_count; i++){ + startProfiling(); - int start = i * batch_size; - int end = (i + 1) * batch_size; + for (int i = 0; i < batch_count; i++){ + + int start = i * batch_size; + int end = (i + 1) * batch_size; copyInputBatch(input_path.c_str(),start,end,3,32,32, input); - - args->input = input; + + args->input = input; args->input_bytes = 0; - - //void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* dfg = __visc__launch(0, root, (void*) args); + + void* dfg = __visc__launch(0, root, (void*) args); __visc__wait(dfg); + + void *result = static_cast<RootIn*>(args)->input; + hpvm_request_tensor(result, 0); + + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - void *result = static_cast<RootIn*>(args)->input; - hpvm_request_tensor(result, 0); - - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - computeAccuracy3(labels, result); - - llvm_hpvm_invokeRtControl2(result, labels); - freeBatchMemory(); } - - - __visc__cleanup(); - - + stopProfiling(); + __visc__cleanup(); + return 0; - -} +}