diff --git a/README.md b/README.md index cc8891b8b4d005758306614569a5253b2249c94b..3cac5cadb819ef890eb12bad858816c9c6dcbd2b 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Build hpvm ```shell mkdir install mkdir build && cd build -cmake ../llvm -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DCMAKE_INSTALL_PREFIX=../install +cmake ../llvm -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="X86" -DCMAKE_INSTALL_PREFIX=../install make -j<number of threads you want to use to build hpvm> ``` diff --git a/llvm/projects/hpvm-tensor-rt/bin/times.py b/llvm/projects/hpvm-tensor-rt/bin/times.py new file mode 100644 index 0000000000000000000000000000000000000000..082b0d91acb19e70a6c217b25f8747f3197b45b7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/bin/times.py @@ -0,0 +1,78 @@ + + + +class Config: + def __init__(self): + self.runtime = 0 + self.fed_runs = 0 + self.full_runs = 0 + + +def computeTimes(bench): + + conf_runs = 60 + fed_time = (bench.runtime * 100) + (bench.fed_runs * conf_runs * bench.runtime) + fed_time_hrs = fed_time / (60*60) + + full_time = (bench.runtime * 1000) + (bench.full_runs * conf_runs * bench.runtime) + full_time_hrs = full_time / (60*60) + + print ("fedtime_hrs = ", fed_time_hrs, " full_time_hrs = ", full_time_hrs, "\n") + + + +if __name__ == "__main__": + + + resnet = Config() + resnet.runtime = 8 + resnet.fed_runs = 3 + resnet.full_runs = 5 + + computeTimes(resnet) + + alexnet = Config() + alexnet.runtime = 7.8 + alexnet.fed_runs = 47 + alexnet.full_runs = 274 + + computeTimes(alexnet) + + alexnet2 = Config() + alexnet2.runtime = 2.3 + alexnet2.fed_runs = 62 + alexnet2.full_runs = 339 + + computeTimes(alexnet2) + + vgg1 = Config() + vgg1.runtime = 7.4 + vgg1.fed_runs = 15 + vgg1.full_runs = 211 + + computeTimes(vgg1) + + + vgg2 = Config() + vgg2.runtime = 15.4 + vgg2.fed_runs = 8 + vgg2.full_runs = 150 + + computeTimes(vgg2) + + + lenet = Config() + lenet.runtime = 0.98 + lenet.fed_runs = 64 + lenet.full_runs = 228 + + computeTimes(lenet) + + + mobilenet = Config() + mobilenet.runtime = 11 + mobilenet.fed_runs = 32 + mobilenet.full_runs = 267 + + computeTimes(mobilenet) + diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..095e037430dbf1751dddfd047d0cf0157ad9e2e7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/CMakeLists.txt @@ -0,0 +1,119 @@ +cmake_minimum_required (VERSION 2.6) +project (cudnn-training) + +find_package(CUDA 6.5 REQUIRED) + + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + message("Debug mode") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_60,code=compute_60;-std=c++11;-g;-lineinfo;-Xcompiler;-ggdb;-lcurand) +else() + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_60,code=compute_60;-std=c++11;-DNDEBUG;-Xcompiler;-DNDEBUG;-lcurand) +endif() + +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -I/ " ) + +add_definitions(-DNO_INJECTION) +add_definitions(-DPROMISE_TUNER_ENABLED) +if(USE_GFLAGS) + add_definitions(-DUSE_GFLAGS) +endif() + +if(USE_AUTOTUNER) + remove_definitions(-DNO_INJECTION) +endif() + + + +include_directories($ENV{CUDNN_PATH} /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/$ENV{CUDNN_PATH}/include) +include_directories(/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/./tensor_runtime/include) +include_directories(/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../gpu_profiler/include) +include_directories(/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../soc_simulator/include) +link_directories($ENV{CUDNN_PATH} /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/$ENV{CUDNN_PATH}/lib /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/$ENV{CUDNN_PATH}/lib64) + + +cuda_add_library(tensor_runtime /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu) +cuda_add_cublas_to_target(tensor_runtime) + +cuda_add_library(tensor_cpu_runtime /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc) + +find_library(GPU_PROFILER_LIB + NAMES libgpu_profiler.a + HINTS /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../gpu_profiler/lib +) + +find_library(SOC_SIMULATOR_LIB + NAMES libpromise_profiler.a + HINTS /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../soc_simulator/lib +) + + +if(USE_GFLAGS) + target_link_libraries(tensor_runtime gflags cudnn -lcurand) +else() + target_link_libraries(tensor_runtime cudnn -lcurand) +endif() + +target_link_libraries(tensor_cpu_runtime) + +# lenet_keras_half_autogenerated_knobs +add_executable(lenet_keras_fp16_perf20 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf20.cc) +target_link_libraries(lenet_keras_fp16_perf20 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf26 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf26.cc) +target_link_libraries(lenet_keras_fp16_perf26 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf22 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf22.cc) +target_link_libraries(lenet_keras_fp16_perf22 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf25 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf25.cc) +target_link_libraries(lenet_keras_fp16_perf25 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf23 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf23.cc) +target_link_libraries(lenet_keras_fp16_perf23 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_samp33 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp33.cc) +target_link_libraries(lenet_keras_fp16_samp33 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf24 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf24.cc) +target_link_libraries(lenet_keras_fp16_perf24 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_samp31 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp31.cc) +target_link_libraries(lenet_keras_fp16_samp31 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf30 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf30.cc) +target_link_libraries(lenet_keras_fp16_perf30 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_samp36 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp36.cc) +target_link_libraries(lenet_keras_fp16_samp36 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf21 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf21.cc) +target_link_libraries(lenet_keras_fp16_perf21 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_samp34 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp34.cc) +target_link_libraries(lenet_keras_fp16_samp34 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_samp32 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp32.cc) +target_link_libraries(lenet_keras_fp16_samp32 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_samp35 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp35.cc) +target_link_libraries(lenet_keras_fp16_samp35 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf29 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf29.cc) +target_link_libraries(lenet_keras_fp16_perf29 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf27 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf27.cc) +target_link_libraries(lenet_keras_fp16_perf27 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +add_executable(lenet_keras_fp16_perf28 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf28.cc) +target_link_libraries(lenet_keras_fp16_perf28 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + + + +# lenet_keras_autogenerated_knobs +add_executable(lenet_keras_fp32_perf20 lenet_keras_autogenerated_knobs/lenet_keras_fp32_perf20.cc) +target_link_libraries(lenet_keras_fp32_perf20 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + + diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16.txt new file mode 100644 index 0000000000000000000000000000000000000000..563d7f4a03b3b3a50e2c08c76616a88ea7958b5a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16.txt @@ -0,0 +1,7 @@ +../dnn_sources/src/half/profiling/alexnet2_cifar10_half_profiling.cc +../dnn_sources/src/half/profiling/alexnet_cifar10_half_profiling.cc +../dnn_sources/src/half/profiling/mobilenet_depthwise_half_profiling.cc +../dnn_sources/src/half/profiling/mobilenet_shallow_depthwise_half_profiling.cc +../dnn_sources/src/half/profiling/resnet18_cifar10_half_profiling.cc +../dnn_sources/src/half/profiling/vgg16_cifar100_half_profiling.cc +../dnn_sources/src/half/profiling/vgg16_cifar10_half_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_first_three.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_first_three.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a0beb250e2241c7523e69b5262cb9ffc977d28d --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_first_three.txt @@ -0,0 +1,3 @@ +../dnn_sources/src/half/profiling/alexnet2_cifar10_half_profiling.cc +../dnn_sources/src/half/profiling/alexnet_cifar10_half_profiling.cc +../dnn_sources/src/half/profiling/resnet18_cifar10_half_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_remainder.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_remainder.txt new file mode 100644 index 0000000000000000000000000000000000000000..20ca95abcf1ee1aab337fa391abb5f1a74583fe1 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_remainder.txt @@ -0,0 +1,4 @@ +../dnn_sources/src/half/profiling/mobilenet_depthwise_half_profiling.cc +../dnn_sources/src/half/profiling/mobilenet_shallow_depthwise_half_profiling.cc +../dnn_sources/src/half/profiling/vgg16_cifar100_half_profiling.cc +../dnn_sources/src/half/profiling/vgg16_cifar10_half_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_sources.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_sources.txt new file mode 100644 index 0000000000000000000000000000000000000000..506497e42889dc1d8bb2465912e87f56464e7ecc --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_sources.txt @@ -0,0 +1 @@ +../dnn_sources/src/half/lenet_keras_half.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32.txt new file mode 100644 index 0000000000000000000000000000000000000000..12b87930416c4269a62f2020a06b42cf5cf4dc13 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32.txt @@ -0,0 +1,9 @@ +../dnn_sources/src/profiling/alexnet2_profiling.cc +../dnn_sources/src/profiling/alexnet_cifar10_profiling.cc +../dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc +../dnn_sources/src/profiling/mobilenet_shallow_profiling.cc +../dnn_sources/src/profiling/mobilenet_depthwise_profiling.cc +../dnn_sources/src/profiling/mobilenet_shallow_depthwise_profiling.cc +../dnn_sources/src/profiling/resnet18_cifar10_profiling.cc +../dnn_sources/src/profiling/vgg16_cifar100_profiling.cc +../dnn_sources/src/profiling/vgg16_cifar10_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_sources.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_sources.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd8f03c30712f0162db2cc8bcf563087be05bf64 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_sources.txt @@ -0,0 +1 @@ +../dnn_sources/src/lenet_keras.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_test.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..a59f773cda240a311c0c873c9366494018b87312 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_test.txt @@ -0,0 +1 @@ +../dnn_sources/src/profiling/mobilenet_shallow_depthwise_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_mobilenet_depth.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_mobilenet_depth.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b7382da3570917c1983ad0c3fe02763d8565635 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_mobilenet_depth.txt @@ -0,0 +1,2 @@ +../dnn_sources/src/profiling/mobilenet_depthwise_profiling.cc +../dnn_sources/src/profiling/mobilenet_shallow_depthwise_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_one_file.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_one_file.txt new file mode 100644 index 0000000000000000000000000000000000000000..32b18d4ca22672be6b44ecb674ea3ad00e18276d --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_one_file.txt @@ -0,0 +1,2 @@ +../dnn_sources/src/half/profiling/vgg16_cifar100_half_profiling.cc +../dnn_sources/src/half/profiling/vgg16_cifar10_half_profiling.cc diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16.txt new file mode 100644 index 0000000000000000000000000000000000000000..207eb1ed1f45ffde7dad0da4e125aa0ceaa5c5cd --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16.txt @@ -0,0 +1,17 @@ +perf,20 1,1,1,1 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,21 1,2,1,0 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,22 1,2,1,1 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,23 1,3,1,0 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,24 1,3,1,1 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,25 1,3,1,2 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,26 2,1,1,0 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,27 2,1,1,1 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,28 3,1,1,0 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,29 3,1,1,1 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,30 3,1,1,2 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,31 1,1,2,0 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,32 1,1,2,1 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,33 1,1,4,0 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,34 1,1,4,1 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,35 1,1,4,2 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,36 1,1,4,3 1.88 tensorHalfConvolution tensorConvApproxHalf diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_knobs_31_36.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_knobs_31_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc76565110cf34ab57024dd852c1a51b23a8f45e --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_knobs_31_36.txt @@ -0,0 +1,6 @@ +samp,31 1,1,2,0 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,32 1,1,2,1 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,33 1,1,4,0 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,34 1,1,4,1 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,35 1,1,4,2 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,36 1,1,4,3 1.88 tensorHalfConvolution tensorConvApproxHalf diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_old.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_old.txt new file mode 100644 index 0000000000000000000000000000000000000000..72c43e61288c532feed94f5768357b3113d5de49 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_old.txt @@ -0,0 +1,18 @@ +perf,20 1,1,0 2.25 tensorHalfConvolution tensorConvPerfCudaHalf +perf,21 1,2,0 2.25 tensorHalfConvolution tensorConvPerfCudaHalf +perf,22 1,2,1 2.25 tensorHalfConvolution tensorConvPerfCudaHalf +perf,23 1,3,0 1.88 tensorHalfConvolution tensorConvPerfCudaHalf +perf,24 1,3,1 1.88 tensorHalfConvolution tensorConvPerfCudaHalf +perf,25 1,3,2 1.88 tensorHalfConvolution tensorConvPerfCudaHalf +perf,26 2,1,0 2.25 tensorHalfConvolution tensorConvPerfCudaHalf +perf,27 2,1,1 2.25 tensorHalfConvolution tensorConvPerfCudaHalf +perf,28 3,1,0 1.88 tensorHalfConvolution tensorConvPerfCudaHalf +perf,29 3,1,1 1.88 tensorHalfConvolution tensorConvPerfCudaHalf +perf,30 3,1,2 1.88 tensorHalfConvolution tensorConvPerfCudaHalf +samp,31 2,0 1.88 tensorHalfConvolution tensorConvInputHalf +samp,32 2,1 1.88 tensorHalfConvolution tensorConvInputHalf +samp,33 4,0 1.88 tensorHalfConvolution tensorConvInputHalf +samp,34 4,1 1.88 tensorHalfConvolution tensorConvInputHalf +samp,35 4,2 1.88 tensorHalfConvolution tensorConvInputHalf +samp,36 4,3 1.88 tensorHalfConvolution tensorConvInputHalf +samp,37 1,1 1.88 tensorHalfConvolution tensorConvInputHalf diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_samp.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_samp.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f0593226f6fbeddda91046e7416fe108bfb6d90 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_samp.txt @@ -0,0 +1,7 @@ +samp,31 2,0 1.88 tensorHalfConvolution tensorConvInputHalf +samp,32 2,1 1.88 tensorHalfConvolution tensorConvInputHalf +samp,33 4,0 1.88 tensorHalfConvolution tensorConvInputHalf +samp,34 4,1 1.88 tensorHalfConvolution tensorConvInputHalf +samp,35 4,2 1.88 tensorHalfConvolution tensorConvInputHalf +samp,36 4,3 1.88 tensorHalfConvolution tensorConvInputHalf +samp,37 1,1 1.88 tensorHalfConvolution tensorConvInputHalf diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_vgg16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_vgg16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a172a4e515ebfd24a51267da8bac2cb5f13ce6c0 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_vgg16.txt @@ -0,0 +1,13 @@ +perf,20 1,1,1,1 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,21 1,2,1,0 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,22 1,2,1,1 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,23 1,3,1,0 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,24 1,3,1,1 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,25 1,3,1,2 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,26 2,1,1,0 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,27 2,1,1,1 2.25 tensorHalfConvolution tensorConvApproxHalf +perf,28 3,1,1,0 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,29 3,1,1,1 1.88 tensorHalfConvolution tensorConvApproxHalf +perf,30 3,1,1,2 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,32 1,1,2,1 1.88 tensorHalfConvolution tensorConvApproxHalf +samp,36 1,1,4,3 1.88 tensorHalfConvolution tensorConvApproxHalf diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32.txt new file mode 100644 index 0000000000000000000000000000000000000000..78f3e361ee8a96c6520793b435815210e1fc7117 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32.txt @@ -0,0 +1,17 @@ +perf,20 1,1,1,1 2.25 tensorConvolution tensorConvApprox +perf,21 1,2,1,0 2.25 tensorConvolution tensorConvApprox +perf,22 1,2,1,1 2.25 tensorConvolution tensorConvApprox +perf,23 1,3,1,0 1.88 tensorConvolution tensorConvApprox +perf,24 1,3,1,1 1.88 tensorConvolution tensorConvApprox +perf,25 1,3,1,2 1.88 tensorConvolution tensorConvApprox +perf,26 2,1,1,0 2.25 tensorConvolution tensorConvApprox +perf,27 2,1,1,1 2.25 tensorConvolution tensorConvApprox +perf,28 3,1,1,0 1.88 tensorConvolution tensorConvApprox +perf,29 3,1,1,1 1.88 tensorConvolution tensorConvApprox +perf,30 3,1,1,2 1.88 tensorConvolution tensorConvApprox +samp,31 1,1,2,0 1.88 tensorConvolution tensorConvApprox +samp,32 1,1,2,1 1.88 tensorConvolution tensorConvApprox +samp,33 1,1,4,0 1.88 tensorConvolution tensorConvApprox +samp,34 1,1,4,1 1.88 tensorConvolution tensorConvApprox +samp,35 1,1,4,2 1.88 tensorConvolution tensorConvApprox +samp,36 1,1,4,3 1.88 tensorConvolution tensorConvApprox diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_baseline.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_baseline.txt new file mode 100644 index 0000000000000000000000000000000000000000..df001ba497d0ed440dd34beead33d607651d3f35 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_baseline.txt @@ -0,0 +1 @@ +perf,20 1,1,1,1 2.25 tensorConvolution tensorConvApprox diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_old.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_old.txt new file mode 100644 index 0000000000000000000000000000000000000000..36a7dbca05ef71b6046a91066acf5382f2a5c7a3 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_old.txt @@ -0,0 +1,11 @@ +perf,20 1,1,0 2.25 tensorConvolution tensorConvPerfCuda +perf,21 1,2,0 2.25 tensorConvolution tensorConvPerfCuda +perf,22 1,2,1 2.25 tensorConvolution tensorConvPerfCuda +perf,23 1,3,0 1.88 tensorConvolution tensorConvPerfCuda +perf,24 1,3,1 1.88 tensorConvolution tensorConvPerfCuda +perf,25 1,3,2 1.88 tensorConvolution tensorConvPerfCuda +perf,26 2,1,0 2.25 tensorConvolution tensorConvPerfCuda +perf,27 2,1,1 2.25 tensorConvolution tensorConvPerfCuda +perf,28 3,1,0 1.88 tensorConvolution tensorConvPerfCuda +perf,29 3,1,1 1.88 tensorConvolution tensorConvPerfCuda +perf,30 3,1,2 1.88 tensorConvolution tensorConvPerfCuda diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_to_fp16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_to_fp16.txt new file mode 100644 index 0000000000000000000000000000000000000000..913397cc4936bf11f3eefa15b5804700865e7b6b --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_to_fp16.txt @@ -0,0 +1 @@ +fp16,12 0 1.5 tensorConvolution tensorHalfConvolution diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_vgg16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_vgg16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fbab7d7b85255cd86748634faea0bf48ed75e42 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_vgg16.txt @@ -0,0 +1,13 @@ +perf,20 1,1,1,1 2.25 tensorConvolution tensorConvApprox +perf,21 1,2,1,0 2.25 tensorConvolution tensorConvApprox +perf,22 1,2,1,1 2.25 tensorConvolution tensorConvApprox +perf,23 1,3,1,0 1.88 tensorConvolution tensorConvApprox +perf,24 1,3,1,1 1.88 tensorConvolution tensorConvApprox +perf,25 1,3,1,2 1.88 tensorConvolution tensorConvApprox +perf,26 2,1,1,0 2.25 tensorConvolution tensorConvApprox +perf,27 2,1,1,1 2.25 tensorConvolution tensorConvApprox +perf,28 3,1,1,0 1.88 tensorConvolution tensorConvApprox +perf,29 3,1,1,1 1.88 tensorConvolution tensorConvApprox +perf,30 3,1,1,2 1.88 tensorConvolution tensorConvApprox +samp,32 1,1,2,1 1.88 tensorConvolution tensorConvApprox +samp,36 1,1,4,3 1.88 tensorConvolution tensorConvApprox diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_test.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..68686b25de1c607e34d75044cd7ff19cf0c8890a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_test.txt @@ -0,0 +1 @@ +fp16,12 0 1.5 tensorHalfConvolution tensorHalfConvolution diff --git a/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py b/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py index f53573f7cde9420400194827d55d84d69e2ace5b..d6c3d63112c83cd9b545914a9a33f4c5b5dae6ce 100644 --- a/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py +++ b/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py @@ -4,8 +4,6 @@ import subprocess import sys class Driver: - fp16_swing = 8 - class PrecisionTypes: FP16 = 0 FP32 = 1 @@ -14,6 +12,7 @@ class Driver: class ApproxTypes: PERF = 3 SAMP = 4 + REDUCE = 5 results_time_key = "Time" results_energy_key = "Energy" @@ -65,7 +64,8 @@ class Driver: return "PERF" elif appr == Driver.ApproxTypes.SAMP: return "SAMP" - + elif appr == Driver.ApproxTypes.REDUCE: + return "REDUCE" def driver(self): self.__parse_tensor_layer_file() @@ -189,7 +189,6 @@ class Driver: curr_conf_results.append((layer_as_lst[1], layer_results)) line = config_file.readline().strip() continue - layer_ind = int(layer_as_lst[0]) - 1 layer_table_data = self.__tensor_layers[layer_ind] layer_name = layer_table_data["Name"] @@ -208,6 +207,8 @@ class Driver: time, energy = self.__run_promise_simulation(param_val, layer_table_data) total_time += time total_energy += energy + print("Curr promise: ", time, energy) + print("Total promise: ", total_time, total_energy) layer_results.append((total_time, total_energy, ' '.join(layer_as_lst[2:]))) elif Driver.is_gpu(layer_as_lst[1]): @@ -227,22 +228,23 @@ class Driver: curr_layer = Driver.PrecisionTypes.FP16 elif line.find("fp32") != -1: curr_layer = Driver.PrecisionTypes.FP32 - if precision_type == "perf" or precision_type == "samp": # Handle approx type + if precision_type == "perf" or precision_type == "samp" or precision_type == "reduce": # Handle approx type if precision_type == "perf": approx_type = Driver.ApproxTypes.PERF elif precision_type == "samp": approx_type = Driver.ApproxTypes.SAMP + elif precision_type == "reduce": + approx_type = Driver.ApproxTypes.REDUCE curr_layer = Driver.PrecisionTypes.FP16 - print(curr_layer, prev_layer) quant_time, quant_energy = self.__quantize(precision_type, op_number, curr_layer, prev_layer, tensor_count, layer_table_data) if quant_time != 0: assert i == 2 #and layer_ind == 0 conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, \ tensor_count, approx_type, op_number) + print(quant_time, conv_time) layer_results.append((quant_time + conv_time, quant_energy + conv_energy, ' '.join(layer_as_lst[i : i + 3]))) prev_layer = curr_layer tensor_count += 1 - line = config_file.readline().strip() prev_layer = curr_layer curr_conf_results.append((layer_as_lst[1], layer_results)) @@ -256,9 +258,8 @@ class Driver: has_quantized = False for layer_ind, (hardware, layer) in enumerate(curr_conf_results): - if len(layer) == 1 and layer[0][2].find("softmax") != -1: continue + if layer[0][2].find("softmax") != -1: continue fp16_layer = [] - #print(layer_ind, hardware, layer) layer_table_data = self.__tensor_layers[layer_ind] layer_name = layer_table_data["Name"] @@ -287,7 +288,8 @@ class Driver: or prev_layer == Driver.PrecisionTypes.PROMISE: return 0.0, 0.0 layer_name = layer_data["Name"] - + print("QUANTIZATION") + print(precision_type, op_number, self.__get_str(curr_layer), self.__get_str(prev_layer), h2f_f2h_operation_ind, layer_data) # NOTE: Ignoring logic where curr == promise or prev == promise bc # smartDMA is always true so we'd return near the beginning of the method @@ -302,17 +304,16 @@ class Driver: else: lookup_key = "_" + precision_type + str(op_number) + "_" - print(curr_layer) if curr_layer == Driver.PrecisionTypes.FP32: time_key = "h2f%stime" % lookup_key energy_key = "h2f%senergy" % lookup_key elif curr_layer == Driver.PrecisionTypes.FP16: time_key = "f2h%stime" % lookup_key energy_key = "f2h%senergy" % lookup_key + print(tensor_op_row) time = tensor_op_row[time_key] energy = tensor_op_row[energy_key] print(time_key, energy_key) - print("Quantization: (%f, %f)" % (time, energy)) return (time, energy) @@ -330,7 +331,7 @@ class Driver: elif Driver.is_fc(layer_name): rows_a = layer_data["RA"] cols_a = layer_data["CA"] - rows_b = cols_ + rows_b = layer_data["RB"] cols_b = layer_data["CB"] else: print("PROMISE can't run whatever this layer is.") @@ -349,18 +350,17 @@ class Driver: def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, \ approx_type = None, knob_number = None): tensor_info = self.__tensor_table[layer_name][tensor_ind] - #print(tensor_info) - #print(layer_name) - #print(tensor_ind) time_key = None energy_key = None - if approx_type == Driver.ApproxTypes.PERF or approx_type == Driver.ApproxTypes.SAMP: # fp16_perf2_energy + if approx_type == Driver.ApproxTypes.PERF or approx_type == Driver.ApproxTypes.SAMP or approx_type == Driver.ApproxTypes.REDUCE: # fp16_perf2_energy approx_type_str = None if approx_type == Driver.ApproxTypes.PERF: approx_type_str = "perf" elif approx_type == Driver.ApproxTypes.SAMP: approx_type_str = "samp" + elif approx_type == Driver.ApproxTypes.REDUCE: + approx_type_str = "reduce" if curr_layer == Driver.PrecisionTypes.FP32: time_key = "fp32_%s%s_time" % (approx_type_str, knob_number) @@ -414,7 +414,7 @@ class Driver: conf_str.append("-----\n") results_file.write('\n'.join(conf_str)) - baseline_conf = None + fp32_baseline_conf = None baseline_total_time = baseline_total_energy = 0 def get_baseline_times_energies(conf): @@ -427,7 +427,7 @@ class Driver: def get_final_times_energies_conf(curr_conf, curr_conf_name): final_time = final_energy = 0 - + final_conf = [] # List (conf) of lists (layers) of tuples (operation data) #for hardware, layer in self.fp16_baseline: @@ -440,24 +440,48 @@ class Driver: final_conf_layer.append((None, None, tensor_op)) continue # layer name, operation name, val name - baseline_time = self.fp16_baseline[layer_ind][1][tensor_ind][0] - baseline_energy = self.fp16_baseline[layer_ind][1][tensor_ind][1] - baseline_op = self.fp16_baseline[layer_ind][1][tensor_ind][2] - #print(baseline_time, baseline_energy, baseline_op) - #print(op_time, tensor_op) + if tensor_op.find("promise") != -1: # compute sum of entire fp16 baseline layer + baseline_time = 0 + baseline_energy = 0 + baseline_op = [] + + if tensor_op.find("fp32") != -1: + assert False + baseline_layer = fp32_baseline_conf[layer_ind][1] + else: + baseline_layer = self.fp16_baseline[layer_ind][1] + + for op_time, op_energy, tensor_op in baseline_layer: + baseline_time += op_time + baseline_energy += op_energy + baseline_op.append(tensor_op) + else: # look at the individual tensor operation as before + if tensor_op.find("fp32") != -1: + assert False + baseline_layer = fp32_baseline_conf[1][layer_ind] + else: + baseline_layer = self.fp16_baseline[layer_ind][1] + baseline_time = baseline_layer[tensor_ind][0] + baseline_energy = baseline_layer[tensor_ind][1] + baseline_op = baseline_layer[tensor_ind][2] + final_tensor_op = tensor_op - #print(op_time > baseline_time) if op_time > baseline_time: - #print("**************** BIGGER ******************") - #print(curr_conf_name) - #print(baseline_time, baseline_energy, baseline_op, layer_ind) - #print(op_time, tensor_op, layer_ind) + print("**************** BIGGER ******************") + print(curr_conf_name) + print(baseline_time, baseline_energy, baseline_op, layer_ind) + print(op_time, tensor_op, layer_ind) final_time += baseline_time final_energy += baseline_energy final_tensor_op = baseline_op else: + print("**************** SMALLER ******************") + print(curr_conf_name) + print(baseline_time, baseline_energy, baseline_op, layer_ind) + print(op_time, tensor_op, layer_ind) final_time += op_time final_energy += op_energy + final_conf_layer.append((None, None, final_tensor_op)) # Don't care about the times and energies when writing final_conf.append((hardware, final_conf_layer)) #print("\n") @@ -470,15 +494,15 @@ class Driver: orig_line_lst = line.split(' ') conf_name = orig_line_lst[0] - if not baseline_conf: - baseline_conf = self.__conf_results[conf_index] #conf_name] - baseline_total_time, baseline_total_energy = get_baseline_times_energies(baseline_conf) + if not fp32_baseline_conf: + fp32_baseline_conf = self.__conf_results[conf_index] #conf_name] + baseline_total_time, baseline_total_energy = get_baseline_times_energies(fp32_baseline_conf) results_file.write("%s\n" % repr(baseline_total_time)) - write_conf_to_file(conf_name, baseline_conf, 1, 1) + write_conf_to_file(conf_name, fp32_baseline_conf, 1, 1) else: curr_conf = self.__conf_results[conf_index] #conf_name] - #final_time, final_energy, = get_baseline_times_energies(curr_conf) final_time, final_energy, curr_conf = get_final_times_energies_conf(curr_conf, conf_name) + print("Baseline time: %f, final time: %f, baseline energy: %f, final energy: %f, rations: %f %f " % (baseline_total_time, final_time, baseline_total_energy, final_energy, baseline_total_time / final_time, baseline_total_energy / final_energy)) write_conf_to_file(conf_name, curr_conf, baseline_total_time / final_time, baseline_total_energy / final_energy) conf_index += 1 results_file.close() diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/autotuner_data/tuner_pareto_confs_batch220.txt new file mode 100644 index 0000000000000000000000000000000000000000..20b92832d433de5c65f50c946c50153e1d3eebc9 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/autotuner_data/tuner_pareto_confs_batch220.txt @@ -0,0 +1,904 @@ ++++++ +conf1 1 0 99.69 0 +1 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 +2 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 +3 gpu mul fp32 1 add fp32 1 tanh fp32 1 +4 gpu mul fp32 1 add fp32 1 tanh fp32 1 +5 gpu softmax fp32 1 +----- ++++++ +conf1 2.01610051566 0 99.400002 0.6899979999999971 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf2 2.01610051566 0 99.040001 0.974998499999991 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf3 2.00016617632 0 99.68 0.4099999999999909 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf4 2.00016617632 0 99.660004 0.42999599999999705 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf5 1.97610564729 0 99.599998 0.4900019999999984 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf6 2.00016617632 0 99.599998 0.4900019999999984 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf7 2.00016617632 0 99.080002 0.9149970000000067 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf8 2.00016617632 0 99.239998 0.6750029999999967 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf9 2.00016617632 0 99.199997 0.7350045000000023 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf10 1.99590274244 0 99.099998 0.8850029999999975 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf11 2.01610051566 0 99.559998 0.5300020000000046 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf12 1.99590274244 0 99.540001 0.549998999999994 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf13 2.00016617632 0 99.639999 0.45000099999999466 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf14 1.99590274244 0 99.580002 0.5099980000000045 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf15 2.01610051566 0 99.099998 0.8850029999999975 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf16 2.01610051566 0 99.160004 0.7949939999999955 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf17 2.00016617632 0 99.379997 0.46500449999999205 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf18 1.99590274244 0 99.639999 0.45000099999999466 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf19 2.01610051566 0 99.580002 0.5099980000000045 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf20 1.97610564729 0 99.660004 0.42999599999999705 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf21 1.99590274244 0 99.440002 0.6499979999999909 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf22 1.99590274244 0 99.260002 0.6449969999999965 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf23 2.00016617632 0 99.360001 0.49499850000000123 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf24 2.01610051566 0 99.32 0.5550000000000068 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf25 2.00016617632 0 99.519997 0.5700029999999942 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf26 1.97610564729 0 99.379997 0.46500449999999205 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf27 2.01610051566 0 99.68 0.4099999999999909 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf28 2.00016617632 0 99.559998 0.5300020000000046 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf29 2.00016617632 0 99.080002 0.9149970000000067 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf30 1.97610564729 0 99.660004 0.42999599999999705 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf31 2.01610051566 0 99.599998 0.4900019999999984 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf32 1.97610564729 0 99.080002 0.9149970000000067 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf33 2.01610051566 0 99.620003 0.4699970000000008 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf34 2.00016617632 0 99.620003 0.4699970000000008 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf35 2.00016617632 0 99.599998 0.4900019999999984 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf36 1.99590274244 0 99.599998 0.4900019999999984 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf37 2.01610051566 0 99.540001 0.549998999999994 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf38 2.00016617632 0 99.339996 0.5250059999999976 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf39 2.00016617632 0 99.599998 0.4900019999999984 +1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf40 1.97610564729 0 99.379997 0.46500449999999205 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf41 2.00016617632 0 99.559998 0.5300020000000046 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf42 1.99590274244 0 99.459999 0.6300010000000015 +1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf43 1.99590274244 0 99.400002 0.6899979999999971 +1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf44 2.00016617632 0 99.599998 0.4900019999999984 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf45 2.01610051566 0 99.599998 0.4900019999999984 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf46 2.01610051566 0 99.080002 0.9149970000000067 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf47 2.01610051566 0 99.660004 0.42999599999999705 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf48 2.00016617632 0 99.639999 0.45000099999999466 +1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf49 2.01610051566 0 99.480003 0.6099970000000013 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf50 2.00016617632 0 98.400002 1.9349969999999956 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf51 2.01610051566 0 98.540001 1.724998499999991 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf52 2.01610051566 0 99.080002 0.9149970000000067 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf53 2.00016617632 0 99.660004 0.42999599999999705 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf54 2.01610051566 0 99.660004 0.42999599999999705 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf55 1.97610564729 0 99.599998 0.4900019999999984 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf56 2.01610051566 0 98.900002 1.1849969999999956 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf57 1.99590274244 0 99.099998 0.8850029999999975 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf58 2.01610051566 0 99.580002 0.5099980000000045 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf59 1.97610564729 0 99.080002 0.9149970000000067 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf60 2.01610051566 0 98.959999 1.0950015000000022 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf61 2.01610051566 0 99.220001 0.7049985000000021 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf62 2.01610051566 0 98.839996 1.2750059999999976 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf63 1.99590274244 0 98.940002 1.1249969999999863 +1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf64 1.97610564729 0 99.379997 0.46500449999999205 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf65 2.00016617632 0 99.559998 0.5300020000000046 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf66 2.00016617632 0 99.239998 0.6750029999999967 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf67 2.01610051566 0 99.459999 0.6300010000000015 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf68 2.00016617632 0 99.360001 0.49499850000000123 +1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf69 2.01610051566 0 99.559998 0.5300020000000046 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf70 1.99590274244 0 99.440002 0.6499979999999909 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf71 2.00016617632 0 99.339996 0.5250059999999976 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf72 2.01610051566 0 99.32 0.5550000000000068 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf73 1.97610564729 0 99.379997 0.46500449999999205 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf74 2.00016617632 0 99.019997 1.0050044999999912 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf75 1.99590274244 0 99.260002 0.6449969999999965 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf76 2.01610051566 0 99.099998 0.8850029999999975 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf77 1.97610564729 0 98.440002 1.8749969999999863 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf78 2.01610051566 0 98.440002 1.8749969999999863 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf79 2.01610051566 0 99.160004 0.7949939999999955 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf80 1.97610564729 0 98.480003 1.814995500000002 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf81 2.00016617632 0 99.360001 0.49499850000000123 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf82 1.97610564729 0 99.660004 0.42999599999999705 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf83 1.99590274244 0 99.540001 0.549998999999994 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf84 2.00016617632 0 99.199997 0.7350045000000023 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf85 1.97610564729 0 98.440002 1.8749969999999863 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf86 2.00016617632 0 99.0 1.0349999999999966 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf87 1.99590274244 0 98.519997 1.7550044999999912 +1 gpu conv samp 35 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf88 2.01610051566 0 99.400002 0.6899979999999971 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf89 2.01610051566 0 97.760002 2.8949969999999965 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf90 2.01610051566 0 99.519997 0.5700029999999942 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf91 2.01610051566 0 99.32 0.5550000000000068 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf92 2.01610051566 0 99.580002 0.5099980000000045 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf93 2.01610051566 0 99.480003 0.6099970000000013 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf94 2.01610051566 0 98.480003 1.814995500000002 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf95 2.01610051566 0 98.540001 1.724998499999991 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf96 2.01610051566 0 97.82 2.805000000000007 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf97 2.01610051566 0 98.959999 1.0950015000000022 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf98 2.01610051566 0 98.459999 1.8450015000000022 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf99 2.01610051566 0 99.660004 0.42999599999999705 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf100 2.01610051566 0 99.620003 0.4699970000000008 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf101 2.01610051566 0 97.699997 2.9850045000000023 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf102 2.01610051566 0 99.040001 0.974998499999991 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf103 2.01610051566 0 98.0 2.5349999999999966 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf104 2.01610051566 0 99.160004 0.7949939999999955 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf105 2.01610051566 0 99.540001 0.549998999999994 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf106 2.01610051566 0 99.519997 0.5700029999999942 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf107 2.01610051566 0 99.099998 0.8850029999999975 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf108 2.01610051566 0 98.120003 2.354995500000001 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf109 2.01610051566 0 99.459999 0.6300010000000015 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf110 2.01610051566 0 99.68 0.4099999999999909 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf111 2.01610051566 0 98.839996 1.2750059999999976 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf112 2.01610051566 0 98.18 2.2649999999999864 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile index 3e4f668a2c157b3c6a2abcea9da19819f6dabaef..578cfc713eef378bfb23222b4ed3e8b1abd7e7d9 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile @@ -1,5 +1,5 @@ DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks -# NOTE: can configure build directory +# NOTE: CHANGE to your BUILD DIRECTORY HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_dsoc/ CC = $(HPVM_BUILD_DIR)/bin/clang++ @@ -22,8 +22,6 @@ DNN_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/dnn_sources/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_RT_SRC_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/src -# -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=0 -# -I $(TENSOR_INCLUDE_DIR) CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(DNN_INCLUDE_DIR) -I $(COMMON_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 LINKER_FLAGS = -lpthread -lOpenCL @@ -34,10 +32,10 @@ OPTFLAGS1 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInP OPTFLAGS2 = -load $(HPVM_LIB_DIR)/InlineTensorCalls.so -inline-tensor-calls -TARGET = $(BUILD_DIR)/$(APP).opt.bc +TARGET = $(BUILD_DIR)/$(APP).final.bc + SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/projects/visc-cpu-rt/visc-rt.ll -#VISC_RT_PATH = $(HPVM_BUILD_DIR)/projects/visc-rt/visc-rt.ll .PRECIOUS: $(BUILD_DIR)/$(APP).ll $(BUILD_DIR)/$(APP).visc.ll @@ -54,17 +52,19 @@ $(BUILD_DIR)/%.visc.ll: $(BUILD_DIR)/%.ll expanded_modules:= $(wildcard *_module.ll) - -#$(wildcard build/_*.ll) - $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.visc.ll $(OPT) $(OPTFLAGS1) $< -o $@ + + +$(BUILD_DIR)/%.linked.bc: $(BUILD_DIR)/%.opt.bc $(CC) -emit-llvm -c $(TENSOR_RT_SRC_DIR)/tensor_cpu_runtime.cc -o $(BUILD_DIR)/tensor_cpu_runtime.bc $(OPT) -always-inline $(BUILD_DIR)/tensor_cpu_runtime.bc -o $(BUILD_DIR)/tensor_cpu_runtime.bc - #LL_FILES = $(shell cd build; find ./ -name "*module.ll") - $(LLVM_LINK) $@ $(shell find ./build -name "*module.ll") $(BUILD_DIR)/tensor_cpu_runtime.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/lenet_tensor_rt.bc - $(OPT) $(OPTFLAGS2) $(BUILD_DIR)/lenet_tensor_rt.bc -o $(BUILD_DIR)/lenet_inline.bc - $(CC) $(BUILD_DIR)/lenet_inline.bc -o $(BUILD_DIR)/lenet_final $(LINKER_FLAGS) + $(LLVM_LINK) $< $(shell find ./build -name "*module.ll") $(BUILD_DIR)/tensor_cpu_runtime.bc $(VISC_RT_PATH) -o $@ + + +$(BUILD_DIR)/%.final.bc: $(BUILD_DIR)/%.linked.bc + $(OPT) $(OPTFLAGS2) $< -o $@ + $(CC) $@ -o $(BUILD_DIR)/$(APP)_final $(LINKER_FLAGS) $(foreach module, $(expanded_modules), $(LLVM_LINK) $(module) $(BUILD_DIR)/tensor_cpu_runtime.bc -o $(BUILD_DIR)/$(module)_linked ${\n} $(OPT) $(OPTFLAGS2) $(BUILD_DIR)/$(module)_linked -o $(BUILD_DIR)/$(module)_inline ${\n} ) @@ -74,3 +74,6 @@ $(BUILD_DIR): clean: rm -rf $(BUILD_DIR) + + + diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt index 2e3185632ca5cd156a599f4e0a7999c16fd4be97..707fd70be086b8961875c2cfd94ba1f41d2ac208 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt @@ -1,896 +1,904 @@ +++++ +conf1 1 0 99.69 0 +1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu mul fp32 1 add fp32 1 tanh fp32 1 +4 gpu mul fp32 1 add fp32 1 tanh fp32 1 +5 gpu softmax fp32 1 +----- ++++++ conf1 2.01610051566 0 99.400002 0.6899979999999971 -1 gpu conv perf 21 add fp32 1 pool_max fp32 1 tanh fp32 1 -2 gpu conv perf 21 add fp32 1 pool_max fp32 1 tanh fp32 1 -3 gpu mul fp16 1 add fp32 1 tanh fp32 1 -4 gpu mul fp16 1 add fp32 1 tanh fp32 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu mul fp16 1 add fp16 1 tanh fp16 1 +4 gpu mul fp16 1 add fp16 1 tanh fp16 1 5 gpu softmax fp32 1 ----- +++++ conf2 2.01610051566 0 99.040001 0.974998499999991 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf3 2.00016617632 0 99.68 0.4099999999999909 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf4 2.00016617632 0 99.660004 0.42999599999999705 -1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf5 1.97610564729 0 99.599998 0.4900019999999984 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf6 2.00016617632 0 99.599998 0.4900019999999984 -1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf7 2.00016617632 0 99.080002 0.9149970000000067 -1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf8 2.00016617632 0 99.239998 0.6750029999999967 -1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf9 2.00016617632 0 99.199997 0.7350045000000023 -1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf10 1.99590274244 0 99.099998 0.8850029999999975 -1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf11 2.01610051566 0 99.559998 0.5300020000000046 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf12 1.99590274244 0 99.540001 0.549998999999994 -1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf13 2.00016617632 0 99.639999 0.45000099999999466 -1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf14 1.99590274244 0 99.580002 0.5099980000000045 -1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf15 2.01610051566 0 99.099998 0.8850029999999975 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf16 2.01610051566 0 99.160004 0.7949939999999955 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf17 2.00016617632 0 99.379997 0.46500449999999205 -1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf18 1.99590274244 0 99.639999 0.45000099999999466 -1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf19 2.01610051566 0 99.580002 0.5099980000000045 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf20 1.97610564729 0 99.660004 0.42999599999999705 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf21 1.99590274244 0 99.440002 0.6499979999999909 -1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf22 1.99590274244 0 99.260002 0.6449969999999965 -1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf23 2.00016617632 0 99.360001 0.49499850000000123 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf24 2.01610051566 0 99.32 0.5550000000000068 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf25 2.00016617632 0 99.519997 0.5700029999999942 -1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf26 1.97610564729 0 99.379997 0.46500449999999205 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf27 2.01610051566 0 99.68 0.4099999999999909 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf28 2.00016617632 0 99.559998 0.5300020000000046 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf29 2.00016617632 0 99.080002 0.9149970000000067 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf30 1.97610564729 0 99.660004 0.42999599999999705 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf31 2.01610051566 0 99.599998 0.4900019999999984 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf32 1.97610564729 0 99.080002 0.9149970000000067 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf33 2.01610051566 0 99.620003 0.4699970000000008 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf34 2.00016617632 0 99.620003 0.4699970000000008 -1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf35 2.00016617632 0 99.599998 0.4900019999999984 -1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf36 1.99590274244 0 99.599998 0.4900019999999984 -1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf37 2.01610051566 0 99.540001 0.549998999999994 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf38 2.00016617632 0 99.339996 0.5250059999999976 -1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf39 2.00016617632 0 99.599998 0.4900019999999984 -1 gpu conv perf 24 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf40 1.97610564729 0 99.379997 0.46500449999999205 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf41 2.00016617632 0 99.559998 0.5300020000000046 -1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf42 1.99590274244 0 99.459999 0.6300010000000015 -1 gpu conv samp 34 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf43 1.99590274244 0 99.400002 0.6899979999999971 -1 gpu conv samp 34 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf44 2.00016617632 0 99.599998 0.4900019999999984 -1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf45 2.01610051566 0 99.599998 0.4900019999999984 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf46 2.01610051566 0 99.080002 0.9149970000000067 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf47 2.01610051566 0 99.660004 0.42999599999999705 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf48 2.00016617632 0 99.639999 0.45000099999999466 -1 gpu conv perf 24 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf49 2.01610051566 0 99.480003 0.6099970000000013 -1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf50 2.00016617632 0 98.400002 1.9349969999999956 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf51 2.01610051566 0 98.540001 1.724998499999991 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf52 2.01610051566 0 99.080002 0.9149970000000067 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf53 2.00016617632 0 99.660004 0.42999599999999705 -1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf54 2.01610051566 0 99.660004 0.42999599999999705 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf55 1.97610564729 0 99.599998 0.4900019999999984 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf56 2.01610051566 0 98.900002 1.1849969999999956 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf57 1.99590274244 0 99.099998 0.8850029999999975 -1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf58 2.01610051566 0 99.580002 0.5099980000000045 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf59 1.97610564729 0 99.080002 0.9149970000000067 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf60 2.01610051566 0 98.959999 1.0950015000000022 -1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf61 2.01610051566 0 99.220001 0.7049985000000021 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf62 2.01610051566 0 98.839996 1.2750059999999976 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf63 1.99590274244 0 98.940002 1.1249969999999863 -1 gpu conv samp 34 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf64 1.97610564729 0 99.379997 0.46500449999999205 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf65 2.00016617632 0 99.559998 0.5300020000000046 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf66 2.00016617632 0 99.239998 0.6750029999999967 -1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf67 2.01610051566 0 99.459999 0.6300010000000015 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf68 2.00016617632 0 99.360001 0.49499850000000123 -1 gpu conv perf 24 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf69 2.01610051566 0 99.559998 0.5300020000000046 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf70 1.99590274244 0 99.440002 0.6499979999999909 -1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf71 2.00016617632 0 99.339996 0.5250059999999976 -1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf72 2.01610051566 0 99.32 0.5550000000000068 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf73 1.97610564729 0 99.379997 0.46500449999999205 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf74 2.00016617632 0 99.019997 1.0050044999999912 -1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf75 1.99590274244 0 99.260002 0.6449969999999965 -1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf76 2.01610051566 0 99.099998 0.8850029999999975 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf77 1.97610564729 0 98.440002 1.8749969999999863 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf78 2.01610051566 0 98.440002 1.8749969999999863 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf79 2.01610051566 0 99.160004 0.7949939999999955 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf80 1.97610564729 0 98.480003 1.814995500000002 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf81 2.00016617632 0 99.360001 0.49499850000000123 -1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf82 1.97610564729 0 99.660004 0.42999599999999705 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf83 1.99590274244 0 99.540001 0.549998999999994 -1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf84 2.00016617632 0 99.199997 0.7350045000000023 -1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf85 1.97610564729 0 98.440002 1.8749969999999863 -1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf86 2.00016617632 0 99.0 1.0349999999999966 -1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf87 1.99590274244 0 98.519997 1.7550044999999912 -1 gpu conv samp 35 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 35 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf88 2.01610051566 0 99.400002 0.6899979999999971 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf89 2.01610051566 0 97.760002 2.8949969999999965 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf90 2.01610051566 0 99.519997 0.5700029999999942 -1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf91 2.01610051566 0 99.32 0.5550000000000068 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf92 2.01610051566 0 99.580002 0.5099980000000045 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf93 2.01610051566 0 99.480003 0.6099970000000013 -1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf94 2.01610051566 0 98.480003 1.814995500000002 -1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf95 2.01610051566 0 98.540001 1.724998499999991 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf96 2.01610051566 0 97.82 2.805000000000007 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf97 2.01610051566 0 98.959999 1.0950015000000022 -1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf98 2.01610051566 0 98.459999 1.8450015000000022 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf99 2.01610051566 0 99.660004 0.42999599999999705 -1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf100 2.01610051566 0 99.620003 0.4699970000000008 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf101 2.01610051566 0 97.699997 2.9850045000000023 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf102 2.01610051566 0 99.040001 0.974998499999991 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf103 2.01610051566 0 98.0 2.5349999999999966 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf104 2.01610051566 0 99.160004 0.7949939999999955 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf105 2.01610051566 0 99.540001 0.549998999999994 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf106 2.01610051566 0 99.519997 0.5700029999999942 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf107 2.01610051566 0 99.099998 0.8850029999999975 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf108 2.01610051566 0 98.120003 2.354995500000001 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf109 2.01610051566 0 99.459999 0.6300010000000015 -1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf110 2.01610051566 0 99.68 0.4099999999999909 -1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf111 2.01610051566 0 98.839996 1.2750059999999976 -1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- +++++ conf112 2.01610051566 0 98.18 2.2649999999999864 -1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 -2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 +1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 -5 gpu softmax fp16 1 +5 gpu softmax fp32 1 ----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt index 948efe5bd7586727c5fe4fa7ccc73e7319bf97d6..4a14a5f2e45c83a2960deccbcd0296a6d9a2f2bc 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt @@ -1,87 +1,87 @@ +++++ conf1 1 0 83.5 0 1 gpu conv fp32 1 -2 gpu batchnorm fp16 1 -3 gpu relu fp16 1 -4 gpu group_conv fp16 1 -5 gpu batchnorm fp16 1 -6 gpu relu fp16 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 7 gpu conv fp32 1 -8 gpu batchnorm fp16 1 -9 gpu relu fp16 1 -10 gpu group_conv fp16 1 -11 gpu batchnorm fp16 1 -12 gpu relu fp16 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 13 gpu conv fp32 1 -14 gpu batchnorm fp16 1 -15 gpu relu fp16 1 -16 gpu group_conv fp16 1 -17 gpu batchnorm fp16 1 -18 gpu relu fp16 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 19 gpu conv fp32 1 -20 gpu batchnorm fp16 1 -21 gpu relu fp16 1 -22 gpu group_conv fp16 1 -23 gpu batchnorm fp16 1 -24 gpu relu fp16 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 25 gpu conv fp32 1 -26 gpu batchnorm fp16 1 -27 gpu relu fp16 1 -28 gpu group_conv fp16 1 -29 gpu batchnorm fp16 1 -30 gpu relu fp16 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 31 gpu conv fp32 1 -32 gpu batchnorm fp16 1 -33 gpu relu fp16 1 -34 gpu group_conv fp16 1 -35 gpu batchnorm fp16 1 -36 gpu relu fp16 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 37 gpu conv fp32 1 -38 gpu batchnorm fp16 1 -39 gpu relu fp16 1 -40 gpu group_conv fp16 1 -41 gpu batchnorm fp16 1 -42 gpu relu fp16 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu group_conv fp32 1 +41 gpu batchnorm fp32 1 +42 gpu relu fp32 1 43 gpu conv fp32 1 -44 gpu batchnorm fp16 1 -45 gpu relu fp16 1 -46 gpu group_conv fp16 1 -47 gpu batchnorm fp16 1 -48 gpu relu fp16 1 +44 gpu batchnorm fp32 1 +45 gpu relu fp32 1 +46 gpu group_conv fp32 1 +47 gpu batchnorm fp32 1 +48 gpu relu fp32 1 49 gpu conv fp32 1 -50 gpu batchnorm fp16 1 -51 gpu relu fp16 1 -52 gpu group_conv fp16 1 -53 gpu batchnorm fp16 1 -54 gpu relu fp16 1 +50 gpu batchnorm fp32 1 +51 gpu relu fp32 1 +52 gpu group_conv fp32 1 +53 gpu batchnorm fp32 1 +54 gpu relu fp32 1 55 gpu conv fp32 1 -56 gpu batchnorm fp16 1 -57 gpu relu fp16 1 -58 gpu group_conv fp16 1 -59 gpu batchnorm fp16 1 -60 gpu relu fp16 1 +56 gpu batchnorm fp32 1 +57 gpu relu fp32 1 +58 gpu group_conv fp32 1 +59 gpu batchnorm fp32 1 +60 gpu relu fp32 1 61 gpu conv fp32 1 -62 gpu batchnorm fp16 1 -63 gpu relu fp16 1 -64 gpu group_conv fp16 1 -65 gpu batchnorm fp16 1 -66 gpu relu fp16 1 +62 gpu batchnorm fp32 1 +63 gpu relu fp32 1 +64 gpu group_conv fp32 1 +65 gpu batchnorm fp32 1 +66 gpu relu fp32 1 67 gpu conv fp32 1 -68 gpu batchnorm fp16 1 -69 gpu relu fp16 1 -70 gpu group_conv fp16 1 -71 gpu batchnorm fp16 1 -72 gpu relu fp16 1 +68 gpu batchnorm fp32 1 +69 gpu relu fp32 1 +70 gpu group_conv fp32 1 +71 gpu batchnorm fp32 1 +72 gpu relu fp32 1 73 gpu conv fp32 1 -74 gpu batchnorm fp16 1 -75 gpu relu fp16 1 -76 gpu group_conv fp16 1 -77 gpu batchnorm fp16 1 -78 gpu relu fp16 1 +74 gpu batchnorm fp32 1 +75 gpu relu fp32 1 +76 gpu group_conv fp32 1 +77 gpu batchnorm fp32 1 +78 gpu relu fp32 1 79 gpu conv fp32 1 -80 gpu batchnorm fp16 1 -81 gpu relu fp16 1 -82 gpu pool_mean fp16 1 +80 gpu batchnorm fp32 1 +81 gpu relu fp32 1 +82 gpu pool_mean fp32 1 83 gpu mul fp32 1 add fp32 1 84 gpu softmax fp32 1 ----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt index 66833d06b3af9ad7c4bcefdbea9c2e977eeea378..86b061f3d9ff5b75a9580ae65afd9ff6c20f9701 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt @@ -1,87 +1,87 @@ +++++ conf1 1 0 83.5 0 1 gpu conv fp32 1 -2 gpu batchnorm fp16 1 -3 gpu relu fp16 1 -4 gpu group_conv fp16 1 -5 gpu batchnorm fp16 1 -6 gpu relu fp16 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 7 gpu conv fp32 1 -8 gpu batchnorm fp16 1 -9 gpu relu fp16 1 -10 gpu group_conv fp16 1 -11 gpu batchnorm fp16 1 -12 gpu relu fp16 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 13 gpu conv fp32 1 -14 gpu batchnorm fp16 1 -15 gpu relu fp16 1 -16 gpu group_conv fp16 1 -17 gpu batchnorm fp16 1 -18 gpu relu fp16 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 19 gpu conv fp32 1 -20 gpu batchnorm fp16 1 -21 gpu relu fp16 1 -22 gpu group_conv fp16 1 -23 gpu batchnorm fp16 1 -24 gpu relu fp16 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 25 gpu conv fp32 1 -26 gpu batchnorm fp16 1 -27 gpu relu fp16 1 -28 gpu group_conv fp16 1 -29 gpu batchnorm fp16 1 -30 gpu relu fp16 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 31 gpu conv fp32 1 -32 gpu batchnorm fp16 1 -33 gpu relu fp16 1 -34 gpu group_conv fp16 1 -35 gpu batchnorm fp16 1 -36 gpu relu fp16 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 37 gpu conv fp32 1 -38 gpu batchnorm fp16 1 -39 gpu relu fp16 1 -40 gpu group_conv fp16 1 -41 gpu batchnorm fp16 1 -42 gpu relu fp16 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu group_conv fp32 1 +41 gpu batchnorm fp32 1 +42 gpu relu fp32 1 43 gpu conv fp32 1 -44 gpu batchnorm fp16 1 -45 gpu relu fp16 1 -46 gpu group_conv fp16 1 -47 gpu batchnorm fp16 1 -48 gpu relu fp16 1 +44 gpu batchnorm fp32 1 +45 gpu relu fp32 1 +46 gpu group_conv fp32 1 +47 gpu batchnorm fp32 1 +48 gpu relu fp32 1 49 gpu conv fp32 1 -50 gpu batchnorm fp16 1 -51 gpu relu fp16 1 -52 gpu group_conv fp16 1 -53 gpu batchnorm fp16 1 -54 gpu relu fp16 1 +50 gpu batchnorm fp32 1 +51 gpu relu fp32 1 +52 gpu group_conv fp32 1 +53 gpu batchnorm fp32 1 +54 gpu relu fp32 1 55 gpu conv fp32 1 -56 gpu batchnorm fp16 1 -57 gpu relu fp16 1 -58 gpu group_conv fp16 1 -59 gpu batchnorm fp16 1 -60 gpu relu fp16 1 +56 gpu batchnorm fp32 1 +57 gpu relu fp32 1 +58 gpu group_conv fp32 1 +59 gpu batchnorm fp32 1 +60 gpu relu fp32 1 61 gpu conv fp32 1 -62 gpu batchnorm fp16 1 -63 gpu relu fp16 1 -64 gpu group_conv fp16 1 -65 gpu batchnorm fp16 1 -66 gpu relu fp16 1 +62 gpu batchnorm fp32 1 +63 gpu relu fp32 1 +64 gpu group_conv fp32 1 +65 gpu batchnorm fp32 1 +66 gpu relu fp32 1 67 gpu conv fp32 1 -68 gpu batchnorm fp16 1 -69 gpu relu fp16 1 -70 gpu group_conv fp16 1 -71 gpu batchnorm fp16 1 -72 gpu relu fp16 1 +68 gpu batchnorm fp32 1 +69 gpu relu fp32 1 +70 gpu group_conv fp32 1 +71 gpu batchnorm fp32 1 +72 gpu relu fp32 1 73 gpu conv fp32 1 -74 gpu batchnorm fp16 1 -75 gpu relu fp16 1 -76 gpu group_conv fp16 1 -77 gpu batchnorm fp16 1 -78 gpu relu fp16 1 +74 gpu batchnorm fp32 1 +75 gpu relu fp32 1 +76 gpu group_conv fp32 1 +77 gpu batchnorm fp32 1 +78 gpu relu fp32 1 79 gpu conv fp32 1 -80 gpu batchnorm fp16 1 -81 gpu relu fp16 1 -82 gpu pool_mean fp16 1 +80 gpu batchnorm fp32 1 +81 gpu relu fp32 1 +82 gpu pool_mean fp32 1 83 gpu mul fp32 1 add fp32 1 84 gpu softmax fp32 1 ----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt index baffc185452ce288432fa55e3d8ad7ced9ff44d2..3b628d570fcb1884cfa10371a2aaf6856a652d1e 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt @@ -1,88 +1,88 @@ +++++ -conf1 4.15413017186 0 83.163334475 0.5049982875000012 -1 gpu conv fp16 1 -2 gpu batchnorm fp16 1 -3 gpu relu fp16 1 -4 gpu group_conv fp16 1 -5 gpu batchnorm fp16 1 -6 gpu relu fp16 1 -7 gpu conv fp16 1 -8 gpu batchnorm fp16 1 -9 gpu relu fp16 1 -10 gpu group_conv fp16 1 -11 gpu batchnorm fp16 1 -12 gpu relu fp16 1 -13 promise swing_level 5 -14 gpu batchnorm fp16 1 -15 gpu relu fp16 1 -16 gpu group_conv fp16 1 -17 gpu batchnorm fp16 1 -18 gpu relu fp16 1 -19 gpu conv fp16 1 -20 gpu batchnorm fp16 1 -21 gpu relu fp16 1 -22 gpu group_conv fp16 1 -23 gpu batchnorm fp16 1 -24 gpu relu fp16 1 -25 promise swing_level 7 -26 gpu batchnorm fp16 1 -27 gpu relu fp16 1 -28 gpu group_conv fp16 1 -29 gpu batchnorm fp16 1 -30 gpu relu fp16 1 -31 gpu conv fp16 1 -32 gpu batchnorm fp16 1 -33 gpu relu fp16 1 -34 gpu group_conv fp16 1 -35 gpu batchnorm fp16 1 -36 gpu relu fp16 1 -37 promise swing_level 5 -38 gpu batchnorm fp16 1 -39 gpu relu fp16 1 -40 gpu group_conv fp16 1 -41 gpu batchnorm fp16 1 -42 gpu relu fp16 1 -43 gpu conv fp16 1 -44 gpu batchnorm fp16 1 -45 gpu relu fp16 1 -46 gpu group_conv fp16 1 -47 gpu batchnorm fp16 1 -48 gpu relu fp16 1 -49 gpu conv perf 25 -50 gpu batchnorm fp16 1 -51 gpu relu fp16 1 -52 gpu group_conv fp16 1 -53 gpu batchnorm fp16 1 -54 gpu relu fp16 1 -55 gpu conv perf 24 -56 gpu batchnorm fp16 1 -57 gpu relu fp16 1 -58 gpu group_conv fp16 1 -59 gpu batchnorm fp16 1 -60 gpu relu fp16 1 -61 promise swing_level 5 -62 gpu batchnorm fp16 1 -63 gpu relu fp16 1 -64 gpu group_conv fp16 1 -65 gpu batchnorm fp16 1 -66 gpu relu fp16 1 -67 gpu conv fp16 1 -68 gpu batchnorm fp16 1 -69 gpu relu fp16 1 -70 gpu group_conv fp16 1 -71 gpu batchnorm fp16 1 -72 gpu relu fp16 1 -73 promise swing_level 6 -74 gpu batchnorm fp16 1 -75 gpu relu fp16 1 -76 gpu group_conv fp16 1 -77 gpu batchnorm fp16 1 -78 gpu relu fp16 1 -79 promise swing_level 5 -80 gpu batchnorm fp16 1 -81 gpu relu fp16 1 -82 gpu pool_mean fp16 1 -83 promise swing_level 3 +conf1 1 0 83.5 0 +1 gpu conv fp32 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 +7 gpu conv fp32 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 +13 gpu conv fp32 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 +25 gpu conv fp32 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 +31 gpu conv fp32 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 +37 gpu conv fp32 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu group_conv fp32 1 +41 gpu batchnorm fp32 1 +42 gpu relu fp32 1 +43 gpu conv fp32 1 +44 gpu batchnorm fp32 1 +45 gpu relu fp32 1 +46 gpu group_conv fp32 1 +47 gpu batchnorm fp32 1 +48 gpu relu fp32 1 +49 gpu conv fp32 1 +50 gpu batchnorm fp32 1 +51 gpu relu fp32 1 +52 gpu group_conv fp32 1 +53 gpu batchnorm fp32 1 +54 gpu relu fp32 1 +55 gpu conv fp32 1 +56 gpu batchnorm fp32 1 +57 gpu relu fp32 1 +58 gpu group_conv fp32 1 +59 gpu batchnorm fp32 1 +60 gpu relu fp32 1 +61 gpu conv fp32 1 +62 gpu batchnorm fp32 1 +63 gpu relu fp32 1 +64 gpu group_conv fp32 1 +65 gpu batchnorm fp32 1 +66 gpu relu fp32 1 +67 gpu conv fp32 1 +68 gpu batchnorm fp32 1 +69 gpu relu fp32 1 +70 gpu group_conv fp32 1 +71 gpu batchnorm fp32 1 +72 gpu relu fp32 1 +73 gpu conv fp32 1 +74 gpu batchnorm fp32 1 +75 gpu relu fp32 1 +76 gpu group_conv fp32 1 +77 gpu batchnorm fp32 1 +78 gpu relu fp32 1 +79 gpu conv fp32 1 +80 gpu batchnorm fp32 1 +81 gpu relu fp32 1 +82 gpu pool_mean fp32 1 +83 gpu mul fp32 1 add fp32 1 84 gpu softmax fp32 1 ----- +++++ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt index fa5a561bf6fd17c4b2ce372884ac02524ce135f5..ff7fdbf108c1cbca0154d6c300cd3ebbdaf7cd6d 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt @@ -1,87 +1,87 @@ +++++ conf1 1 0 83.5 0 1 gpu conv fp32 1 -2 gpu batchnorm fp16 1 -3 gpu relu fp16 1 -4 gpu group_conv fp16 1 -5 gpu batchnorm fp16 1 -6 gpu relu fp16 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 7 gpu conv fp32 1 -8 gpu batchnorm fp16 1 -9 gpu relu fp16 1 -10 gpu group_conv fp16 1 -11 gpu batchnorm fp16 1 -12 gpu relu fp16 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 13 gpu conv fp32 1 -14 gpu batchnorm fp16 1 -15 gpu relu fp16 1 -16 gpu group_conv fp16 1 -17 gpu batchnorm fp16 1 -18 gpu relu fp16 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 19 gpu conv fp32 1 -20 gpu batchnorm fp16 1 -21 gpu relu fp16 1 -22 gpu group_conv fp16 1 -23 gpu batchnorm fp16 1 -24 gpu relu fp16 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 25 gpu conv fp32 1 -26 gpu batchnorm fp16 1 -27 gpu relu fp16 1 -28 gpu group_conv fp16 1 -29 gpu batchnorm fp16 1 -30 gpu relu fp16 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 31 gpu conv fp32 1 -32 gpu batchnorm fp16 1 -33 gpu relu fp16 1 -34 gpu group_conv fp16 1 -35 gpu batchnorm fp16 1 -36 gpu relu fp16 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 37 gpu conv fp32 1 -38 gpu batchnorm fp16 1 -39 gpu relu fp16 1 -40 gpu group_conv fp16 1 -41 gpu batchnorm fp16 1 -42 gpu relu fp16 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu group_conv fp32 1 +41 gpu batchnorm fp32 1 +42 gpu relu fp32 1 43 gpu conv fp32 1 -44 gpu batchnorm fp16 1 -45 gpu relu fp16 1 -46 gpu group_conv fp16 1 -47 gpu batchnorm fp16 1 -48 gpu relu fp16 1 +44 gpu batchnorm fp32 1 +45 gpu relu fp32 1 +46 gpu group_conv fp32 1 +47 gpu batchnorm fp32 1 +48 gpu relu fp32 1 49 gpu conv fp32 1 -50 gpu batchnorm fp16 1 -51 gpu relu fp16 1 -52 gpu group_conv fp16 1 -53 gpu batchnorm fp16 1 -54 gpu relu fp16 1 +50 gpu batchnorm fp32 1 +51 gpu relu fp32 1 +52 gpu group_conv fp32 1 +53 gpu batchnorm fp32 1 +54 gpu relu fp32 1 55 gpu conv fp32 1 -56 gpu batchnorm fp16 1 -57 gpu relu fp16 1 -58 gpu group_conv fp16 1 -59 gpu batchnorm fp16 1 -60 gpu relu fp16 1 +56 gpu batchnorm fp32 1 +57 gpu relu fp32 1 +58 gpu group_conv fp32 1 +59 gpu batchnorm fp32 1 +60 gpu relu fp32 1 61 gpu conv fp32 1 -62 gpu batchnorm fp16 1 -63 gpu relu fp16 1 -64 gpu group_conv fp16 1 -65 gpu batchnorm fp16 1 -66 gpu relu fp16 1 +62 gpu batchnorm fp32 1 +63 gpu relu fp32 1 +64 gpu group_conv fp32 1 +65 gpu batchnorm fp32 1 +66 gpu relu fp32 1 67 gpu conv fp32 1 -68 gpu batchnorm fp16 1 -69 gpu relu fp16 1 -70 gpu group_conv fp16 1 -71 gpu batchnorm fp16 1 -72 gpu relu fp16 1 +68 gpu batchnorm fp32 1 +69 gpu relu fp32 1 +70 gpu group_conv fp32 1 +71 gpu batchnorm fp32 1 +72 gpu relu fp32 1 73 gpu conv fp32 1 -74 gpu batchnorm fp16 1 -75 gpu relu fp16 1 -76 gpu group_conv fp16 1 -77 gpu batchnorm fp16 1 -78 gpu relu fp16 1 +74 gpu batchnorm fp32 1 +75 gpu relu fp32 1 +76 gpu group_conv fp32 1 +77 gpu batchnorm fp32 1 +78 gpu relu fp32 1 79 gpu conv fp32 1 -80 gpu batchnorm fp16 1 -81 gpu relu fp16 1 -82 gpu pool_mean fp16 1 +80 gpu batchnorm fp32 1 +81 gpu relu fp32 1 +82 gpu pool_mean fp32 1 83 gpu mul fp32 1 add fp32 1 84 gpu softmax fp32 1 ----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt index bf55690f22ad5f9a3de72bd16d4907d8099512a9..04d1491bc7ddcfd94ce837cc830fa0874496842e 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt @@ -1,87 +1,87 @@ +++++ conf1 1 0 83.5 0 1 gpu conv fp32 1 -2 gpu batchnorm fp16 1 -3 gpu relu fp16 1 -4 gpu group_conv fp16 1 -5 gpu batchnorm fp16 1 -6 gpu relu fp16 1 +2 gpu batchnorm fp32 1 +3 gpu relu fp32 1 +4 gpu group_conv fp32 1 +5 gpu batchnorm fp32 1 +6 gpu relu fp32 1 7 gpu conv fp32 1 -8 gpu batchnorm fp16 1 -9 gpu relu fp16 1 -10 gpu group_conv fp16 1 -11 gpu batchnorm fp16 1 -12 gpu relu fp16 1 +8 gpu batchnorm fp32 1 +9 gpu relu fp32 1 +10 gpu group_conv fp32 1 +11 gpu batchnorm fp32 1 +12 gpu relu fp32 1 13 gpu conv fp32 1 -14 gpu batchnorm fp16 1 -15 gpu relu fp16 1 -16 gpu group_conv fp16 1 -17 gpu batchnorm fp16 1 -18 gpu relu fp16 1 +14 gpu batchnorm fp32 1 +15 gpu relu fp32 1 +16 gpu group_conv fp32 1 +17 gpu batchnorm fp32 1 +18 gpu relu fp32 1 19 gpu conv fp32 1 -20 gpu batchnorm fp16 1 -21 gpu relu fp16 1 -22 gpu group_conv fp16 1 -23 gpu batchnorm fp16 1 -24 gpu relu fp16 1 +20 gpu batchnorm fp32 1 +21 gpu relu fp32 1 +22 gpu group_conv fp32 1 +23 gpu batchnorm fp32 1 +24 gpu relu fp32 1 25 gpu conv fp32 1 -26 gpu batchnorm fp16 1 -27 gpu relu fp16 1 -28 gpu group_conv fp16 1 -29 gpu batchnorm fp16 1 -30 gpu relu fp16 1 +26 gpu batchnorm fp32 1 +27 gpu relu fp32 1 +28 gpu group_conv fp32 1 +29 gpu batchnorm fp32 1 +30 gpu relu fp32 1 31 gpu conv fp32 1 -32 gpu batchnorm fp16 1 -33 gpu relu fp16 1 -34 gpu group_conv fp16 1 -35 gpu batchnorm fp16 1 -36 gpu relu fp16 1 +32 gpu batchnorm fp32 1 +33 gpu relu fp32 1 +34 gpu group_conv fp32 1 +35 gpu batchnorm fp32 1 +36 gpu relu fp32 1 37 gpu conv fp32 1 -38 gpu batchnorm fp16 1 -39 gpu relu fp16 1 -40 gpu group_conv fp16 1 -41 gpu batchnorm fp16 1 -42 gpu relu fp16 1 +38 gpu batchnorm fp32 1 +39 gpu relu fp32 1 +40 gpu group_conv fp32 1 +41 gpu batchnorm fp32 1 +42 gpu relu fp32 1 43 gpu conv fp32 1 -44 gpu batchnorm fp16 1 -45 gpu relu fp16 1 -46 gpu group_conv fp16 1 -47 gpu batchnorm fp16 1 -48 gpu relu fp16 1 +44 gpu batchnorm fp32 1 +45 gpu relu fp32 1 +46 gpu group_conv fp32 1 +47 gpu batchnorm fp32 1 +48 gpu relu fp32 1 49 gpu conv fp32 1 -50 gpu batchnorm fp16 1 -51 gpu relu fp16 1 -52 gpu group_conv fp16 1 -53 gpu batchnorm fp16 1 -54 gpu relu fp16 1 +50 gpu batchnorm fp32 1 +51 gpu relu fp32 1 +52 gpu group_conv fp32 1 +53 gpu batchnorm fp32 1 +54 gpu relu fp32 1 55 gpu conv fp32 1 -56 gpu batchnorm fp16 1 -57 gpu relu fp16 1 -58 gpu group_conv fp16 1 -59 gpu batchnorm fp16 1 -60 gpu relu fp16 1 +56 gpu batchnorm fp32 1 +57 gpu relu fp32 1 +58 gpu group_conv fp32 1 +59 gpu batchnorm fp32 1 +60 gpu relu fp32 1 61 gpu conv fp32 1 -62 gpu batchnorm fp16 1 -63 gpu relu fp16 1 -64 gpu group_conv fp16 1 -65 gpu batchnorm fp16 1 -66 gpu relu fp16 1 +62 gpu batchnorm fp32 1 +63 gpu relu fp32 1 +64 gpu group_conv fp32 1 +65 gpu batchnorm fp32 1 +66 gpu relu fp32 1 67 gpu conv fp32 1 -68 gpu batchnorm fp16 1 -69 gpu relu fp16 1 -70 gpu group_conv fp16 1 -71 gpu batchnorm fp16 1 -72 gpu relu fp16 1 +68 gpu batchnorm fp32 1 +69 gpu relu fp32 1 +70 gpu group_conv fp32 1 +71 gpu batchnorm fp32 1 +72 gpu relu fp32 1 73 gpu conv fp32 1 -74 gpu batchnorm fp16 1 -75 gpu relu fp16 1 -76 gpu group_conv fp16 1 -77 gpu batchnorm fp16 1 -78 gpu relu fp16 1 +74 gpu batchnorm fp32 1 +75 gpu relu fp32 1 +76 gpu group_conv fp32 1 +77 gpu batchnorm fp32 1 +78 gpu relu fp32 1 79 gpu conv fp32 1 -80 gpu batchnorm fp16 1 -81 gpu relu fp16 1 -82 gpu pool_mean fp16 1 +80 gpu batchnorm fp32 1 +81 gpu relu fp32 1 +82 gpu pool_mean fp32 1 83 gpu mul fp32 1 add fp32 1 84 gpu softmax fp32 1 ----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt index 99aac992148120d1b4c9937b4d5464a137806d4a..5a0463b97eb4e36e097cfcef5383474e85ab6076 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt @@ -1,46 +1,46 @@ +++++ conf1 1 0 89.4 0 -1 gpu conv fp32 1 add fp32 1 relu fp32 1 -2 gpu conv fp32 1 add fp32 1 relu fp32 1 -3 gpu conv fp32 1 add fp32 1 -4 gpu add fp16 1 -5 gpu relu fp16 1 -6 gpu conv fp32 1 add fp32 1 relu fp32 1 -7 gpu conv fp32 1 add fp32 1 -8 gpu add fp16 1 -9 gpu relu fp16 1 -10 gpu conv fp32 1 add fp32 1 relu fp32 1 -11 gpu conv fp32 1 add fp32 1 -12 gpu add fp16 1 -13 gpu relu fp16 1 -14 gpu conv fp32 1 add fp32 1 relu fp32 1 -15 gpu conv fp32 1 add fp32 1 -16 gpu conv fp32 1 add fp32 1 -17 gpu add fp16 1 -18 gpu relu fp16 1 -19 gpu conv fp32 1 add fp32 1 relu fp32 1 -20 gpu conv fp32 1 add fp32 1 -21 gpu add fp16 1 -22 gpu relu fp16 1 -23 gpu conv fp32 1 add fp32 1 relu fp32 1 -24 gpu conv fp32 1 add fp32 1 -25 gpu add fp16 1 -26 gpu relu fp16 1 -27 gpu conv fp32 1 add fp32 1 relu fp32 1 -28 gpu conv fp32 1 add fp32 1 -29 gpu conv fp32 1 add fp32 1 -30 gpu add fp16 1 -31 gpu relu fp16 1 -32 gpu conv fp32 1 add fp32 1 relu fp32 1 -33 gpu conv fp32 1 add fp32 1 -34 gpu add fp16 1 -35 gpu relu fp16 1 -36 gpu conv fp32 1 add fp32 1 relu fp32 1 -37 gpu conv fp32 1 add fp32 1 -38 gpu add fp16 1 -39 gpu relu fp16 1 -40 gpu pool_max fp16 1 -41 gpu mul fp32 1 add fp32 1 +1 gpu conv fp32 1 add fp32 1 relu fp32 1 +2 gpu conv fp32 1 add fp32 1 relu fp32 1 +3 gpu conv fp32 1 add fp32 1 +4 gpu add fp32 1 +5 gpu relu fp32 1 +6 gpu conv fp32 1 add fp32 1 relu fp32 1 +7 gpu conv fp32 1 add fp32 1 +8 gpu add fp32 1 +9 gpu relu fp32 1 +10 gpu conv fp32 1 add fp32 1 relu fp32 1 +11 gpu conv fp32 1 add fp32 1 +12 gpu add fp32 1 +13 gpu relu fp32 1 +14 gpu conv fp32 1 add fp32 1 relu fp32 1 +15 gpu conv fp32 1 add fp32 1 +16 gpu conv fp32 1 add fp32 1 +17 gpu add fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 add fp32 1 relu fp32 1 +20 gpu conv fp32 1 add fp32 1 +21 gpu add fp32 1 +22 gpu relu fp32 1 +23 gpu conv fp32 1 add fp32 1 relu fp32 1 +24 gpu conv fp32 1 add fp32 1 +25 gpu add fp32 1 +26 gpu relu fp32 1 +27 gpu conv fp32 1 add fp32 1 relu fp32 1 +28 gpu conv fp32 1 add fp32 1 +29 gpu conv fp32 1 add fp32 1 +30 gpu add fp32 1 +31 gpu relu fp32 1 +32 gpu conv fp32 1 add fp32 1 relu fp32 1 +33 gpu conv fp32 1 add fp32 1 +34 gpu add fp32 1 +35 gpu relu fp32 1 +36 gpu conv fp32 1 add fp32 1 relu fp32 1 +37 gpu conv fp32 1 add fp32 1 +38 gpu add fp32 1 +39 gpu relu fp32 1 +40 gpu pool_max fp32 1 +41 gpu mul fp32 1 add fp32 1 42 gpu softmax fp32 1 ----- +++++ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt index eaafddc7dd76089812103a759497386dff80854c..ccc9576535cf2b1f05427fc4cb2247dbb0958c12 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt @@ -1,46 +1,46 @@ +++++ conf1 1 0 89.4 0 -1 gpu conv fp32 1 add fp32 1 relu fp32 1 -2 gpu conv fp32 1 add fp32 1 relu fp32 1 -3 gpu conv fp32 1 add fp32 1 -4 gpu add fp16 1 -5 gpu relu fp16 1 -6 gpu conv fp32 1 add fp32 1 relu fp32 1 -7 gpu conv fp32 1 add fp32 1 -8 gpu add fp16 1 -9 gpu relu fp16 1 -10 gpu conv fp32 1 add fp32 1 relu fp32 1 -11 gpu conv fp32 1 add fp32 1 -12 gpu add fp16 1 -13 gpu relu fp16 1 -14 gpu conv fp32 1 add fp32 1 relu fp32 1 -15 gpu conv fp32 1 add fp32 1 -16 gpu conv fp32 1 add fp32 1 -17 gpu add fp16 1 -18 gpu relu fp16 1 -19 gpu conv fp32 1 add fp32 1 relu fp32 1 -20 gpu conv fp32 1 add fp32 1 -21 gpu add fp16 1 -22 gpu relu fp16 1 -23 gpu conv fp32 1 add fp32 1 relu fp32 1 -24 gpu conv fp32 1 add fp32 1 -25 gpu add fp16 1 -26 gpu relu fp16 1 -27 gpu conv fp32 1 add fp32 1 relu fp32 1 -28 gpu conv fp32 1 add fp32 1 -29 gpu conv fp32 1 add fp32 1 -30 gpu add fp16 1 -31 gpu relu fp16 1 -32 gpu conv fp32 1 add fp32 1 relu fp32 1 -33 gpu conv fp32 1 add fp32 1 -34 gpu add fp16 1 -35 gpu relu fp16 1 -36 gpu conv fp32 1 add fp32 1 relu fp32 1 -37 gpu conv fp32 1 add fp32 1 -38 gpu add fp16 1 -39 gpu relu fp16 1 -40 gpu pool_max fp16 1 -41 gpu mul fp32 1 add fp32 1 +1 gpu conv fp32 1 add fp32 1 relu fp32 1 +2 gpu conv fp32 1 add fp32 1 relu fp32 1 +3 gpu conv fp32 1 add fp32 1 +4 gpu add fp32 1 +5 gpu relu fp32 1 +6 gpu conv fp32 1 add fp32 1 relu fp32 1 +7 gpu conv fp32 1 add fp32 1 +8 gpu add fp32 1 +9 gpu relu fp32 1 +10 gpu conv fp32 1 add fp32 1 relu fp32 1 +11 gpu conv fp32 1 add fp32 1 +12 gpu add fp32 1 +13 gpu relu fp32 1 +14 gpu conv fp32 1 add fp32 1 relu fp32 1 +15 gpu conv fp32 1 add fp32 1 +16 gpu conv fp32 1 add fp32 1 +17 gpu add fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 add fp32 1 relu fp32 1 +20 gpu conv fp32 1 add fp32 1 +21 gpu add fp32 1 +22 gpu relu fp32 1 +23 gpu conv fp32 1 add fp32 1 relu fp32 1 +24 gpu conv fp32 1 add fp32 1 +25 gpu add fp32 1 +26 gpu relu fp32 1 +27 gpu conv fp32 1 add fp32 1 relu fp32 1 +28 gpu conv fp32 1 add fp32 1 +29 gpu conv fp32 1 add fp32 1 +30 gpu add fp32 1 +31 gpu relu fp32 1 +32 gpu conv fp32 1 add fp32 1 relu fp32 1 +33 gpu conv fp32 1 add fp32 1 +34 gpu add fp32 1 +35 gpu relu fp32 1 +36 gpu conv fp32 1 add fp32 1 relu fp32 1 +37 gpu conv fp32 1 add fp32 1 +38 gpu add fp32 1 +39 gpu relu fp32 1 +40 gpu pool_max fp32 1 +41 gpu mul fp32 1 add fp32 1 42 gpu softmax fp32 1 ----- +++++ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt index 793a33e54ea2e49a724fb5ee75de14d6be608725..fac96ced244ed77d41ed236c60a5aa0f0cc84c30 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt @@ -1,46 +1,46 @@ +++++ conf1 1 0 89.4 0 -1 gpu conv fp32 1 add fp32 1 relu fp32 1 -2 gpu conv fp32 1 add fp32 1 relu fp32 1 -3 gpu conv fp32 1 add fp32 1 -4 gpu add fp16 1 -5 gpu relu fp16 1 -6 gpu conv fp32 1 add fp32 1 relu fp32 1 -7 gpu conv fp32 1 add fp32 1 -8 gpu add fp16 1 -9 gpu relu fp16 1 -10 gpu conv fp32 1 add fp32 1 relu fp32 1 -11 gpu conv fp32 1 add fp32 1 -12 gpu add fp16 1 -13 gpu relu fp16 1 -14 gpu conv fp32 1 add fp32 1 relu fp32 1 -15 gpu conv fp32 1 add fp32 1 -16 gpu conv fp32 1 add fp32 1 -17 gpu add fp16 1 -18 gpu relu fp16 1 -19 gpu conv fp32 1 add fp32 1 relu fp32 1 -20 gpu conv fp32 1 add fp32 1 -21 gpu add fp16 1 -22 gpu relu fp16 1 -23 gpu conv fp32 1 add fp32 1 relu fp32 1 -24 gpu conv fp32 1 add fp32 1 -25 gpu add fp16 1 -26 gpu relu fp16 1 -27 gpu conv fp32 1 add fp32 1 relu fp32 1 -28 gpu conv fp32 1 add fp32 1 -29 gpu conv fp32 1 add fp32 1 -30 gpu add fp16 1 -31 gpu relu fp16 1 -32 gpu conv fp32 1 add fp32 1 relu fp32 1 -33 gpu conv fp32 1 add fp32 1 -34 gpu add fp16 1 -35 gpu relu fp16 1 -36 gpu conv fp32 1 add fp32 1 relu fp32 1 -37 gpu conv fp32 1 add fp32 1 -38 gpu add fp16 1 -39 gpu relu fp16 1 -40 gpu pool_max fp16 1 -41 gpu mul fp32 1 add fp32 1 +1 gpu conv fp32 1 add fp32 1 relu fp32 1 +2 gpu conv fp32 1 add fp32 1 relu fp32 1 +3 gpu conv fp32 1 add fp32 1 +4 gpu add fp32 1 +5 gpu relu fp32 1 +6 gpu conv fp32 1 add fp32 1 relu fp32 1 +7 gpu conv fp32 1 add fp32 1 +8 gpu add fp32 1 +9 gpu relu fp32 1 +10 gpu conv fp32 1 add fp32 1 relu fp32 1 +11 gpu conv fp32 1 add fp32 1 +12 gpu add fp32 1 +13 gpu relu fp32 1 +14 gpu conv fp32 1 add fp32 1 relu fp32 1 +15 gpu conv fp32 1 add fp32 1 +16 gpu conv fp32 1 add fp32 1 +17 gpu add fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 add fp32 1 relu fp32 1 +20 gpu conv fp32 1 add fp32 1 +21 gpu add fp32 1 +22 gpu relu fp32 1 +23 gpu conv fp32 1 add fp32 1 relu fp32 1 +24 gpu conv fp32 1 add fp32 1 +25 gpu add fp32 1 +26 gpu relu fp32 1 +27 gpu conv fp32 1 add fp32 1 relu fp32 1 +28 gpu conv fp32 1 add fp32 1 +29 gpu conv fp32 1 add fp32 1 +30 gpu add fp32 1 +31 gpu relu fp32 1 +32 gpu conv fp32 1 add fp32 1 relu fp32 1 +33 gpu conv fp32 1 add fp32 1 +34 gpu add fp32 1 +35 gpu relu fp32 1 +36 gpu conv fp32 1 add fp32 1 relu fp32 1 +37 gpu conv fp32 1 add fp32 1 +38 gpu add fp32 1 +39 gpu relu fp32 1 +40 gpu pool_max fp32 1 +41 gpu mul fp32 1 add fp32 1 42 gpu softmax fp32 1 ----- +++++ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt index 714c965a13922470adbd5d44461c794fb3729b2f..0f0348b8f264eb606bb274cef7b5ba206e03c705 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt @@ -1,46 +1,46 @@ +++++ conf1 1 0 89.4 0 -1 gpu conv fp32 1 add fp32 1 relu fp32 1 -2 gpu conv fp32 1 add fp32 1 relu fp32 1 -3 gpu conv fp32 1 add fp32 1 -4 gpu add fp16 1 -5 gpu relu fp16 1 -6 gpu conv fp32 1 add fp32 1 relu fp32 1 -7 gpu conv fp32 1 add fp32 1 -8 gpu add fp16 1 -9 gpu relu fp16 1 -10 gpu conv fp32 1 add fp32 1 relu fp32 1 -11 gpu conv fp32 1 add fp32 1 -12 gpu add fp16 1 -13 gpu relu fp16 1 -14 gpu conv fp32 1 add fp32 1 relu fp32 1 -15 gpu conv fp32 1 add fp32 1 -16 gpu conv fp32 1 add fp32 1 -17 gpu add fp16 1 -18 gpu relu fp16 1 -19 gpu conv fp32 1 add fp32 1 relu fp32 1 -20 gpu conv fp32 1 add fp32 1 -21 gpu add fp16 1 -22 gpu relu fp16 1 -23 gpu conv fp32 1 add fp32 1 relu fp32 1 -24 gpu conv fp32 1 add fp32 1 -25 gpu add fp16 1 -26 gpu relu fp16 1 -27 gpu conv fp32 1 add fp32 1 relu fp32 1 -28 gpu conv fp32 1 add fp32 1 -29 gpu conv fp32 1 add fp32 1 -30 gpu add fp16 1 -31 gpu relu fp16 1 -32 gpu conv fp32 1 add fp32 1 relu fp32 1 -33 gpu conv fp32 1 add fp32 1 -34 gpu add fp16 1 -35 gpu relu fp16 1 -36 gpu conv fp32 1 add fp32 1 relu fp32 1 -37 gpu conv fp32 1 add fp32 1 -38 gpu add fp16 1 -39 gpu relu fp16 1 -40 gpu pool_max fp16 1 -41 gpu mul fp32 1 add fp32 1 +1 gpu conv fp32 1 add fp32 1 relu fp32 1 +2 gpu conv fp32 1 add fp32 1 relu fp32 1 +3 gpu conv fp32 1 add fp32 1 +4 gpu add fp32 1 +5 gpu relu fp32 1 +6 gpu conv fp32 1 add fp32 1 relu fp32 1 +7 gpu conv fp32 1 add fp32 1 +8 gpu add fp32 1 +9 gpu relu fp32 1 +10 gpu conv fp32 1 add fp32 1 relu fp32 1 +11 gpu conv fp32 1 add fp32 1 +12 gpu add fp32 1 +13 gpu relu fp32 1 +14 gpu conv fp32 1 add fp32 1 relu fp32 1 +15 gpu conv fp32 1 add fp32 1 +16 gpu conv fp32 1 add fp32 1 +17 gpu add fp32 1 +18 gpu relu fp32 1 +19 gpu conv fp32 1 add fp32 1 relu fp32 1 +20 gpu conv fp32 1 add fp32 1 +21 gpu add fp32 1 +22 gpu relu fp32 1 +23 gpu conv fp32 1 add fp32 1 relu fp32 1 +24 gpu conv fp32 1 add fp32 1 +25 gpu add fp32 1 +26 gpu relu fp32 1 +27 gpu conv fp32 1 add fp32 1 relu fp32 1 +28 gpu conv fp32 1 add fp32 1 +29 gpu conv fp32 1 add fp32 1 +30 gpu add fp32 1 +31 gpu relu fp32 1 +32 gpu conv fp32 1 add fp32 1 relu fp32 1 +33 gpu conv fp32 1 add fp32 1 +34 gpu add fp32 1 +35 gpu relu fp32 1 +36 gpu conv fp32 1 add fp32 1 relu fp32 1 +37 gpu conv fp32 1 add fp32 1 +38 gpu add fp32 1 +39 gpu relu fp32 1 +40 gpu pool_max fp32 1 +41 gpu mul fp32 1 add fp32 1 42 gpu softmax fp32 1 ----- +++++