diff --git a/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp b/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
index 104b667fa76abac9eeb33cf82e6d4fdcd7734cb8..a44b79b3fe20cb52383ff7458466a72d041b90fc 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_CPU/DFG2LLVM_CPU.cpp
@@ -1453,24 +1453,6 @@ void CGT_CPU::codeGen(DFLeafNode *N) {
     N->setTag(hpvm::CPU_TARGET);
     break;
   }
-  case hpvm::PROMISE_TARGET: {
-    errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n";
-    // Make sure there is a generated x86 function for promise
-    assert(N->getGenFuncForTarget(hpvm::PROMISE_TARGET) && "");
-    assert(N->hasCPUGenFuncForTarget(hpvm::PROMISE_TARGET) && "");
-    // Store the PROMISE x86 function as the CPU generated function
-    Function *Ftmp = N->getGenFuncForTarget(N->getTag());
-    // after adding the required number of arguments
-    if (!N->getParent()->isChildGraphStreaming()) {
-      Ftmp = addIdxDimArgs(Ftmp);
-    }
-
-    N->removeGenFuncForTarget(hpvm::CUDNN_TARGET);
-    N->setTag(hpvm::None);
-    N->addGenFunc(Ftmp, hpvm::CPU_TARGET, true);
-    N->setTag(hpvm::CPU_TARGET);
-    break;
-  }
   case hpvm::TENSOR_TARGET: {
     errs() << "Promise hint found. Store PROMISE function as CPU funtion.\n";
diff --git a/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp b/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
index bd26a92fd343ce2057e5ab9c32c412681aef9da9..4653ad0f8a4c4ddd8d76e984b1750d9f94d813a9 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_CUDNN/DFG2LLVM_CUDNN.cpp
@@ -1,4 +1,4 @@
-//=== DFG2LLVM_CUDNN.cpp ===//
+//===------------------------- DFG2LLVM_CUDNN.cpp ------------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -6,6 +6,14 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+//
+// This pass replaces the tensor operations in HPVM with calls to the
+// appropriate runtime API; the runtime leverages the cuDNN library to
+// implement the supported tensor operations.
+//
+//===----------------------------------------------------------------------===//
+
+
 #define ENABLE_ASSERTS
 
 #define DEBUG_TYPE "DFG2LLVM_CUDNN"
diff --git a/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
index b400c12021d2df712ea0bbd04f03dbe8724abc75..3fa6860b9c7cb12b2f76c22196fa744cd052c2bb 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -1,4 +1,4 @@
-//=== DFG2LLVM_WrapperAPI.cpp ===//
+//===----------------------- DFG2LLVM_WrapperAPI.cpp ---------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -6,6 +6,15 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+//
+// This pass is responsible for "fusing" multiple tensor operations in HPVM
+// nodes so that the appropriate set of operations is replaced with a single
+// call to a runtime routine. This allows the HPVM IR to represent a graph
+// with tensor operations in a target-agnostic manner.
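+//
+// As an illustrative sketch only (the argument list below is hypothetical,
+// not the exact code this pass emits), a leaf node computing
+//   conv -> add -> tanh -> pool_max
+// may be collapsed into a single runtime call such as
+//   void *out = wrapper_ConvLayer("node_1", input, filter, bias, pads,
+//                                 strides, pool_id, pool_size, activation_id,
+//                                 out_min, out_max);
+// where the node identifier lets the runtime look up the approximation knobs
+// chosen for that node in the configuration file.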
+//
+//===----------------------------------------------------------------------===//
+
+
 #define ENABLE_ASSERTS
 
 #define DEBUG_TYPE "DFG2LLVM_WrapperAPI"
diff --git a/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp b/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
index 131a291a5b5a5f153985239effb97f5cf7f8e049..9b6ca06f631104d5d65711495e18f64babbcf6e7 100644
--- a/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
+++ b/hpvm/lib/Transforms/FuseHPVMTensorNodes/FuseHPVMTensorNodes.cpp
@@ -1,4 +1,4 @@
-//=== FuseHPVMTensorNodes.cpp ===//
+//===----------------------- FuseHPVMTensorNodes.cpp ---------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -6,6 +6,16 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+//
+// This pass fuses HPVM nodes based on the tensor operations contained in
+// the nodes. It lays the groundwork for indicating to the compiler that a
+// set of tensor operations in a node is fusible, which has implications for
+// the performance and energy consumption of the set of tensor operations in
+// question.
+//
+//===----------------------------------------------------------------------===//
+
+
 #define DEBUG_TYPE "FuseTensorNodes"
diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt
index b8a1716f2a7c18c7e1dcb8d907c3585707c22386..02ab62fca57f66155ffafff0686634b3efe4f861 100644
--- a/hpvm/projects/hpvm-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-rt/CMakeLists.txt
@@ -6,18 +6,20 @@ SET(CMAKE_CXX_STANDARD 11)
 # Defines ${OpenCL_INCLUDE_DIRS} and ${OpenCL_LIBRARY} if found
 find_package(OpenCL REQUIRED)
 
-add_llvm_library(hpvm-rt.ll hpvm-rt.cpp
-  DEPENDS
-  clang
-  llvm-dis
-)
-target_compile_options(hpvm-rt.ll PUBLIC -flto)
-target_include_directories(hpvm-rt.ll PRIVATE ${OpenCL_INCLUDE_DIRS})
-link_directories(${OpenCL_LIBRARY})
-
-add_custom_target(hpvm-rt.cpp.o ALL
-  COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libhpvm-rt.ll.a
-  COMMAND mv ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.cpp.o ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc
-  COMMAND ${CMAKE_BINARY_DIR}/bin/llvm-dis ${CMAKE_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc)
+# This puts libhpvm-rt.a in lib/, which we don't care about;
+# we want ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/hpvm-rt.dir/hpvm-rt.cpp.o,
+# which is an LLVM bitcode file because of the -flto below.
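+# To sanity-check the result after a build (a sketch, assuming the default
+# layout where this directory lands in <build_dir>/tools/hpvm/projects/hpvm-rt):
+#   <build_dir>/bin/llvm-dis tools/hpvm/projects/hpvm-rt/hpvm-rt.bc -o hpvm-rt.ll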
+add_llvm_library(hpvm-rt hpvm-rt.cpp DEPENDS clang)
+target_compile_options(hpvm-rt PUBLIC -flto)
+target_include_directories(hpvm-rt PRIVATE ${OpenCL_INCLUDE_DIRS})
+target_link_directories(hpvm-rt PUBLIC ${OpenCL_LIBRARY})
 
-add_dependencies(hpvm-rt.cpp.o hpvm-rt.ll)
+# Copy and rename hpvm-rt.cpp.o so it is exposed as an actual bitcode file
+add_custom_command(
+  OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/hpvm-rt.bc"
+  DEPENDS hpvm-rt
+  COMMAND cp
+    ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/hpvm-rt.dir/hpvm-rt.cpp.o
+    ${CMAKE_CURRENT_BINARY_DIR}/hpvm-rt.bc
+)
+add_custom_target(hpvm-rt.bc ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/hpvm-rt.bc")
diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
index d28868892f6d45e6905594e143a13aa83b1db9d6..2f8cfc27e5280e7d18a830cc6083841a2cc3590b 100644
--- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt
@@ -173,9 +173,10 @@ target_compile_definitions(tensor_runtime_install PRIVATE -DONLINE_PROFILING=fal
 # Online version
 cuda_add_library(tensor_runtime_online ${RUNTIME_SRCS})
-cuda_add_cublas_to_target(tensor_runtime_online)
+cuda_add_cublas_to_target(tensor_runtime_online ${OpenMP_CXX_FLAGS})
+target_compile_options(tensor_runtime_online PRIVATE ${OpenMP_CXX_FLAGS})
 add_dependencies(tensor_runtime_online tensor_runtime)
-target_link_libraries(tensor_runtime_online ${LINK_LIBS})
+target_link_libraries(tensor_runtime_online ${LINK_LIBS} ${OpenMP_CXX_FLAGS})
 target_compile_definitions(tensor_runtime_online PRIVATE -DONLINE_PROFILING=true -DFP16_tuning=false)
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
index 31214eaaa799d5cd8d4de5b0935b41aa7fce617d..4bb703bbd2596980fb4d930b36aaa749c7144044 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h
@@ -1,3 +1,14 @@
+//===--------------------------- tensor_cpu_runtime.h -----------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file comprises the API to the tensor routines for the CPU.
+// It also contains the interfaces to the approximated versions of the
+// tensor operations that are supported on the CPU.
+//
+//===----------------------------------------------------------------------===//
+
+
 #include <stdio.h>
 #include <cstdlib>
 #include <cmath>
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
index 7eb3afc62d9e24cb9f73720a2a522e42004fff57..323adbac8940ed83c51d3729565c1bda3dbf35cc 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc
@@ -1,3 +1,13 @@
+//===--------------------------- tensor_signatures.cc -----------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the API to the HPVM tensor runtime.
+// It is compiled to an LLVM bitcode file that is loaded by HPVM passes when
+// tensor-based applications are compiled through HPVM.
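+//
+// Each entry point is simply referenced so that its symbol survives into the
+// bitcode; a minimal sketch of the pattern (tensorAdd stands in for any of
+// the runtime routines declared in tensor_runtime.h):
+//   void *addPtr = (void *)&tensorAdd;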
+//
+//===----------------------------------------------------------------------===//
+
 #include "tensor_runtime.h"
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
index 9a3c9ca848d443a20f1dcbb98fb3eda52ee15945..e9a4e50b000918c328a8b693f39c04505b6e4b79 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu
@@ -2,7 +2,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file consists of the simulation of implementation of software
+// This file consists of the emulation of software
 // approximations for tensor convolutions. The approximations implemented are
 // feature sampling and perforation for FP32 and FP16 compute precisions.
 //
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
index 1b770736bab93dd6a47cb4351dd0ad054e8eb14d..c1848f126750808a9438a4d2cf7729d1bf420fd1 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
@@ -1031,7 +1031,10 @@ __global__ void convToGemmApprox(float * const __restrict__ output,
 }
 
-
+/// This function serves as an API to the custom implementation of convolution
+/// with perforation and filter-sampling support. The compute precision is FP32.
+/// This routine is invoked by the tuner when tuning approximations for convolutions.
+///
 void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
                          int vertical_pad, int horizontal_pad,
                          int vertical_stride, int horizontal_stride,
                          int conv_mode, int conv_groups,
@@ -1245,6 +1248,9 @@ void switchMatrixFull(int N, int n, int c, int h, int w,
 }
 
+/// This function serves as an API to the custom implementation of convolution
+/// with perforation and filter-sampling support. The compute precision is FP32.
+///
 void* tensorConvApprox(void* input_ptr, void* filter_ptr,
                        int vertical_pad, int horizontal_pad,
                        int vertical_stride, int horizontal_stride,
                        int conv_mode, int conv_groups,
@@ -1528,6 +1534,10 @@ void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half
 }
 
+/// This function serves as an API to the custom implementation of the
+/// half-precision convolution with perforation and filter-sampling
+/// support.
+///
 void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
                             int vertical_pad, int horizontal_pad,
                             int vertical_stride, int horizontal_stride,
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
index d9d598d2a64cd898bc6c2b51607e1fb92b9afb8a..7b8865966f03d838b2de1fe06339c4a8620870b1 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp
@@ -1,3 +1,13 @@
+//===--------------------------- configuration.cpp -------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file consists of the definitions of the API for querying the
+// configuration information used by the rest of the tensor runtime.
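+//
+// For reference, a configuration (one entry of a tuner_confs.txt file) maps
+// every tensor op of every DFG node to an approximation knob; e.g. the line
+//   3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1
+// runs node 3's convolution with sampling knob 262 in FP16 and keeps its add
+// and tanh at the baseline FP16 knob.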
+//
+//===----------------------------------------------------------------------===//
+
+
 #include "configuration.h"
 
 using P_APPROX = PROMISENodeConfiguration::APPROX;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
index 1a42568784cc177783b2290a14c46068815e1632..4392839f7f6dbca8df4352a19fdd689d6f8e3d5e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu
@@ -1,4 +1,12 @@
-
+//===--------------------------- fp16_gemm.cu -----------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file consists of the custom implementation of quantization kernels.
+// These allow HPVM to switch the compute precision of tensor operations
+// between FP32 and FP16.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef FP16_UTILS_HEADER
 #define FP16_UTILS_HEADER
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
index fd8a23b9cad89fe9ac6618e8c1b0e962ab27cf15..4b49a3702b1938ceed9829cc3572474c7cb82420 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
@@ -1,3 +1,13 @@
+//===--------------------------- group_conv.cu -----------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains group convolutions with FP16 and FP32 compute precisions.
+// Note that group convolutions, unlike regular convolutions, are not
+// approximable in any other way in HPVM.
+//
+//===----------------------------------------------------------------------===//
+
 #include "tensor_utils.h"
 #include "fp16_gemm.h"
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
index f24e8b58dbeb5a49e0eaf51cfac1f2d2f3148caa..e706080051a41dac1f7486027fcb9225793921bf 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu
@@ -1,4 +1,13 @@
-
+//===--------------------------- half_precision_api.cu --------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file consists of the custom implementation of the precision-conversion
+// kernels used by both the approximated and non-approximated versions of
+// tensor operations. It also contains the API for tensor operations that
+// operate on half-precision tensors.
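+//
+// A typical call sequence inside the runtime looks like the following sketch
+// (tensorHalfGemm stands in for whichever FP16 entry point is invoked):
+//   convertToFP16((struct Tensor *)lhs);
+//   convertToFP16((struct Tensor *)rhs);
+//   void *out = tensorHalfGemm(lhs, rhs); // compute entirely in FP16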
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef HALF_API_HEADER
 #define HALF_API_HEADER
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 339c0ebd2f94a0a15c1fa461cebf56fd237be7dd..5b0f0beedb4a13bbe484175ade0e2f5364e7be13 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1,3 +1,14 @@
+//===--------------------------- hpvm-rt-controller.cpp ---------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code that allows the tensor runtime to adapt in
+// response to external changes in conditions (such as frequency changes) by
+// helping to choose correct approximation configurations. It also provides
+// routines for the rest of the runtime to obtain performance and energy profiles.
+//
+//===----------------------------------------------------------------------===//
+
 #include "hpvm-rt-controller.h"
 #include "img_tensor_utils.h"
@@ -1531,6 +1542,9 @@ hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) {
   return &labels_from_file[start];
 }
 
+static float average_accuracy = 0.0;
+static int num_executions = 0;
+
 //*** Copied from dnn_sources/include/utils.h ***//
 float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
@@ -1557,12 +1571,16 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
   printf("****** Accuracy = %f \n\n", accuracy);
 
+  // Maintain a running average of accuracy across all batches executed so far
+  average_accuracy = accuracy + (average_accuracy * num_executions);
+  num_executions++;
+  average_accuracy = average_accuracy / num_executions;
 
   FILE *fp = fopen("final_accuracy", "w+");
   if (fp != NULL) {
 
     std::ostringstream ss;
-    ss << std::fixed << accuracy;
+    ss << std::fixed << average_accuracy;
     std::string print_str = ss.str();
 
     fwrite(print_str.c_str(), 1, print_str.length(), fp);
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
index 8683cbb416428f4691a10d2d9cd57a7252421899..ad1d2e137d19d1c158afb031f35f278d9cdefaa0 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc
@@ -1,3 +1,12 @@
+//===----------------------------- profiling.cc ---------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the definition of the interface used by applications
+// to start and stop profiling for energy and performance.
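+//
+// Typical usage from an application (a minimal sketch):
+//   startProfiling();
+//   // ... run tensor ops ...
+//   profileEvent("Conv1"); // records the time delta for this event
+//   stopProfiling();       // writes the accumulated data to profile_data.txt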
+//
+//===----------------------------------------------------------------------===//
+
 #ifndef PROFILING_HEADER
 #define PROFILING_HEADER
 
@@ -30,7 +39,6 @@ void startProfiling() {
 }
 
 void stopProfiling() {
-
   FILE *fp = fopen("profile_data.txt", "w+");
   if (fp != NULL) {
     fwrite(profile_data.c_str(), 1, profile_data.length(), fp);
@@ -44,7 +52,7 @@ void stopProfiling() {
 void profileEvent(const char *event_name, bool compare_previous = false) {
 
   checkCudaErrors(cudaDeviceSynchronize());
-
+
   auto it = func_counters.find(event_name);
   if (it == func_counters.end()) {
     func_counters[event_name] = 1;
@@ -85,6 +93,7 @@ void profileEvent(const char *event_name, bool compare_previous = false) {
 
   previous_time = time_reading; // set the previous time reading to the current
                                 // profiled time
+
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index 2bc62057b5c13161475b50b4a750da49146b97ce..079a9898294b01ba8dfcb575f11998790f24abfa 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -1,3 +1,13 @@
+//===--------------------------- tensor_utils.cu --------------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file consists of custom implementations of the utility functions
+// used by approximated and non-approximated versions of tensor operations.
+//
+//===----------------------------------------------------------------------===//
+
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -76,7 +86,6 @@ int getTypeSize(int data_type){
 }
 
 static int getFullPrecTypeSize(int data_type){
-  // TODO: Add support for more data types
   switch (data_type) {
   case float_type:
   case half_type:
@@ -139,7 +148,9 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
   //host_ptr.push_back(tensor->host_data);
 }
 
-
+/// Two tensor formats are supported: NCHW and NHWC.
+/// TODO: Make this more general in the future.
+///
 void setCudnnDataFormat(struct Tensor* tensor, int data_format){
 
   switch(data_format){
@@ -258,7 +269,7 @@ void setTensorDescriptor(struct Tensor* tensor, int num_dims,
 }
 
-
+/// The HPVM tensor runtime allows creation of 2D, 3D and 4D tensors.
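+/// For example, the 4D variant as used by the benchmarks in this patch
+/// (arguments: data type, data format, then N, C, H, W):
+///   void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32);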
 void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size){
 
@@ -426,12 +437,8 @@ void setTensorDescriptor(struct Tensor* tensor, int num_dims,
   if(tensor == NULL)
     return;
 
-  //printf("**** cur_type = %d , half_type = %d \n", tensor->cur_type, half_type);
-
-  if (ONLINE_PROFILING){
-    if (tensor->cur_type == half_type)
-      return;
-  }
+  if (tensor->cur_type == half_type)
+    return;
 
   DEBUG("ConvertoFP16 \n");
@@ -487,11 +494,10 @@ void convertToFP32_offline(struct Tensor* tensor){
   if(tensor == NULL)
     return;
 
-
-  if(ONLINE_PROFILING){
+
+  if (tensor->cur_type == half_type)
     return;
-  }
-
+
   DEBUG("ConvertoFP32 \n");
 
   setSizeInBytes(tensor, float_type, tensor->num_elems);
@@ -528,4 +534,6 @@ void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement
     ERROR("Tensor == NULL");
   tensor->data_placement = data_placement;
 }
+
+
 } // end of Extern"C"
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
index f9fee629e1192ee985064a5f968376d1381d9af9..3f433be855a762028d94d3871abc4d8971507c46 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
@@ -1,3 +1,12 @@
+//===--------------------------- wrapper_runtime.cu -----------------------===//
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of some of the core API to the tensor
+// runtime, enabling runtime tuning of approximations on different targets.
+//
+//===----------------------------------------------------------------------===//
+
 #include <stdio.h>
 #include <cstdio>
diff --git a/hpvm/scripts/download_weights.sh b/hpvm/scripts/download_weights.sh
new file mode 100755
index 0000000000000000000000000000000000000000..757abbf3b6f442e729fc100dad73605511e0454f
--- /dev/null
+++ b/hpvm/scripts/download_weights.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# First, get hold of gdown -- the Google Drive downloader
+wget https://raw.githubusercontent.com/circulosmeos/gdown.pl/master/gdown.pl -O gdown.pl
+chmod +x ./gdown.pl
+# Download the zip file from Google Drive
+./gdown.pl 'https://drive.google.com/file/d/1V_yd9sKcZQ7zhnO5YhRpOsaBPLEEvM9u' model_params.zip
+unzip model_params.zip # should give a "model_params" folder
+# All our benchmarks also know to look for parameters in <build_dir>/model_params.
+# Cleanup:
+rm gdown.pl model_params.zip
\ No newline at end of file
diff --git a/hpvm/scripts/llvm_installer.sh b/hpvm/scripts/llvm_installer.sh
index 21ed6ee6d13ef83e0cc62f643d8e674e7c0e5a90..0cbaea8e493de8a08833ca5ab025e2692f609fe5 100755
--- a/hpvm/scripts/llvm_installer.sh
+++ b/hpvm/scripts/llvm_installer.sh
@@ -256,17 +256,7 @@ if [ $DOWNLOAD_WEIGHTS == "y" ]; then
   echo
   echo "Downloading weights for DNN benchmarks..."
   echo
-
-  # First get hands on gdown -- google drive downloader
-  wget https://raw.githubusercontent.com/circulosmeos/gdown.pl/master/gdown.pl -O gdown.pl
-  chmod +x ./gdown.pl
-  # Download the zip file from google drive
-  ./gdown.pl 'https://drive.google.com/file/d/1V_yd9sKcZQ7zhnO5YhRpOsaBPLEEvM9u' model_params.zip
-  unzip model_params.zip # should give a "model_params" folder
-  mv model_params $BUILD_DIR
-  # All our benchmarks also know to look for parameters in <build_dir>/model_params.
- # Cleanup: - rm gdown.pl model_params.zip + ../scripts/download_weights.sh else echo "Skipping weight download" fi diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt index 887b2d1e6c3003cf886a907bcaf51c830dd0e423..3b78ad26df6ab435978b7d0f171fb654430ca324 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt +++ b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt @@ -40,10 +40,10 @@ list( ) # The hpvm-rt runtime -# This has to be explicitly set as hpvm-rt.ll is created in a custom_target +# This has to be explicitly set as hpvm-rt.bc is created in a custom_target # and does not export its file location. # Keep this in sync with hpvm/projects/hpvm-rt/CMakeLists.txt. -set(HPVM_RT_PATH ${PROJECT_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.ll) +set(HPVM_RT_PATH ${PROJECT_BINARY_DIR}/tools/hpvm/projects/hpvm-rt/hpvm-rt.bc) # Compile flags (clang++) set(CLANG_FLAGS -fno-exceptions -std=c++11 -O3) @@ -89,7 +89,7 @@ function(compile_single_benchmark target src_file extra_passes extra_dfg_flags) ) add_custom_command( OUTPUT "${WORK_DIR}/${target}.linked.bc" - DEPENDS "${WORK_DIR}/${target}.llvm.ll" hpvm-rt.ll llvm-link + DEPENDS "${WORK_DIR}/${target}.llvm.ll" hpvm-rt.bc llvm-link COMMAND ${LLVM_LINK} ${WORK_DIR}/${target}.llvm.ll ${HPVM_RT_PATH} -o ${WORK_DIR}/${target}.linked.bc ) @@ -106,12 +106,19 @@ function(compile_single_benchmark target src_file extra_passes extra_dfg_flags) set(test_compile_targets ${test_compile_targets} ${target} PARENT_SCOPE) endfunction(compile_single_benchmark) +set(test_run_targets "") +function(run_single_benchmark run_target benchmark) + add_custom_target( + ${run_target} + COMMAND ${WORK_DIR}/${benchmark} + ) + add_dependencies(${run_target} ${benchmark}) + set(test_run_targets ${test_run_targets} ${run_target} PARENT_SCOPE) +endfunction(run_single_benchmark) + file(GLOB entries ./benchmarks/*) foreach(dir ${entries}) get_filename_component(dirname "${dir}" NAME) - compile_single_benchmark( - "test_${dirname}" ${dir}/${dirname}.cpp LLVMDFG2LLVM_CUDNN -dfg2llvm-cudnn - ) set( loop_extra_flags -dfg2llvm-wrapperapi @@ -119,9 +126,14 @@ foreach(dir ${entries}) -configuration-inputs-filename=${dir}/data/tuner_confs.txt ) compile_single_benchmark( - "test_${dirname}_loop" ${dir}/${dirname}_loop.cpp + ${dirname} ${dir}/${dirname}.cpp LLVMDFG2LLVM_WrapperAPI "${loop_extra_flags}" ) + run_single_benchmark(run_${dirname} ${dirname}) + compile_single_benchmark( + ${dirname}_cudnn ${dir}/${dirname}_cudnn.cpp LLVMDFG2LLVM_CUDNN -dfg2llvm-cudnn + ) + run_single_benchmark(run_${dirname}_cudnn ${dirname}_cudnn) endforeach(dir) message(STATUS "List of test dnn benchmarks: ${test_compile_targets}") add_custom_target(dnn_benchmarks DEPENDS ${test_compile_targets}) diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/data/tuner_confs.txt deleted file mode 100644 index a6d177c90d5a2890afa5387d4c2a50de1cb6c852..0000000000000000000000000000000000000000 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/data/tuner_confs.txt +++ /dev/null @@ -1,11 +0,0 @@ -2000 -+++++ -conf1 3.86 0 79.1 0.0 -1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -3 gpu conv fp32 1 add fp32 1 tanh fp32 1 -4 gpu conv fp32 1 add fp32 1 tanh fp32 1 -5 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -6 gpu mul fp32 1 add fp32 1 -7 gpu softmax fp32 1 ------ diff --git 
a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/data/tuner_confs.txt deleted file mode 100644 index 9d6f975869964e8bb666262923172eac42a43151..0000000000000000000000000000000000000000 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/data/tuner_confs.txt +++ /dev/null @@ -1,12 +0,0 @@ -2000 -+++++ -conf1 2.64294896823 0 84.24999995 -0.05999995000000524 -1 gpu conv fp32 1 add fp32 1 tanh fp32 1 -2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -3 gpu conv fp32 1 add fp32 1 tanh fp32 1 -4 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -5 gpu conv fp32 1 add fp32 1 tanh fp32 1 -6 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -7 gpu mul fp32 1 add fp32 1 -8 gpu softmax fp32 1 ------ diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/alexnet2_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp similarity index 99% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/alexnet2_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp index 59161a118d6e9baa9196d045a072993c733b3697..42f8756f7f0feae838caa652406017c9389a21c4 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/alexnet2_loop.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10.cpp @@ -415,7 +415,7 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/"; - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); @@ -497,7 +497,7 @@ int main() { int test_input_size = 5000; int batch_count = test_input_size / batch_size; - std::string input_path = dir_prefix + std::string("input.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); // void* input = create4DTensor(0,nchw,batch_size,3,32,32); startMemTracking(); diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/alexnet2.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp similarity index 99% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/alexnet2.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp index bc1f9fa18e6faeed60d171ec90c4dc891136b1ad..569793db2a60af48327bf6a6328f64104b55a3e1 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/alexnet2.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/alexnet2_cifar10_cudnn.cpp @@ -415,7 +415,7 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet2_cifar10/"; - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); @@ -494,7 +494,7 @@ int main() { int test_input_size = 10000; int batch_count = test_input_size / batch_size; - std::string input_path = dir_prefix + std::string("input.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); void *input = create4DTensor(0, nchw, batch_size, 3, 32, 32); startMemTracking(); diff --git 
a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/data/quant_ranges_rt.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/data/quant_ranges_rt.txt similarity index 100% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2/data/quant_ranges_rt.txt rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/data/quant_ranges_rt.txt diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/data/tuner_confs.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec4a06d3dbd2e088d6db287d23dd3bd5aad7ddb --- /dev/null +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet2_cifar10/data/tuner_confs.txt @@ -0,0 +1,419 @@ +1114.3009809999999 ++++++ +conf1 1 1 84.98 0.0 +1 gpu conv fp32 11 add fp32 1 tanh fp32 1 +2 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 tanh fp32 1 +4 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 tanh fp32 1 +6 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +7 gpu mul fp32 11 add fp32 1 +8 gpu softmax fp32 1 +----- ++++++ +conf2 2.4248748377353113 2.0815908534183163 84.5 0.480000000000004 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf3 2.4055188425519614 2.0586265720811823 84.48 0.5 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 269 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf4 2.4156140842962985 2.0617867479342706 84.28 0.7000000000000028 +1 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf5 2.396416918342732 2.0506214971794585 84.02 0.960000000000008 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf6 2.463002582910052 2.1171077568609458 83.84 1.1400000000000006 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf7 2.360283215266004 2.0255245321874304 83.78 
1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf8 2.4140791541736157 2.0671513522247653 83.74000000000001 1.2399999999999949 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf9 2.457753689612079 2.1086250651240137 83.7 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf10 2.459170454055443 2.1111925341396343 83.7 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 164 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf11 2.4135986141645764 2.060453960420927 83.62 1.3599999999999994 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf12 2.4631278039012106 2.1092094797926637 83.58 1.4000000000000057 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf13 2.535761391794481 2.16998336112692 83.58 1.4000000000000057 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf14 2.289006193945062 1.961240158652051 83.54 1.4399999999999977 +1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh 
fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf15 2.4257674844112573 2.0808440756495563 83.5 1.480000000000004 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 161 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf16 2.458122368488622 2.109531159729078 83.48 1.5 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf17 2.281072202152105 1.9539314420536427 83.46000000000001 1.519999999999996 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf18 2.4572171342078444 2.1088933553775697 83.46000000000001 1.519999999999996 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf19 2.3017607719030058 1.9782265708150768 83.42 1.5600000000000023 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf20 2.379206814483014 2.047909200292713 83.39999999999999 1.5800000000000125 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 151 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf21 2.4636282705302537 2.1162281156388527 83.39999999999999 1.5800000000000125 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf22 2.461590101374146 2.1108493881199184 83.22 1.7600000000000051 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 
154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 161 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf23 2.537054645442804 2.167568834938183 83.22 1.7600000000000051 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf24 2.4631604723407885 2.1099694757102845 83.17999999999999 1.8000000000000114 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf25 2.4636282705302537 2.1162281156388527 83.14 1.8400000000000034 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf26 2.462588899729088 2.109477918791931 83.14 1.8400000000000034 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf27 2.4638085754689025 2.1071960926343603 83.1 1.8800000000000097 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf28 2.4640079766123635 2.110326453157297 83.08 1.9000000000000057 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf29 2.459337622764853 2.107249218450713 83.06 1.9200000000000017 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add 
fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf30 2.538176340059405 2.173287257415721 83.02000000000001 1.9599999999999937 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 164 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf31 2.3905426931959846 2.044333576277581 83.02000000000001 1.9599999999999937 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf32 2.459337622764853 2.107249218450713 83.0 1.980000000000004 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf33 2.458968579288317 2.1063450826631396 82.89999999999999 2.0800000000000125 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf34 2.2912974651603877 1.9670210508860688 82.8 2.180000000000007 +1 gpu conv perf_fp16 168 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf35 2.4648489763056327 2.113931670664391 82.66 2.3200000000000074 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf36 2.4599076869402854 2.1077397371200193 82.6 2.3800000000000097 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf37 2.4636282705302537 2.1162281156388527 82.54 2.4399999999999977 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add 
fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf38 2.591814267389778 2.222680944458784 82.26 2.719999999999999 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/alexnet_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp similarity index 99% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/alexnet_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp index 86b3e7eb93bb6040af97007741853ef6474ddb3d..e5edc8a5890cdbf51bba1ed0effdf64b2297d29a 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/alexnet_loop.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10.cpp @@ -366,9 +366,9 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/"; - std::string input_path = dir_prefix + std::string("input.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); // void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); uint8_t *labels = readLabels(labels_path.c_str(), 5000); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/alexnet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp similarity index 98% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/alexnet.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp index 4dcd57c8164c8bd73280d6224c44bb8b9ec9d6f0..7ce160881372f9b09e20f079ba5b065f724fe34f 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/alexnet.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/alexnet_cifar10_cudnn.cpp @@ -367,9 +367,9 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_cifar10/"; - std::string input_path = dir_prefix + std::string("input.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); uint32_t *labels = readLabels3(labels_path.c_str(), 5000); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/data/quant_ranges_rt.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/data/quant_ranges_rt.txt similarity index 100% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet/data/quant_ranges_rt.txt rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/data/quant_ranges_rt.txt diff --git 
a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/data/tuner_confs.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9ccba6eb63f620c0e3b6f95fd7c50892018f00f --- /dev/null +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_cifar10/data/tuner_confs.txt @@ -0,0 +1,511 @@ +2592.187221 ++++++ +conf1 1 1 79.28 0.0 +1 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +2 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 tanh fp32 1 +4 gpu conv fp32 11 add fp32 1 tanh fp32 1 +5 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +6 gpu mul fp32 11 add fp32 1 +7 gpu softmax fp32 1 +----- ++++++ +conf2 1.7593976485873195 1.6193399031642917 79.23 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf3 2.092625440752526 1.9139078015388271 78.96 0.3200000000000074 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf4 1.8870195448805414 1.7296919053025768 78.8 0.480000000000004 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf5 2.1184804041774554 1.9598989563949536 78.75999999999999 0.5200000000000102 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf6 2.1184804041774554 1.9598989563949536 78.75999999999999 0.5200000000000102 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf7 2.0933825381386364 1.9150743378318535 78.64 0.6400000000000006 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf8 2.081712090729918 1.9102226906341664 78.5 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 
pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf9 2.081712090729918 1.9102226906341664 78.5 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf10 2.2662606588487595 2.066560750795139 78.48 0.7999999999999972 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf11 2.121684761285686 1.966318179285323 78.48 0.7999999999999972 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf12 2.3417491169395532 2.1355030360671465 78.38000000000001 0.8999999999999915 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf13 2.2247938983110425 2.060416584958474 78.38000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf14 2.2247938983110425 2.060416584958474 78.38000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf15 2.2247938983110425 2.060416584958474 78.38000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf16 2.2627828537139263 2.065683616898884 78.32000000000001 0.9599999999999937 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf17 2.2627828537139263 2.065683616898884 78.32000000000001 0.9599999999999937 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 
pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf18 2.2627828537139263 2.065683616898884 78.32000000000001 0.9599999999999937 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf19 2.146571989407323 1.95711703610764 78.18 1.0999999999999943 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf20 2.303316973793268 2.1036463961913276 78.10000000000001 1.1799999999999926 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf21 2.436875653706139 2.2434837737118056 78.08 1.2000000000000028 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf22 2.436875653706139 2.2434837737118056 78.08 1.2000000000000028 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf23 2.436875653706139 2.2434837737118056 78.08 1.2000000000000028 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf24 2.1106508925330925 1.9419233584234938 78.06 1.2199999999999989 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf25 2.3203534290038634 2.116965679235447 78.06 1.2199999999999989 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 
1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf26 2.3527290658539215 2.145832257234814 78.03999999999999 1.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf27 2.3527290658539215 2.145832257234814 78.03999999999999 1.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf28 2.432854949808342 2.2424500615508003 78.0 1.2800000000000011 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf29 2.432854949808342 2.2424500615508003 78.0 1.2800000000000011 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf30 2.432854949808342 2.2424500615508003 78.0 1.2800000000000011 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf31 2.3137982135449207 2.1281257317083417 77.84 1.4399999999999977 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 265 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf32 2.1198074418988333 1.9522214255218437 77.82 1.460000000000008 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf33 2.246924974355375 2.065289762405701 77.8 1.480000000000004 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 269 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf34 2.263614734554485 2.090777846534249 77.74 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu 
conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf35 2.263614734554485 2.090777846534249 77.74 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf36 2.263614734554485 2.090777846534249 77.74 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf37 2.5289288699015304 2.334007588396142 77.72 1.5600000000000023 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf38 2.5289288699015304 2.334007588396142 77.72 1.5600000000000023 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf39 2.3117594882585775 2.1152397180868943 77.56 1.7199999999999989 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf40 2.452732477854469 2.264573687601476 77.56 1.7199999999999989 +1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf41 2.452732477854469 2.264573687601476 77.56 1.7199999999999989 +1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf42 2.382518688546389 2.178614303992064 77.5 1.7800000000000011 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 
gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf43 2.382518688546389 2.178614303992064 77.5 1.7800000000000011 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf44 2.3900667100485924 2.188128526401265 77.48 1.7999999999999972 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf45 2.3900667100485924 2.188128526401265 77.48 1.7999999999999972 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf46 2.3900667100485924 2.188128526401265 77.48 1.7999999999999972 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf47 2.4835281673276515 2.279527076032239 77.3 1.980000000000004 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf48 2.4835281673276515 2.279527076032239 77.3 1.980000000000004 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf49 2.1553694968551302 1.9959124044028933 77.18 2.0999999999999943 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 265 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf50 2.5877520959724816 2.3763616521050364 77.03999999999999 2.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf51 2.5877520959724816 2.3763616521050364 77.03999999999999 2.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh 
fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp index 466e311577d1e1d46d2e0c6a2a624cc21900be4f..24f4d1520a0fe42b20149baac6d0ca3c4c8d6ba0 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 4, 4); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + 
__hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_10_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_20_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); 
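[Annotation] Every hunk in this file makes the same one-line change: the per-node compilation hint flips from hpvm::CUDNN_TARGET to hpvm::TENSOR_TARGET, evidently so the node is lowered through the wrapper-API path rather than the direct cuDNN path. Reassembled from the flattened hunks above, each node function has this shape (shown for the first convolution node; the __hpvm__return epilogue is the standard HPVM node ending, which the truncated hunk context does not show):

    void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
      __hpvm__hint(hpvm::TENSOR_TARGET); // was hpvm::CUDNN_TARGET
      __hpvm__attributes(2, t1, t2, 0);

      // Padding 2x2, stride 4x4, exactly as in the hunk above.
      void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 4, 4);
      __hpvm__return(2, r, (size_t)0); // standard epilogue, elided in the hunk
    }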
__hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -464,8 +464,9 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); @@ -514,16 +515,16 @@ int main() { std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); void *dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); - void *input = readTrainedWeights(input_path.c_str(), 0, 1000, 3, 224, 224); + // void* input = readTrainedWeights(input_path.c_str(), 0, 1000,3,224,224); // uint32_t* labels = readLabels2(labels_path.c_str(),6000); - uint32_t *labels = readLabels3(labels_path.c_str(), 1000); + // uint32_t* labels = readLabels3(labels_path.c_str(), 1000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - args->input = input; - args->input_bytes = 0; + // args->input = input; + // args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -557,14 +558,40 @@ int main() { args->dense_3_b = dense_3_b; args->dense_3_b_bytes = 0; - void *dfg = __hpvm__launch(0, root, (void *)args); + int batch_size = 100; + int test_input_size = 4000; + int batch_count = test_input_size / batch_size; - __hpvm__wait(dfg); + startMemTracking(); + startProfiling(); - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); + for (int j = 0; j < 1; j++) { + for (int i = 0; i < batch_count; i++) { + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + + args->input = input; + args->input_bytes = 0; + + void *dfg = __hpvm__launch(0, root, (void *)args); + + __hpvm__wait(dfg); + + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); + + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); + + freeBatchMemory(); + } + 
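[Annotation] The wrapping above makes the new main() driver hard to follow, so here is the added batched-tuning loop reassembled from this hunk (identifiers are verbatim; only whitespace and comments are editorial). It streams 100-image batches from tune_input.bin, launches the DFG once per batch, and hands each result to the runtime control hook in place of the old one-shot computeAccuracy3 call:

    int batch_size = 100;
    int test_input_size = 4000;
    int batch_count = test_input_size / batch_size;

    startMemTracking();
    startProfiling();

    for (int j = 0; j < 1; j++) {
      for (int i = 0; i < batch_count; i++) {
        int start = i * batch_size;
        int end = (i + 1) * batch_size;

        // Load one batch instead of all inputs up front.
        void *input =
            readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
        args->input = input;
        args->input_bytes = 0;

        void *dfg = __hpvm__launch(0, root, (void *)args);
        __hpvm__wait(dfg);

        void *result = static_cast<RootIn *>(args)->r.tensor;
        hpvm_request_tensor(result, 0);

        // Runtime control hook scores this batch against the label file.
        llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);

        freeBatchMemory();
      }
    }

    stopProfiling();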
} + + stopProfiling(); __hpvm__cleanup(); - computeAccuracy3(labels, result); + return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp similarity index 88% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp index 340e0aa1194ac57e96eadd1669a97fa25fdd0c44..73175982ab98c19efdf1e77b6e2db504af4d6d93 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_loop.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/alexnet_imagenet_cudnn.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 4, 4); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - 
__hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_10_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_20_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + 
__hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -464,9 +464,8 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/alexnet_imagenet/"; - - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); @@ -515,16 +514,16 @@ int main() { std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); void *dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); - // void* input = readTrainedWeights(input_path.c_str(), 0, 1000,3,224,224); + void *input = readTrainedWeights(input_path.c_str(), 0, 1000, 3, 224, 224); // uint32_t* labels = readLabels2(labels_path.c_str(),6000); - // uint32_t* labels = readLabels3(labels_path.c_str(), 1000); + uint32_t *labels = readLabels3(labels_path.c_str(), 1000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - // args->input = input; - // args->input_bytes = 0; + args->input = input; + args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -558,40 +557,14 @@ int main() { args->dense_3_b = dense_3_b; args->dense_3_b_bytes = 0; - int batch_size = 100; - int test_input_size = 4000; - int batch_count = test_input_size / batch_size; + void *dfg = __hpvm__launch(0, root, (void *)args); - startMemTracking(); - startProfiling(); + __hpvm__wait(dfg); - for (int j = 0; j < 1; j++) { - for (int i = 0; i < batch_count; i++) { + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void *input = - readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); - - args->input = input; - args->input_bytes = 0; - - void *dfg = __hpvm__launch(0, root, (void *)args); - - __hpvm__wait(dfg); - - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); - - llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, 
end); - - freeBatchMemory(); - } - } - - stopProfiling(); __hpvm__cleanup(); - + computeAccuracy3(labels, result); return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/data/tuner_confs.txt index 377bc6a5628a5f869ccab9723838622afcbb210c..b0e42a5aaa5d7b5a06b6422a5c33a0047b6eff8d 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/data/tuner_confs.txt +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/alexnet_imagenet/data/tuner_confs.txt @@ -1,13 +1,229 @@ -750.80768325 -+++++ -conf1 1.0 0 79.1 0.0 -1 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -2 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -3 gpu conv fp32 1 add fp32 1 relu fp32 1 -4 gpu conv fp32 1 add fp32 1 relu fp32 1 -5 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -6 gpu mul fp32 1 add fp32 1 relu fp32 1 -7 gpu mul fp32 1 add fp32 1 relu fp32 1 -8 gpu mul fp32 1 add fp32 1 -9 gpu softmax fp32 1 +2739.950736 ++++++ +conf1 1 1 56.3 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +6 gpu mul fp32 11 add fp32 1 relu fp32 1 +7 gpu mul fp32 11 add fp32 1 relu fp32 1 +8 gpu mul fp32 11 add fp32 1 +9 gpu softmax fp32 1 +----- ++++++ +conf2 1.802133644103582 1.8186433204507424 55.76 0.5399999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf3 1.7574572103878898 1.7673706184460103 55.58 0.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf4 2.0227701930718065 2.043112495268932 55.42 0.8799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf5 1.9872634777043927 2.002789650227035 55.120000000000005 1.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf6 1.8204253918445088 1.843736069756362 54.84 1.4599999999999937 +1 gpu conv fp16 12 add 
fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf7 1.9308336510645352 1.934889049414224 54.74 1.5599999999999952 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf8 2.0146435217865446 2.0367475358800102 54.58 1.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf9 2.0101709494490696 2.0329911158023064 54.400000000000006 1.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf10 2.0052132441967916 2.0284931705407003 54.300000000000004 1.999999999999993 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf11 2.010827434817262 2.036001862538864 54.2 2.0999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf12 2.019868378233057 2.0433540129730265 54.17999999999999 2.1200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf13 1.9923471030291253 2.009177323959059 54.120000000000005 2.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 
1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf14 1.9923471030291253 2.009177323959059 54.120000000000005 2.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf15 2.028037341700216 2.049760395549724 54.0 2.299999999999997 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf16 1.9910730364852436 2.006510848093771 53.54 2.759999999999998 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf17 2.1567475543719614 2.159142310265706 53.300000000000004 2.999999999999993 +1 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf18 2.1567475543719614 2.159142310265706 53.300000000000004 2.999999999999993 +1 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf19 2.0232690820426464 2.0527698121318476 53.300000000000004 2.999999999999993 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 11 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 ----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt 
b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt index f2a85f352fe024f0fcf7828c259f8549f6461e24..b4e51dff426f4d3c5cb7b9572e6aa5940212acbd 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/data/tuner_confs.txt @@ -1,9 +1,409 @@ -2000 +282.5141369999999 +++++ -conf1 1 0 99.69 0 -1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 -3 gpu mul fp32 1 add fp32 1 tanh fp32 1 -4 gpu mul fp32 1 add fp32 1 tanh fp32 1 +conf1 1 1 98.7 0.0 +1 gpu conv fp32 11 add fp32 1 pool_max fp32 1 tanh fp32 1 +2 gpu conv fp32 11 add fp32 1 pool_max fp32 1 tanh fp32 1 +3 gpu mul fp32 11 add fp32 1 tanh fp32 1 +4 gpu mul fp32 11 add fp32 1 tanh fp32 1 +5 gpu softmax fp32 1 +----- ++++++ +conf2 1.828613181003043 2.071721708828981 98.65 0.04999999999999716 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf3 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716 +1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf4 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716 +1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf5 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716 +1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf6 1.8247639611533713 2.0227145446958756 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf7 1.8247639611533713 2.0227145446958756 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf8 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf9 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf10 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv 
samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf11 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274 +1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf12 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274 +1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf13 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274 +1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf14 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125 +1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf15 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125 +1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf16 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125 +1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf17 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125 +1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf18 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125 +1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf19 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125 +1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf20 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf21 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 12 add fp16 1 
pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf22 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf23 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf24 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf25 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf26 2.200653361151419 2.425091789360736 98.6 0.10000000000000853 +1 gpu conv samp_fp16 266 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf27 2.200653361151419 2.425091789360736 98.6 0.10000000000000853 +1 gpu conv samp_fp16 266 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf28 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf29 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf30 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf31 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf32 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max 
fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf33 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf34 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455 +1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf35 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455 +1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf36 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455 +1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf37 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057 +1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf38 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057 +1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf39 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057 +1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf40 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057 +1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf41 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057 +1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf42 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057 +1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf43 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057 +1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 
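[Annotation] A note on the tuner_confs.txt format recurring in these files, inferred from the entries themselves: the first line of each file holds a baseline figure (execution time of the fp32 configuration), and each +++++/----- block opens with a header of the form "confN <speedup> <energy ratio> <accuracy> <accuracy loss>" — conf1, the fp32 baseline, always reads "1 1 <accuracy> 0.0" — followed by one line per DFG node giving the node id, the device, and each fused op with its precision and a knob id (11/12 appear with plain fp32/fp16, 15x/16x values with perf_fp16, 26x values with samp_fp16). A minimal header-parsing sketch under those assumptions (the struct and its field names are illustrative, not part of HPVM):

    #include <sstream>
    #include <string>

    // Field meanings inferred from the fp32 baseline rows; illustrative only.
    struct ConfHeader {
      std::string name; // e.g. "conf2"
      double speedup;   // relative to the fp32 baseline
      double energy;    // energy-reduction ratio (assumed)
      double accuracy;  // end-to-end accuracy in percent
      double loss;      // accuracy drop vs. the baseline
    };

    bool parseConfHeader(const std::string &line, ConfHeader &h) {
      std::istringstream in(line);
      return static_cast<bool>(in >> h.name >> h.speedup >> h.energy >>
                               h.accuracy >> h.loss);
    }

For example, applied to the lenet_mnist conf2 header above, this would yield speedup 1.8286, energy ratio 2.0717, accuracy 98.65, and loss 0.05.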
+3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf44 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057 +1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf45 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057 +1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf46 1.8698191484268973 2.13979218727595 98.54 0.1599999999999966 +1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf47 1.8698191484268973 2.13979218727595 98.54 0.1599999999999966 +1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf48 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682 +1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf49 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682 +1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf50 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682 +1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf51 1.8534621507951072 2.1231113105788597 98.44000000000001 0.2599999999999909 +1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 5 gpu softmax fp32 1 ----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp index 3613e9f1325d73e7515a88f3e198bcd32821224c..a20315cb9c36610aac2d0d43059182302674b83b 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - 
__hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_10_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); 
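  // Editorial note (C++ comments, not lines of this patch): every leaf
  // node in this benchmark repeats the same four-step pattern -- a target
  // hint, an __hpvm__attributes(...) declaration of the pointer arguments,
  // a single tensor intrinsic, and a return. The hint is the only thing
  // these hunks change: hpvm::CUDNN_TARGET maps each intrinsic onto the
  // cuDNN-backed runtime (the lenet_mnist_cudnn.cpp variant below), while
  // hpvm::TENSOR_TARGET routes the node through the tunable tensor path
  // driven by data/tuner_confs.txt. Assuming the return convention used by
  // the surrounding nodes, a minimal leaf node looks like:
  //
  //   void example_node(void *t1, size_t bytes_t1) {
  //     __hpvm__hint(hpvm::TENSOR_TARGET); // tunable tensor path
  //     __hpvm__attributes(1, t1, 0);      // one input pointer, no outputs
  //     void *r = __hpvm__tensor_tanh(t1); // one tensor intrinsic per node
  //     __hpvm__return(2, r, (size_t)0);   // sketch; mirrors the pattern
  //   }
  //
  // The batched driver this patch adds to main() below feeds that tuning
  // path: it reads tune_input.bin in 10 batches of 500 images, launches
  // the DFG once per batch, and hands each result plus tune_labels.bin to
  // llvm_hpvm_invokeRtControl before freeing the batch's memory.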
void *r = __hpvm__tensor_tanh(t1); @@ -268,8 +268,8 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = @@ -294,15 +294,15 @@ int main() { readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 1024, 10); std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); - void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 1, 28, 28); + // void* input = readTrainedWeights(input_path.c_str(), 0, 5000,1,28,28); - uint32_t *labels = readLabels3(labels_path.c_str(), 5000); + // uint32_t* labels = readLabels3(labels_path.c_str(), 5000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - args->input = input; - args->input_bytes = 0; + // args->input = input; + // args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -320,15 +320,37 @@ int main() { args->dense_2_b = dense_2_b; args->dense_2_b_bytes = 0; - void *dfg = __hpvm__launch(0, root, (void *)args); + int batch_size = 500; + int test_input_size = 5000; + int batch_count = test_input_size / batch_size; + + startMemTracking(); + startProfiling(); + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 1, 28, 28); + + args->input = input; + args->input_bytes = 0; + + void *dfg = __hpvm__launch(0, root, (void *)args); + + __hpvm__wait(dfg); + + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - __hpvm__wait(dfg); + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); + freeBatchMemory(); + } + stopProfiling(); __hpvm__cleanup(); - computeAccuracy3(labels, result); return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp similarity index 85% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp index 9a8bfbc68fcaad4b369223c53e98121e9934b27d..4e0adc7bbe15356955a178d8db30466c8b872258 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_loop.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/lenet_mnist/lenet_mnist_cudnn.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, 
size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 2, 2, 1, 1); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_10_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_tanh(t1); @@ -268,8 +268,8 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/lenet_mnist/"; - std::string input_path = dir_prefix + 
std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = @@ -294,15 +294,15 @@ int main() { readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 1024, 10); std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); - // void* input = readTrainedWeights(input_path.c_str(), 0, 5000,1,28,28); + void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 1, 28, 28); - // uint32_t* labels = readLabels3(labels_path.c_str(), 5000); + uint32_t *labels = readLabels3(labels_path.c_str(), 5000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - // args->input = input; - // args->input_bytes = 0; + args->input = input; + args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -320,37 +320,15 @@ int main() { args->dense_2_b = dense_2_b; args->dense_2_b_bytes = 0; - int batch_size = 500; - int test_input_size = 5000; - int batch_count = test_input_size / batch_size; - - startMemTracking(); - startProfiling(); - - for (int i = 0; i < batch_count; i++) { - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void *input = readInputBatch(input_path.c_str(), 0, start, end, 1, 28, 28); - - args->input = input; - args->input_bytes = 0; - - void *dfg = __hpvm__launch(0, root, (void *)args); - - __hpvm__wait(dfg); - - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); + void *dfg = __hpvm__launch(0, root, (void *)args); - llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); + __hpvm__wait(dfg); - freeBatchMemory(); - } + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - stopProfiling(); __hpvm__cleanup(); + computeAccuracy3(labels, result); return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/data/tuner_confs.txt deleted file mode 100644 index ed02ddab0dbef2b21f785226b80f4eee7a1735cf..0000000000000000000000000000000000000000 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/data/tuner_confs.txt +++ /dev/null @@ -1,175 +0,0 @@ -1000 -+++++ -conf1 1 0 84.8 0 -1 gpu conv fp32 1 -2 gpu batchnorm fp32 1 -3 gpu relu fp32 1 -4 gpu group_conv fp32 1 -5 gpu batchnorm fp32 1 -6 gpu relu fp32 1 -7 gpu conv fp32 1 -8 gpu batchnorm fp32 1 -9 gpu relu fp32 1 -10 gpu group_conv fp32 1 -11 gpu batchnorm fp32 1 -12 gpu relu fp32 1 -13 gpu conv fp32 1 -14 gpu batchnorm fp32 1 -15 gpu relu fp32 1 -16 gpu group_conv fp32 1 -17 gpu batchnorm fp32 1 -18 gpu relu fp32 1 -19 gpu conv fp32 1 -20 gpu batchnorm fp32 1 -21 gpu relu fp32 1 -22 gpu group_conv fp32 1 -23 gpu batchnorm fp32 1 -24 gpu relu fp32 1 -25 gpu conv fp32 1 -26 gpu batchnorm fp32 1 -27 gpu relu fp32 1 -28 gpu group_conv fp32 1 -29 gpu batchnorm fp32 1 -30 gpu relu fp32 1 -31 gpu conv fp32 1 -32 gpu batchnorm fp32 1 -33 gpu relu fp32 1 -34 gpu group_conv fp32 1 -35 gpu batchnorm fp32 1 -36 gpu relu fp32 1 -37 gpu conv fp32 1 -38 gpu batchnorm fp32 1 -39 gpu relu fp32 1 -40 gpu group_conv fp32 1 -41 gpu batchnorm fp32 1 -42 gpu relu fp32 1 -43 gpu conv fp32 1 -44 gpu batchnorm fp32 1 -45 gpu relu fp32 1 -46 
gpu group_conv fp32 1 -47 gpu batchnorm fp32 1 -48 gpu relu fp32 1 -49 gpu conv fp32 1 -50 gpu batchnorm fp32 1 -51 gpu relu fp32 1 -52 gpu group_conv fp32 1 -53 gpu batchnorm fp32 1 -54 gpu relu fp32 1 -55 gpu conv fp32 1 -56 gpu batchnorm fp32 1 -57 gpu relu fp32 1 -58 gpu group_conv fp32 1 -59 gpu batchnorm fp32 1 -60 gpu relu fp32 1 -61 gpu conv fp32 1 -62 gpu batchnorm fp32 1 -63 gpu relu fp32 1 -64 gpu group_conv fp32 1 -65 gpu batchnorm fp32 1 -66 gpu relu fp32 1 -67 gpu conv fp32 1 -68 gpu batchnorm fp32 1 -69 gpu relu fp32 1 -70 gpu group_conv fp32 1 -71 gpu batchnorm fp32 1 -72 gpu relu fp32 1 -73 gpu conv fp32 1 -74 gpu batchnorm fp32 1 -75 gpu relu fp32 1 -76 gpu group_conv fp32 1 -77 gpu batchnorm fp32 1 -78 gpu relu fp32 1 -79 gpu conv fp32 1 -80 gpu batchnorm fp32 1 -81 gpu relu fp32 1 -82 gpu pool_mean fp32 1 -83 gpu mul fp32 1 add fp32 1 -84 gpu softmax fp32 1 ------ -+++++ -conf2 1.5 0 84.8 0 -1 gpu conv fp16 1 -2 gpu batchnorm fp16 1 -3 gpu relu fp16 1 -4 gpu group_conv fp16 1 -5 gpu batchnorm fp16 1 -6 gpu relu fp16 1 -7 gpu conv fp16 1 -8 gpu batchnorm fp16 1 -9 gpu relu fp16 1 -10 gpu group_conv fp16 1 -11 gpu batchnorm fp16 1 -12 gpu relu fp16 1 -13 gpu conv fp16 1 -14 gpu batchnorm fp16 1 -15 gpu relu fp16 1 -16 gpu group_conv fp16 1 -17 gpu batchnorm fp16 1 -18 gpu relu fp16 1 -19 gpu conv fp16 1 -20 gpu batchnorm fp16 1 -21 gpu relu fp16 1 -22 gpu group_conv fp16 1 -23 gpu batchnorm fp16 1 -24 gpu relu fp16 1 -25 gpu conv fp16 1 -26 gpu batchnorm fp16 1 -27 gpu relu fp16 1 -28 gpu group_conv fp16 1 -29 gpu batchnorm fp16 1 -30 gpu relu fp16 1 -31 gpu conv fp16 1 -32 gpu batchnorm fp16 1 -33 gpu relu fp16 1 -34 gpu group_conv fp16 1 -35 gpu batchnorm fp16 1 -36 gpu relu fp16 1 -37 gpu conv fp16 1 -38 gpu batchnorm fp16 1 -39 gpu relu fp16 1 -40 gpu group_conv fp16 1 -41 gpu batchnorm fp16 1 -42 gpu relu fp16 1 -43 gpu conv fp16 1 -44 gpu batchnorm fp16 1 -45 gpu relu fp16 1 -46 gpu group_conv fp16 1 -47 gpu batchnorm fp16 1 -48 gpu relu fp16 1 -49 gpu conv fp16 1 -50 gpu batchnorm fp16 1 -51 gpu relu fp16 1 -52 gpu group_conv fp16 1 -53 gpu batchnorm fp16 1 -54 gpu relu fp16 1 -55 gpu conv fp16 1 -56 gpu batchnorm fp16 1 -57 gpu relu fp16 1 -58 gpu group_conv fp16 1 -59 gpu batchnorm fp16 1 -60 gpu relu fp16 1 -61 gpu conv fp16 1 -62 gpu batchnorm fp16 1 -63 gpu relu fp16 1 -64 gpu group_conv fp16 1 -65 gpu batchnorm fp16 1 -66 gpu relu fp16 1 -67 gpu conv fp16 1 -68 gpu batchnorm fp16 1 -69 gpu relu fp16 1 -70 gpu group_conv fp16 1 -71 gpu batchnorm fp16 1 -72 gpu relu fp16 1 -73 gpu conv fp16 1 -74 gpu batchnorm fp16 1 -75 gpu relu fp16 1 -76 gpu group_conv fp16 1 -77 gpu batchnorm fp16 1 -78 gpu relu fp16 1 -79 gpu conv fp16 1 -80 gpu batchnorm fp16 1 -81 gpu relu fp16 1 -82 gpu pool_mean fp16 1 -83 gpu mul fp16 1 add fp16 1 -84 gpu softmax fp32 1 ------ diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/data/quant_ranges_rt.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/data/quant_ranges_rt.txt similarity index 100% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/data/quant_ranges_rt.txt rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/data/quant_ranges_rt.txt diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/data/tuner_confs.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4d8bd893c8d9395fce6a3484d75f543f1e72da2 --- /dev/null +++ 
b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/data/tuner_confs.txt @@ -0,0 +1,3220 @@ +4077.307063200001 ++++++ +conf1 1 1 84.42 0.0 +1 gpu conv fp32 11 +2 gpu batchnorm fp32 11 +3 gpu relu fp32 11 +4 gpu group_conv fp32 11 +5 gpu batchnorm fp32 11 +6 gpu relu fp32 11 +7 gpu conv fp32 11 +8 gpu batchnorm fp32 11 +9 gpu relu fp32 11 +10 gpu group_conv fp32 11 +11 gpu batchnorm fp32 11 +12 gpu relu fp32 11 +13 gpu conv fp32 11 +14 gpu batchnorm fp32 11 +15 gpu relu fp32 11 +16 gpu group_conv fp32 11 +17 gpu batchnorm fp32 11 +18 gpu relu fp32 11 +19 gpu conv fp32 11 +20 gpu batchnorm fp32 11 +21 gpu relu fp32 11 +22 gpu group_conv fp32 11 +23 gpu batchnorm fp32 11 +24 gpu relu fp32 11 +25 gpu conv fp32 11 +26 gpu batchnorm fp32 11 +27 gpu relu fp32 11 +28 gpu group_conv fp32 11 +29 gpu batchnorm fp32 11 +30 gpu relu fp32 11 +31 gpu conv fp32 11 +32 gpu batchnorm fp32 11 +33 gpu relu fp32 11 +34 gpu group_conv fp32 11 +35 gpu batchnorm fp32 11 +36 gpu relu fp32 11 +37 gpu conv fp32 11 +38 gpu batchnorm fp32 11 +39 gpu relu fp32 11 +40 gpu group_conv fp32 11 +41 gpu batchnorm fp32 11 +42 gpu relu fp32 11 +43 gpu conv fp32 11 +44 gpu batchnorm fp32 11 +45 gpu relu fp32 11 +46 gpu group_conv fp32 11 +47 gpu batchnorm fp32 11 +48 gpu relu fp32 11 +49 gpu conv fp32 11 +50 gpu batchnorm fp32 11 +51 gpu relu fp32 11 +52 gpu group_conv fp32 11 +53 gpu batchnorm fp32 11 +54 gpu relu fp32 11 +55 gpu conv fp32 11 +56 gpu batchnorm fp32 11 +57 gpu relu fp32 11 +58 gpu group_conv fp32 11 +59 gpu batchnorm fp32 11 +60 gpu relu fp32 11 +61 gpu conv fp32 11 +62 gpu batchnorm fp32 11 +63 gpu relu fp32 11 +64 gpu group_conv fp32 11 +65 gpu batchnorm fp32 11 +66 gpu relu fp32 11 +67 gpu conv fp32 11 +68 gpu batchnorm fp32 11 +69 gpu relu fp32 11 +70 gpu group_conv fp32 11 +71 gpu batchnorm fp32 11 +72 gpu relu fp32 11 +73 gpu conv fp32 11 +74 gpu batchnorm fp32 11 +75 gpu relu fp32 11 +76 gpu group_conv fp32 11 +77 gpu batchnorm fp32 11 +78 gpu relu fp32 11 +79 gpu conv fp32 11 +80 gpu batchnorm fp32 11 +81 gpu relu fp32 11 +82 gpu pool_mean fp32 11 +83 gpu mul fp32 11 add fp32 1 +84 gpu softmax fp32 1 +----- ++++++ +conf2 1.4930855091460031 1.447990050940341 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 
+57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv fp16 12 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf3 1.493397883226807 1.449591062426989 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 163 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf4 1.4934429016801338 1.4500582352111675 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm 
fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 168 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf5 1.4938214813031556 1.450038222978811 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 157 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf6 1.4933879828131855 1.449975636202813 83.72 0.7000000000000028 +1 gpu 
conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 160 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf7 1.492663093331302 1.4487067754520524 83.7 0.7199999999999989 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 
+62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 167 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf8 1.495724395088184 1.4507925552157772 83.56 0.8599999999999994 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 162 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf9 1.496506307637598 1.4521705950285135 83.36 1.0600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv 
fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 162 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf10 1.496532672928805 1.4521696542076958 83.36 1.0600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 156 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf11 1.4988418058849937 1.4555327556053628 83.28 1.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu 
relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 164 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf12 1.4994289979945077 1.4562439330251535 83.28 1.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu 
relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf13 1.4952028793065038 1.450369851058777 83.14 1.2800000000000011 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 162 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf14 1.4933978285280285 1.448265686258097 83.12 1.2999999999999972 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm 
fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf15 1.491958833559989 1.4459262032919467 83.08 1.3400000000000034 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 157 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf16 1.4937317297990984 1.4498121856525021 83.02000000000001 1.3999999999999915 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv 
fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf17 1.4963413808686974 1.4522391736954623 82.86 1.5600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu 
group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 165 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf18 1.4942172827099065 1.4504631324933321 82.86 1.5600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 157 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf19 1.4963964073376739 1.4525461321361477 82.86 1.5600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 
12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf20 1.4932583049858652 1.4472547227714012 82.84 1.5799999999999983 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv samp_fp16 266 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf21 1.4964326545281064 1.4526263046333605 82.82000000000001 1.5999999999999943 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu 
batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf22 1.4966042483929347 1.4527859961226985 82.82000000000001 1.5999999999999943 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu 
conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf23 1.4966008974318024 1.4527415844509437 82.78 1.6400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 155 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf24 1.4932738366973777 1.448820445466833 82.64 1.7800000000000011 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 164 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 
+46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 157 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf25 1.4940402684133964 1.447332235394843 82.48 1.9399999999999977 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv samp_fp16 261 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf26 1.4981764588414919 1.4530714150549078 82.39999999999999 2.0200000000000102 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu 
batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 161 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf27 1.5004334658773033 1.4549115105608688 82.3 2.1200000000000045 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu 
group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 156 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf28 1.5006808163336343 1.4553824345285296 82.3 2.1200000000000045 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf29 1.4999870719460484 1.4571625511374704 82.28 2.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu 
fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 165 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf30 1.500042366879961 1.4574715946270216 82.28 2.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf31 1.500214789632402 1.4576323532660131 82.28 2.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu 
batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf32 1.4927009086066445 1.4484049211953174 82.26 2.1599999999999966 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 164 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 161 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu 
conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf33 1.5003438014588875 1.4538240352408085 82.22 2.200000000000003 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf34 1.5041587978616728 1.4610492456195174 82.02000000000001 2.3999999999999915 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 161 +50 gpu batchnorm fp16 12 
+51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf35 1.5000040131742656 1.4555601139156464 81.88 2.5400000000000063 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 167 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf36 1.4950571524902583 1.451478376045808 81.84 2.5799999999999983 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 164 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv 
fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv perf_fp16 161 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 161 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 155 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf37 1.4975271575548847 1.4532126224638244 81.44 2.980000000000004 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 164 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 11 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 155 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 
+82 gpu pool_mean fp16 12
+83 gpu mul fp16 12 add fp16 1
+84 gpu softmax fp32 1
+-----
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/mobilenet_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
similarity index 99%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/mobilenet_loop.cpp
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
index 047697767d9fa0d7f428a02eeb6b8a9566597137..5ea5c298bf7b5858af024aff7a4ee81c4b8a6ed2 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/mobilenet_loop.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10.cpp
@@ -1966,10 +1966,10 @@ typedef struct __attribute__((__packed__)) {
 } RootIn;
 int main() {
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet/";
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
-  std::string input_path = dir_prefix + std::string("input.bin");
-  std::string labels_path = dir_prefix + std::string("labels32.bin");
+  std::string input_path = dir_prefix + std::string("tune_input.bin");
+  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/mobilenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
similarity index 99%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/mobilenet.cpp
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
index b32dccabc2f29b54e8da35551f8d982cd13a378c..b7e0a714590418414a2647474526a1fb0c09e390 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet/mobilenet.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/mobilenet_cifar10/mobilenet_cifar10_cudnn.cpp
@@ -1967,10 +1967,10 @@ typedef struct __attribute__((__packed__)) {
 int main() {
-  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet/";
+  std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/mobilenet_cifar10/";
-  std::string input_path = dir_prefix + std::string("input.bin");
-  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string input_path = dir_prefix + std::string("tune_input.bin");
+  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/data/tuner_confs.txt
deleted file mode 100644
index 3a414afad320525deb15bdd32f35c1a1ac4699be..0000000000000000000000000000000000000000
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/data/tuner_confs.txt
+++ /dev/null
@@ -1,91 +0,0 @@
-2000
-+++++
-conf1 1 0 89.59 0
-1 gpu conv fp32 1 add fp32 1 relu fp32 1
-2 gpu conv fp32 1 add fp32 1 relu fp32 1
-3 gpu conv fp32 1 add fp32 1
-4 gpu add fp32 1
-5 gpu relu fp32 1
-6 gpu conv fp32 1 add fp32 1 relu fp32 1
-7 gpu conv fp32 1 add fp32 1
-8 gpu add fp32 1
-9 gpu relu fp32 1
-10 gpu conv fp32 1 add fp32 1 relu fp32 1
-11 gpu conv fp32 1 add fp32 1
-12 gpu add fp32 1
-13 gpu relu fp32 1
-14 gpu conv fp32 1 add fp32 1 relu fp32 1
-15 gpu conv fp32 1 add fp32 1
-16 gpu conv fp32 1 add fp32 1
-17 gpu add fp32 1
-18 gpu relu fp32 1
-19 gpu conv fp32 1 add fp32 1 relu fp32 1
-20 gpu conv fp32 1 add fp32 1
-21 gpu add fp32 1
-22 gpu relu fp32 1
-23 gpu conv fp32 1 add fp32 1 relu fp32 1
-24 gpu conv fp32 1 add fp32 1
-25 gpu add fp32 1
-26 gpu relu fp32 1
-27 gpu conv fp32 1 add fp32 1 relu fp32 1
-28 gpu conv fp32 1 add fp32 1
-29 gpu conv fp32 1 add fp32 1
-30 gpu add fp32 1
-31 gpu relu fp32 1
-32 gpu conv fp32 1 add fp32 1 relu fp32 1
-33 gpu conv fp32 1 add fp32 1
-34 gpu add fp32 1
-35 gpu relu fp32 1
-36 gpu conv fp32 1 add fp32 1 relu fp32 1
-37 gpu conv fp32 1 add fp32 1
-38 gpu add fp32 1
-39 gpu relu fp32 1
-40 gpu pool_mean fp32 1
-41 gpu mul fp32 1 add fp32 1
-42 gpu softmax fp32 1
------
-+++++
-conf2 1.5 0 89.59 0
-1 gpu conv fp16 1 add fp16 1 relu fp16 1
-2 gpu conv fp16 1 add fp16 1 relu fp16 1
-3 gpu conv fp16 1 add fp16 1
-4 gpu add fp16 1
-5 gpu relu fp16 1
-6 gpu conv fp16 1 add fp16 1 relu fp16 1
-7 gpu conv fp16 1 add fp16 1
-8 gpu add fp16 1
-9 gpu relu fp16 1
-10 gpu conv fp16 1 add fp16 1 relu fp16 1
-11 gpu conv fp16 1 add fp16 1
-12 gpu add fp16 1
-13 gpu relu fp16 1
-14 gpu conv fp16 1 add fp16 1 relu fp16 1
-15 gpu conv fp16 1 add fp16 1
-16 gpu conv fp16 1 add fp16 1
-17 gpu add fp16 1
-18 gpu relu fp16 1
-19 gpu conv fp16 1 add fp16 1 relu fp16 1
-20 gpu conv fp16 1 add fp16 1
-21 gpu add fp16 1
-22 gpu relu fp16 1
-23 gpu conv fp16 1 add fp16 1 relu fp16 1
-24 gpu conv fp16 1 add fp16 1
-25 gpu add fp16 1
-26 gpu relu fp16 1
-27 gpu conv fp16 1 add fp16 1 relu fp16 1
-28 gpu conv fp16 1 add fp16 1
-29 gpu conv fp16 1 add fp16 1
-30 gpu add fp16 1
-31 gpu relu fp16 1
-32 gpu conv fp16 1 add fp16 1 relu fp16 1
-33 gpu conv fp16 1 add fp16 1
-34 gpu add fp16 1
-35 gpu relu fp16 1
-36 gpu conv fp16 1 add fp16 1 relu fp16 1
-37 gpu conv fp16 1 add fp16 1
-38 gpu add fp16 1
-39 gpu relu fp16 1
-40 gpu pool_mean fp16 1
-41 gpu mul fp16 1 add fp16 1
-42 gpu softmax fp32 1
------ 
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/data/quant_ranges_rt.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/data/quant_ranges_rt.txt
similarity index 100%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/data/quant_ranges_rt.txt
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/data/quant_ranges_rt.txt
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/data/tuner_confs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..654cffbf632686dca6310a93ecf56b6521e32039
--- /dev/null
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/data/tuner_confs.txt
@@ -0,0 +1,2296 @@
+2484.981244
++++++
+conf1 1 1 89.56 0.0
+1 gpu conv fp32 11 add fp32 1 relu fp32 1
+2 gpu conv fp32 11 add fp32 1 relu fp32 1
+3 gpu conv fp32 11 add fp32 1
+4 gpu add fp32 11
+5 gpu relu fp32 11
+6 gpu conv fp32 11 add fp32 1 relu fp32 1
+7 gpu conv fp32 11 add fp32 1
+8 gpu add fp32 11
+9 gpu relu fp32 11
+10 gpu conv fp32 11 add fp32 1 relu fp32 1
+11 gpu conv fp32 11 add fp32 1
+12 gpu add fp32 11
+13 gpu relu fp32 11
+14 gpu conv fp32 11 add fp32 1 relu fp32 1
+15 gpu conv fp32 11 add fp32 1
+16 gpu conv fp32 11 add fp32 1
+17 gpu add fp32 11
+18 gpu relu fp32 11
+19 gpu conv fp32 11 add fp32 1 relu fp32 1
+20 gpu conv fp32 11 add fp32 1
+21 gpu add fp32 11
+22 gpu relu fp32 11
+23 gpu conv fp32 11 add fp32 1 relu fp32 1
+24 gpu conv fp32 11 add fp32 1
+25
gpu add fp32 11 +26 gpu relu fp32 11 +27 gpu conv fp32 11 add fp32 1 relu fp32 1 +28 gpu conv fp32 11 add fp32 1 +29 gpu conv fp32 11 add fp32 1 +30 gpu add fp32 11 +31 gpu relu fp32 11 +32 gpu conv fp32 11 add fp32 1 relu fp32 1 +33 gpu conv fp32 11 add fp32 1 +34 gpu add fp32 11 +35 gpu relu fp32 11 +36 gpu conv fp32 11 add fp32 1 relu fp32 1 +37 gpu conv fp32 11 add fp32 1 +38 gpu add fp32 11 +39 gpu relu fp32 11 +40 gpu pool_mean fp32 11 +41 gpu mul fp32 11 add fp32 1 +42 gpu softmax fp32 1 +----- ++++++ +conf2 1.767527790869615 1.7962938589450996 88.96 0.6000000000000085 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf3 1.7676486174436143 1.7967155014984917 88.78 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf4 1.7674352647250422 1.792910560846682 88.7 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 
1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf5 1.8655703338511067 1.8930089896922888 88.53999999999999 1.0200000000000102 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 167 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 159 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 157 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf6 1.9070428103729684 1.9172857853336078 88.38000000000001 1.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv samp_fp16 261 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu 
conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf7 1.769778590701739 1.7956222622694236 88.24 1.3200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv fp16 12 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 268 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf8 1.841404652091802 1.8677947628418006 88.24 1.3200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 162 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf9 1.8679349428783487 1.8995927920729931 88.22 1.3400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 160 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 
gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 161 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf10 1.876937310100899 1.9041581451399825 88.1 1.460000000000008 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf11 1.842140004857965 1.8673692956620238 88.06 1.5 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 167 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 
+----- ++++++ +conf12 1.9070567138857761 1.9165525910492667 88.02 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 261 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf13 1.9185835698271805 1.9328202469403 87.98 1.5799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf14 1.781744853993609 1.8082995958456516 87.92 1.6400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 168 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 159 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv 
perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 265 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv samp_fp16 268 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf15 1.9185835698271805 1.9328202469403 87.92 1.6400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 12 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf16 1.875261840315855 1.8986912653657988 87.88 1.6800000000000068 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 159 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 12 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf17 1.9013559086026153 1.9230901214481015 87.86 1.7000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add 
fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf18 1.9185835698271805 1.9328202469403 87.83999999999999 1.720000000000013 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf19 1.8770503055325798 1.9007923328014182 87.82 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 151 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 
add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf20 1.8774136276932418 1.90365663123621 87.82 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf21 1.943143041264842 1.9591958561422729 87.82 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf22 1.870789918969847 1.8863625217899933 87.8 1.7600000000000051 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv 
perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 264 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf23 1.7445941809066292 1.7754934270309912 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv perf_fp16 166 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf24 1.9065930313550916 1.928938946228637 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 167 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv 
perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf25 1.9021824494907031 1.9237134505552098 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 154 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf26 1.9017271009017505 1.9211078231701697 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf27 1.8187224917656395 1.820406007609536 87.76 1.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu 
conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf28 1.9070855899343322 1.9285210655709735 87.76 1.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 268 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf29 1.9013559086026153 1.9230901214481015 87.74 1.8200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf30 1.8772990284718367 1.9022146647342513 87.72 
1.8400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf31 1.9013559086026153 1.9230901214481015 87.68 1.8799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf32 1.9020502478364545 1.923319572598976 87.66000000000001 1.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 
+25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf33 1.7516394053514481 1.7809034526471939 87.62 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv perf_fp16 166 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf34 1.7814953252955337 1.8122658147993431 87.62 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv perf_fp16 166 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf35 1.887538247557846 1.9103369445911678 87.62 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu 
fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf36 1.9107566783735581 1.9273803227885578 87.6 1.960000000000008 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf37 1.9013559086026153 1.9230901214481015 87.58 1.980000000000004 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 12 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 
relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf38 1.8984089819969947 1.9195632881772446 87.58 1.980000000000004 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf39 1.9020502478364545 1.923319572598976 87.52 2.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf40 1.9020502478364545 1.923319572598976 87.52 2.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 
add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf41 1.9013559086026153 1.9230901214481015 87.5 2.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf42 1.9013559086026153 1.9230901214481015 87.46000000000001 2.0999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean 
fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf43 1.9196179152539186 1.9443459719929068 87.44 2.1200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 153 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf44 1.9020502478364545 1.923319572598976 87.4 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf45 1.9152817031040366 1.9357432559063958 87.4 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu 
add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf46 1.915754791147898 1.9373322475753219 87.4 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf47 1.9130551004051772 1.9409232417921056 87.38 2.180000000000007 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 153 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf48 1.9421147660673033 1.9584555432766413 87.38 2.180000000000007 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu 
fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf49 1.9052849920081363 1.9300100333661123 87.32 2.240000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 153 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf50 1.9154322863033566 1.934908329027621 87.3 2.260000000000005 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 
add fp16 1
+29 gpu conv perf_fp16 151 add fp16 1
+30 gpu add fp16 12
+31 gpu relu fp16 12
+32 gpu conv fp16 12 add fp16 1 relu fp16 1
+33 gpu conv fp16 12 add fp16 1
+34 gpu add fp16 12
+35 gpu relu fp16 12
+36 gpu conv fp16 12 add fp16 1 relu fp16 1
+37 gpu conv perf_fp16 152 add fp16 1
+38 gpu add fp16 12
+39 gpu relu fp16 12
+40 gpu pool_mean fp16 12
+41 gpu mul fp16 12 add fp16 1
+42 gpu softmax fp32 1
+-----
++++++
+conf51 1.9079703554020564 1.9287218218306195 86.96000000000001 2.5999999999999943
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv fp16 12 add fp16 1 relu fp16 1
+3 gpu conv fp16 12 add fp16 1
+4 gpu add fp16 12
+5 gpu relu fp16 12
+6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1
+7 gpu conv perf_fp16 161 add fp16 1
+8 gpu add fp16 12
+9 gpu relu fp16 12
+10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1
+11 gpu conv perf_fp16 151 add fp16 1
+12 gpu add fp16 12
+13 gpu relu fp16 12
+14 gpu conv fp16 12 add fp16 1 relu fp16 1
+15 gpu conv fp16 12 add fp16 1
+16 gpu conv fp16 11 add fp16 1
+17 gpu add fp16 12
+18 gpu relu fp16 12
+19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1
+20 gpu conv samp_fp16 262 add fp16 1
+21 gpu add fp16 12
+22 gpu relu fp16 12
+23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1
+24 gpu conv perf_fp16 153 add fp16 1
+25 gpu add fp16 12
+26 gpu relu fp16 12
+27 gpu conv fp16 12 add fp16 1 relu fp16 1
+28 gpu conv fp16 12 add fp16 1
+29 gpu conv samp_fp16 261 add fp16 1
+30 gpu add fp16 12
+31 gpu relu fp16 12
+32 gpu conv fp16 12 add fp16 1 relu fp16 1
+33 gpu conv fp16 12 add fp16 1
+34 gpu add fp16 12
+35 gpu relu fp16 12
+36 gpu conv fp16 12 add fp16 1 relu fp16 1
+37 gpu conv perf_fp16 152 add fp16 1
+38 gpu add fp16 12
+39 gpu relu fp16 12
+40 gpu pool_mean fp16 12
+41 gpu mul fp16 12 add fp16 1
+42 gpu softmax fp32 1
+-----
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/resnet18_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
similarity index 99%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/resnet18_loop.cpp
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
index 6bf5a58135d7fe7101c359a29f8909937d9bc8c7..c6fa02c784f90f8c03a81991763e533d864b9ed0 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/resnet18_loop.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10.cpp
@@ -1303,9 +1303,9 @@ typedef struct __attribute__((__packed__)) {
 int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
-  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string input_path = dir_prefix + std::string("tune_input.bin");
   // void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32);
-  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
   // uint32_t* labels = readLabels3(labels_path.c_str(),5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/resnet18.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
similarity index 99%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/resnet18.cpp
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
index d9f96cfdac18876b676369ba2c7c0e8f4e2ea986..e1429ada17629aa7d889b882f23817943a36dabf 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18/resnet18.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet18_cifar10/resnet18_cifar10_cudnn.cpp
@@ -1230,9 +1230,9 @@ int main() {
   std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet18_cifar10/";
-  std::string input_path = dir_prefix + std::string("input.bin");
+  std::string input_path = dir_prefix + std::string("tune_input.bin");
   void *input = readTrainedWeights(input_path.c_str(), 0, 5000, 3, 32, 32);
-  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
   uint32_t *labels = readLabels3(labels_path.c_str(), 5000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/data/tuner_confs.txt
index ede27ce6f5952d4d1be47640a46771d1f4c51ab2..00cdaa41b4e1464ce50d9fd3123e9927d384c82f 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/data/tuner_confs.txt
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/data/tuner_confs.txt
@@ -1,6 +1,6 @@
 7161.053769000008
 +++++
-conf1 1 1 75.7 0.0
+conf1 1 1 75.32 0.0
 1 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1
 2 gpu batchnorm fp32 11
 3 gpu conv fp32 11 add fp32 1
@@ -175,3 +175,8451 @@ conf1 1 1 75.7 0.0
 172 gpu mul fp32 11 add fp32 1
 173 gpu softmax fp32 1
 -----
++++++
+conf2 1.8254789092281507 1.4527803526239977 75.32 0.0
+1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1
+2 gpu batchnorm fp16 12
+3 gpu conv fp16 12 add fp16 1
+4 gpu batchnorm fp16 12
+5 gpu relu fp16 12
+6 gpu conv fp16 12 add fp16 1
+7 gpu batchnorm fp16 12
+8 gpu relu fp16 12
+9 gpu conv fp16 12 add fp16 1
+10 gpu batchnorm fp16 12
+11 gpu conv fp16 12 add fp16 1
+12 gpu batchnorm fp16 12
+13 gpu add fp16 12
+14 gpu relu fp16 12
+15 gpu conv fp16 12 add fp16 1
+16 gpu batchnorm fp16 12
+17 gpu relu fp16 12
+18 gpu conv fp16 12 add fp16 1
+19 gpu batchnorm fp16 12
+20 gpu relu fp16 12
+21 gpu conv fp16 12 add fp16 1
+22 gpu batchnorm fp16 12
+23 gpu add fp16 12
+24 gpu relu fp16 12
+25 gpu conv fp16 12 add fp16 1
+26 gpu batchnorm fp16 12
+27 gpu relu fp16 12
+28 gpu conv fp16 12 add fp16 1
+29 gpu batchnorm fp16 12
+30 gpu relu fp16 12
+31 gpu conv fp16 12 add fp16 1
+32 gpu batchnorm fp16 12
+33 gpu add fp16 12
+34 gpu relu fp16 12
+35 gpu conv fp16 12 add fp16 1
+36 gpu batchnorm fp16 12
+37 gpu relu fp16 12
+38 gpu conv fp16 12 add fp16 1
+39 gpu batchnorm fp16 12
+40 gpu relu fp16 12
+41 gpu conv fp16 12 add fp16 1
+42 gpu batchnorm fp16 12
+43 gpu conv fp16 12 add fp16 1
+44 gpu batchnorm fp16 12
+45 gpu add fp16 12
+46 gpu relu fp16 12
+47 gpu conv fp16 12 add fp16 1
+48 gpu batchnorm fp16 12
+49 gpu relu fp16 12
+50 gpu conv fp16 12 add fp16 1
+51 gpu batchnorm fp16 12
+52 gpu relu fp16 12
+53 gpu conv fp16 12 add fp16 1
+54 gpu batchnorm fp16 12
+55 gpu add fp16 12
+56 gpu relu fp16 12
+57 gpu conv fp16 12 add fp16 1
+58 gpu batchnorm fp16 12
+59 gpu relu fp16 12
+60 gpu conv fp16 12 add fp16 1
+61 gpu batchnorm fp16 12
+62 gpu relu fp16 12
+63 gpu conv fp16 12 add fp16 1
+64 gpu batchnorm fp16 12
+65 gpu add fp16 12
+66 gpu relu fp16 12
+67 gpu conv fp16 12 add fp16 1
+68 gpu batchnorm fp16 12
+69 gpu relu fp16 12
+70 gpu conv fp16 12 add fp16 1
+71 gpu batchnorm fp16 12
+72 gpu relu fp16 12
+73 gpu conv fp16 12 add fp16 1
+74 gpu batchnorm fp16 12
+75 gpu add fp16
12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf3 1.8254789092281507 1.4527803526239977 75.32 0.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add 
fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu 
batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf4 1.8254789092281507 1.4527803526239977 75.32 0.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 
+128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf5 1.8254789092281507 1.4527803526239977 75.32 0.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu 
batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf6 1.8419337054716958 1.466078052616739 75.26 0.05999999999998806 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 
add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf7 1.8974679809809625 1.5066897277042535 74.96000000000001 0.3599999999999852 +1 gpu conv 
fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv perf_fp16 159 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 
+135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf8 1.9117100497773867 1.5188889983986897 74.9 0.4199999999999875 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu 
conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf9 1.8788892069108545 1.49437915080417 74.88 0.4399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 
+49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf10 1.9129474764472596 1.5199159459514724 74.8 0.519999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 
12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv perf_fp16 164 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 
gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf11 1.9036033357306685 1.5117151340312875 74.78 0.539999999999992 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 
gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf12 1.9097709579523556 1.5162774145656248 74.72 0.5999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 264 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add 
fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 11 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 151 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 151 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf13 1.9082387455721863 1.5150341477557379 74.64 0.6799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add 
fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 266 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu 
relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 161 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf14 1.9043183204776548 1.5123751159548935 74.53999999999999 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv perf_fp16 160 add fp16 1 +100 gpu batchnorm 
fp16 12
+101 gpu relu fp16 12
+102 gpu conv fp16 12 add fp16 1
+103 gpu batchnorm fp16 12
+104 gpu relu fp16 12
+105 gpu conv fp16 11 add fp16 1
+106 gpu batchnorm fp16 12
+107 gpu add fp16 12
+108 gpu relu fp16 12
+109 gpu conv fp16 12 add fp16 1
+110 gpu batchnorm fp16 12
+111 gpu relu fp16 12
+112 gpu conv fp16 12 add fp16 1
+113 gpu batchnorm fp16 12
+114 gpu relu fp16 12
+115 gpu conv fp16 12 add fp16 1
+116 gpu batchnorm fp16 12
+117 gpu add fp16 12
+118 gpu relu fp16 12
+119 gpu conv fp16 12 add fp16 1
+120 gpu batchnorm fp16 12
+121 gpu relu fp16 12
+122 gpu conv fp16 12 add fp16 1
+123 gpu batchnorm fp16 12
+124 gpu relu fp16 12
+125 gpu conv fp16 12 add fp16 1
+126 gpu batchnorm fp16 12
+127 gpu add fp16 12
+128 gpu relu fp16 12
+129 gpu conv fp16 12 add fp16 1
+130 gpu batchnorm fp16 12
+131 gpu relu fp16 12
+132 gpu conv fp16 12 add fp16 1
+133 gpu batchnorm fp16 12
+134 gpu relu fp16 12
+135 gpu conv fp16 11 add fp16 1
+136 gpu batchnorm fp16 12
+137 gpu add fp16 12
+138 gpu relu fp16 12
+139 gpu conv fp16 12 add fp16 1
+140 gpu batchnorm fp16 12
+141 gpu relu fp16 12
+142 gpu conv fp16 12 add fp16 1
+143 gpu batchnorm fp16 12
+144 gpu relu fp16 12
+145 gpu conv fp16 12 add fp16 1
+146 gpu batchnorm fp16 12
+147 gpu conv fp16 12 add fp16 1
+148 gpu batchnorm fp16 12
+149 gpu add fp16 12
+150 gpu relu fp16 12
+151 gpu conv fp16 12 add fp16 1
+152 gpu batchnorm fp16 12
+153 gpu relu fp16 12
+154 gpu conv fp16 12 add fp16 1
+155 gpu batchnorm fp16 12
+156 gpu relu fp16 12
+157 gpu conv perf_fp16 161 add fp16 1
+158 gpu batchnorm fp16 12
+159 gpu add fp16 12
+160 gpu relu fp16 12
+161 gpu conv fp16 12 add fp16 1
+162 gpu batchnorm fp16 12
+163 gpu relu fp16 12
+164 gpu conv fp16 12 add fp16 1
+165 gpu batchnorm fp16 12
+166 gpu relu fp16 12
+167 gpu conv fp16 12 add fp16 1
+168 gpu batchnorm fp16 12
+169 gpu add fp16 12
+170 gpu relu fp16 12
+171 gpu pool_max fp16 12
+172 gpu mul fp16 12 add fp16 1
+173 gpu softmax fp32 1
+-----
++++++
+conf15 1.911873579525442 1.5191135547932502 74.53999999999999 0.7800000000000011
+1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1
+2 gpu batchnorm fp16 12
+3 gpu conv fp16 12 add fp16 1
+4 gpu batchnorm fp16 12
+5 gpu relu fp16 12
+6 gpu conv perf_fp16 166 add fp16 1
+7 gpu batchnorm fp16 12
+8 gpu relu fp16 12
+9 gpu conv fp16 12 add fp16 1
+10 gpu batchnorm fp16 12
+11 gpu conv fp16 12 add fp16 1
+12 gpu batchnorm fp16 12
+13 gpu add fp16 12
+14 gpu relu fp16 12
+15 gpu conv fp16 12 add fp16 1
+16 gpu batchnorm fp16 12
+17 gpu relu fp16 12
+18 gpu conv fp16 12 add fp16 1
+19 gpu batchnorm fp16 12
+20 gpu relu fp16 12
+21 gpu conv fp16 12 add fp16 1
+22 gpu batchnorm fp16 12
+23 gpu add fp16 12
+24 gpu relu fp16 12
+25 gpu conv fp16 12 add fp16 1
+26 gpu batchnorm fp16 12
+27 gpu relu fp16 12
+28 gpu conv perf_fp16 162 add fp16 1
+29 gpu batchnorm fp16 12
+30 gpu relu fp16 12
+31 gpu conv fp16 11 add fp16 1
+32 gpu batchnorm fp16 12
+33 gpu add fp16 12
+34 gpu relu fp16 12
+35 gpu conv fp16 12 add fp16 1
+36 gpu batchnorm fp16 12
+37 gpu relu fp16 12
+38 gpu conv fp16 12 add fp16 1
+39 gpu batchnorm fp16 12
+40 gpu relu fp16 12
+41 gpu conv fp16 12 add fp16 1
+42 gpu batchnorm fp16 12
+43 gpu conv fp16 12 add fp16 1
+44 gpu batchnorm fp16 12
+45 gpu add fp16 12
+46 gpu relu fp16 12
+47 gpu conv fp16 12 add fp16 1
+48 gpu batchnorm fp16 12
+49 gpu relu fp16 12
+50 gpu conv samp_fp16 262 add fp16 1
+51 gpu batchnorm fp16 12
+52 gpu relu fp16 12
+53 gpu conv fp16 12 add fp16 1
+54 gpu batchnorm fp16 12
+55 gpu add fp16 12
+56 gpu relu fp16 12
+57 gpu conv fp16 12 add fp16 1
+58 gpu batchnorm fp16 12
+59 gpu relu fp16 12
+60 gpu conv fp16 12 add fp16 1
+61 gpu batchnorm fp16 12
+62 gpu relu fp16 12
+63 gpu conv fp16 12 add fp16 1
+64 gpu batchnorm fp16 12
+65 gpu add fp16 12
+66 gpu relu fp16 12
+67 gpu conv fp16 12 add fp16 1
+68 gpu batchnorm fp16 12
+69 gpu relu fp16 12
+70 gpu conv perf_fp16 154 add fp16 1
+71 gpu batchnorm fp16 12
+72 gpu relu fp16 12
+73 gpu conv fp16 11 add fp16 1
+74 gpu batchnorm fp16 12
+75 gpu add fp16 12
+76 gpu relu fp16 12
+77 gpu conv fp16 12 add fp16 1
+78 gpu batchnorm fp16 12
+79 gpu relu fp16 12
+80 gpu conv fp16 12 add fp16 1
+81 gpu batchnorm fp16 12
+82 gpu relu fp16 12
+83 gpu conv fp16 12 add fp16 1
+84 gpu batchnorm fp16 12
+85 gpu conv fp16 12 add fp16 1
+86 gpu batchnorm fp16 12
+87 gpu add fp16 12
+88 gpu relu fp16 12
+89 gpu conv perf_fp16 165 add fp16 1
+90 gpu batchnorm fp16 12
+91 gpu relu fp16 12
+92 gpu conv perf_fp16 152 add fp16 1
+93 gpu batchnorm fp16 12
+94 gpu relu fp16 12
+95 gpu conv fp16 12 add fp16 1
+96 gpu batchnorm fp16 12
+97 gpu add fp16 12
+98 gpu relu fp16 12
+99 gpu conv fp16 12 add fp16 1
+100 gpu batchnorm fp16 12
+101 gpu relu fp16 12
+102 gpu conv fp16 12 add fp16 1
+103 gpu batchnorm fp16 12
+104 gpu relu fp16 12
+105 gpu conv fp16 11 add fp16 1
+106 gpu batchnorm fp16 12
+107 gpu add fp16 12
+108 gpu relu fp16 12
+109 gpu conv fp16 12 add fp16 1
+110 gpu batchnorm fp16 12
+111 gpu relu fp16 12
+112 gpu conv fp16 12 add fp16 1
+113 gpu batchnorm fp16 12
+114 gpu relu fp16 12
+115 gpu conv fp16 12 add fp16 1
+116 gpu batchnorm fp16 12
+117 gpu add fp16 12
+118 gpu relu fp16 12
+119 gpu conv fp16 12 add fp16 1
+120 gpu batchnorm fp16 12
+121 gpu relu fp16 12
+122 gpu conv fp16 12 add fp16 1
+123 gpu batchnorm fp16 12
+124 gpu relu fp16 12
+125 gpu conv fp16 12 add fp16 1
+126 gpu batchnorm fp16 12
+127 gpu add fp16 12
+128 gpu relu fp16 12
+129 gpu conv fp16 12 add fp16 1
+130 gpu batchnorm fp16 12
+131 gpu relu fp16 12
+132 gpu conv fp16 12 add fp16 1
+133 gpu batchnorm fp16 12
+134 gpu relu fp16 12
+135 gpu conv fp16 11 add fp16 1
+136 gpu batchnorm fp16 12
+137 gpu add fp16 12
+138 gpu relu fp16 12
+139 gpu conv fp16 12 add fp16 1
+140 gpu batchnorm fp16 12
+141 gpu relu fp16 12
+142 gpu conv fp16 12 add fp16 1
+143 gpu batchnorm fp16 12
+144 gpu relu fp16 12
+145 gpu conv fp16 12 add fp16 1
+146 gpu batchnorm fp16 12
+147 gpu conv fp16 12 add fp16 1
+148 gpu batchnorm fp16 12
+149 gpu add fp16 12
+150 gpu relu fp16 12
+151 gpu conv fp16 12 add fp16 1
+152 gpu batchnorm fp16 12
+153 gpu relu fp16 12
+154 gpu conv fp16 12 add fp16 1
+155 gpu batchnorm fp16 12
+156 gpu relu fp16 12
+157 gpu conv perf_fp16 161 add fp16 1
+158 gpu batchnorm fp16 12
+159 gpu add fp16 12
+160 gpu relu fp16 12
+161 gpu conv fp16 12 add fp16 1
+162 gpu batchnorm fp16 12
+163 gpu relu fp16 12
+164 gpu conv samp_fp16 269 add fp16 1
+165 gpu batchnorm fp16 12
+166 gpu relu fp16 12
+167 gpu conv fp16 12 add fp16 1
+168 gpu batchnorm fp16 12
+169 gpu add fp16 12
+170 gpu relu fp16 12
+171 gpu pool_max fp16 12
+172 gpu mul fp16 12 add fp16 1
+173 gpu softmax fp32 1
+-----
++++++
+conf16 1.911911301499822 1.5189487980542649 74.53999999999999 0.7800000000000011
+1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1
+2 gpu batchnorm fp16 12
+3 gpu conv fp16 12 add fp16 1
+4 gpu batchnorm fp16 12
+5 gpu relu fp16 12
+6 gpu conv perf_fp16 166 add fp16 1
+7 gpu batchnorm fp16 12
+8 gpu relu fp16 12
+9 gpu conv fp16 12 add fp16 1
+10 gpu batchnorm fp16 12
+11 gpu conv fp16 12 add fp16 1
[ops 12-173 as in conf15, except op 157: conv perf_fp16 167 add fp16 1; op 164: conv perf_fp16 168 add fp16 1]
+-----
++++++
+conf17 1.9138302441054342 1.5208384761235563 74.48 0.8399999999999892
[ops 1-173 as in conf15, except op 122: conv perf_fp16 157 add fp16 1; op 164: conv fp16 12 add fp16 1]
+-----
++++++
+conf18 1.920653968544841 1.5256274583646248 74.46000000000001 0.8599999999999852
[ops 1-173 as in conf15, except op 73: conv fp16 12 add fp16 1; op 122: conv samp_fp16 269 add fp16 1; op 157: conv fp16 12 add fp16 1; op 164: conv perf_fp16 155 add fp16 1]
+-----
++++++
+conf19 1.9116658035591094 1.518367700908999 74.42 0.8999999999999915
[ops 1-173 as in conf15, except op 70: conv perf_fp16 152 add fp16 1; op 157: conv fp16 12 add fp16 1; op 164: conv fp16 11 add fp16 1]
+-----
++++++
+conf20 1.9195438243098482 1.5247458994649956 74.42 0.8999999999999915
[ops 1-173 as in conf15, except op 105: conv fp16 12 add fp16 1; op 154: conv perf_fp16 155 add fp16 1; op 164: conv fp16 12 add fp16 1]
+-----
++++++
+conf21 1.924209873605691 1.529064591928164 74.42 0.8999999999999915
[ops 1-173 as in conf15, except op 60: conv perf_fp16 168 add fp16 1; op 164: conv fp16 12 add fp16 1]
+-----
++++++
+conf22 1.9140557316947615 1.521529863075794 74.4 0.9199999999999875
[ops 1-173 as in conf15, except op 102: conv perf_fp16 161 add fp16 1; op 164: conv fp16 12 add fp16 1]
+-----
++++++
+conf23 1.9114047568425236 1.5184194647871838 74.38 0.9399999999999977
[ops 1-173 as in conf15, except op 89: conv fp16 11 add fp16 1; op 125: conv fp16 11 add fp16 1]
+-----
++++++
+conf24 1.9204834738925933 1.525056866197883 74.33999999999999 0.980000000000004
[ops 1-173 as in conf15, except op 164: conv perf_fp16 151 add fp16 1]
+-----
++++++
+conf25 1.9122466279825416 1.5172238012346289 74.28 1.039999999999992
[ops 1-173 as in conf15, except op 50: conv samp_fp16 269 add fp16 1; op 89: conv fp16 12 add fp16 1; op 154: conv perf_fp16 163 add fp16 1; op 164: conv perf_fp16 166 add fp16 1]
+-----
++++++
+conf26 1.9130314390285827 1.5207295430525434 74.24 1.0799999999999983
[ops 1-173 as in conf15, except op 132: conv samp_fp16 268 add fp16 1; op 164: conv fp16 12 add fp16 1]
+-----
++++++
+conf27 1.9166272704270864 1.5223850695108294 74.14 1.1799999999999926
[ops 1-173 as in conf15, except op 154: conv samp_fp16 266 add fp16 1; op 157: conv fp16 12 add fp16 1; op 164: conv perf_fp16 163 add fp16 1]
+-----
++++++
+conf28 1.9163461999331817 1.52290306465743 74.1 1.2199999999999989
[ops 1-173 as in conf15, except op 154: conv perf_fp16 159 add fp16 1; op 164: conv fp16 12 add fp16 1]
+-----
++++++
+conf29 1.9219056868545972 1.5245140000533106 74.1 1.2199999999999989
[ops 1-116 as in conf15, except op 50: conv samp_fp16 269 add fp16 1; op 89: conv fp16 12 add fp16 1; op 112: conv perf_fp16 156 add fp16 1]
+117
gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 163 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 166 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf30 1.9076806298843183 1.5141846934788452 74.06 1.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 264 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 
add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 163 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 166 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf31 1.925527532583144 1.5292715219953403 74.06 1.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu 
batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 161 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 168 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm 
fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf32 1.9189849393865428 1.5211446749831168 74.03999999999999 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 158 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 
+120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 163 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 166 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf33 1.9261760811190132 1.5281398179643688 74.03999999999999 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 154 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 
+76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf34 1.9174841207024922 1.5247160726767368 74.02 1.2999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 
1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv samp_fp16 267 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 159 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu 
batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf35 1.9237430439803298 1.5261557726743038 73.94 1.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 11 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 
+123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 163 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 166 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf36 1.9412010104587538 1.5383181588064136 73.76 1.559999999999988 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 156 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 168 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 
gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 159 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv perf_fp16 154 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 163 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 166 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf37 1.9213677476309399 1.526341619045396 73.68 1.6399999999999864 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 
12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 269 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 11 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 165 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv samp_fp16 266 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 163 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 161 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 153 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 
12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf38 1.9309095603687567 1.5301996964160127 73.66 1.6599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv perf_fp16 163 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 
+125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf39 1.9354685523528088 1.5339220906607207 73.66 1.6599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 
gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 154 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf40 1.9354685523528088 1.5339220906607207 73.64 1.6799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu 
batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 154 add fp16 1 +168 gpu batchnorm fp16 
12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf41 1.9254999885234525 1.5304882539227977 73.6 1.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 162 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv samp_fp16 262 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 165 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 152 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu 
batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv samp_fp16 266 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 153 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf42 1.9370999456083975 1.5356532604550897 73.6 1.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 154 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu 
conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv perf_fp16 162 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv perf_fp16 163 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv perf_fp16 168 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf43 1.9340192560065903 1.532481178151379 73.58 1.7399999999999949 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 
gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 
12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf44 1.9355399075622302 1.533804239422373 73.56 1.759999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 11 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu 
fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 151 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf45 1.9308694189904472 1.530196685542938 73.52 1.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu 
batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv perf_fp16 164 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf46 1.936334272517719 1.5347856729099039 73.46000000000001 1.8599999999999852 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 
gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv perf_fp16 166 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 
12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf47 1.9361393106517242 1.5343020436847534 73.28 2.039999999999992 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv perf_fp16 157 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 
gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf48 1.9468233956242322 1.5418751866142502 73.26 2.059999999999988 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv perf_fp16 162 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 
12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 11 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv perf_fp16 157 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 157 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf49 1.9308405765402588 1.530278084775729 73.24000000000001 2.079999999999984 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv perf_fp16 152 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 11 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm 
fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 11 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv perf_fp16 164 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv perf_fp16 158 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 11 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv perf_fp16 157 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv perf_fp16 165 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 11 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv perf_fp16 164 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv perf_fp16 154 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 11 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv perf_fp16 162 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 158 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv perf_fp16 161 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 
12 add fp16 1 +173 gpu softmax fp32 1 +----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp index c4bd6be08b5afad0367e93f640c54b45e7d41938..b41e0bc96df83a91f5656e7094e914e8d86e6df5 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet.cpp @@ -11,32 +11,36 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(1); void *r = __hpvm__tensor_convolution(t1, t2, 3, 3, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(2); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(3); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_3_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(4); void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); @@ -45,24 +49,27 @@ void var_3_node(void *t1, size_t bytes_t1) { void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(5); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(6); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_6_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(7); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -71,32 +78,36 @@ void var_6_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(8); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_8_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(9); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + 
__hpvm__node_id(10); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(11); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -105,32 +116,36 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(12); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(13); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_13_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(14); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(15); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -139,24 +154,27 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(16); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_16_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(17); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(18); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -165,40 +183,45 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(19); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_19_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(20); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_20_node(void *t1, size_t 
bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(21); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(22); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(23); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -207,32 +230,36 @@ void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_23_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(24); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_24_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(25); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(26); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_26_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(27); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -241,32 +268,36 @@ void var_26_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); + __hpvm__node_id(28); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_28_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); + __hpvm__node_id(29); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_29_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(30); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); + __hpvm__node_id(31); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -275,40 +306,45 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void 
                 *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(32);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_32_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(33);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_33_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(34);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(35);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(36);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -317,32 +353,36 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_36_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(37);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_37_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(38);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(39);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_39_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(40);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -351,32 +391,36 @@ void var_39_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(41);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_41_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(42);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_42_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(43);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_43_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(44);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -385,40 +429,45 @@ void var_43_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(45);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(46);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_46_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(47);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(48);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(49);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -427,32 +476,36 @@ void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_49_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(50);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_50_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(51);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_51_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(52);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_52_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(53);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -461,32 +514,36 @@ void var_52_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_53_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(54);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_54_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(55);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_55_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(56);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_56_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(57);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -495,24 +552,27 @@ void var_56_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_57_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(58);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_58_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(59);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_59_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(60);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -521,40 +581,45 @@ void var_59_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_60_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(61);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_61_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(62);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_62_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(63);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_63_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(64);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_64_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(65);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -563,32 +628,36 @@ void var_64_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_65_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(66);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_66_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(67);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_67_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(68);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_68_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(69);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -597,32 +666,36 @@ void var_68_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_69_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(70);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_70_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(71);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_71_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(72);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_72_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(73);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -631,40 +704,45 @@ void var_72_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_73_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(74);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_74_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(75);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_75_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(76);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_76_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(77);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_77_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(78);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -673,32 +751,36 @@ void var_77_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_78_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(79);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_79_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(80);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_80_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(81);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_81_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(82);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -707,32 +789,36 @@ void var_81_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_82_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(83);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_83_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(84);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_84_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(85);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_85_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(86);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -741,40 +827,45 @@ void var_85_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_86_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(87);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_87_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(88);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_88_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(89);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_89_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(90);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_90_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(91);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -783,32 +874,36 @@ void var_90_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_91_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(92);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_92_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(93);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_93_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(94);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_94_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(95);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -817,32 +912,36 @@ void var_94_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_95_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(96);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_96_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(97);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_97_node(void *t1, size_t bytes_t1, void *t2, size_t
                 bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(98);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_98_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(99);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -851,40 +950,45 @@ void var_98_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_99_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(100);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_100_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(101);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_101_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(102);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_102_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(103);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_103_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(104);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -893,32 +997,36 @@ void var_103_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_104_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(105);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_105_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(106);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_106_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(107);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_107_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(108);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -927,32 +1035,36 @@ void var_107_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_108_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(109);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_109_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(110);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_110_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(111);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_111_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(112);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -961,24 +1073,27 @@ void var_111_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_112_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(113);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_113_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(114);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_114_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(115);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -987,40 +1102,45 @@ void var_114_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_115_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(116);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_116_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(117);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_117_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(118);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_118_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(119);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_119_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(120);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1029,32 +1149,36 @@ void var_119_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_120_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(121);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_121_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(122);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_122_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(123);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_123_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(124);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1063,32 +1187,36 @@ void var_123_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_124_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(125);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_125_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(126);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_126_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(127);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_127_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(128);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1097,40 +1225,45 @@ void var_127_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_128_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(129);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_129_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(130);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_130_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(131);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_131_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(132);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_132_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(133);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1139,32 +1272,36 @@ void var_132_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_133_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(134);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_134_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(135);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_135_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(136);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_136_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(137);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1173,32 +1310,36 @@ void var_136_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_137_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(138);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_138_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(139);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_139_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(140);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_140_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(141);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1207,40 +1348,45 @@ void var_140_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_141_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(142);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_142_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(143);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_143_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(144);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_144_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(145);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_145_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(146);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1249,32 +1395,36 @@ void var_145_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_146_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(147);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_147_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(148);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_148_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(149);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_149_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(150);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1283,32 +1433,36 @@ void var_149_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_150_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(151);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_151_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(152);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_152_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(153);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_153_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(154);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1317,40 +1471,45 @@ void var_153_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_154_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(155);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_155_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(156);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_156_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(157);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_157_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(158);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_158_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(159);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1359,32 +1518,36 @@ void var_158_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_159_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(160);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_160_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(161);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_161_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(162);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_162_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(163);

   void *r =
       __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1393,32 +1556,36 @@ void var_162_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_163_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(164);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_164_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(165);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_165_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(166);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_166_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(167);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1427,40 +1594,45 @@ void var_166_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_167_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(168);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_168_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(169);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_169_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(170);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_170_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(171);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_171_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(172);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1469,32 +1641,36 @@ void var_171_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_172_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(173);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_173_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(174);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_174_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(175);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_175_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(176);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1503,32 +1679,36 @@ void var_175_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_176_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(177);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_177_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(178);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_178_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(179);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_179_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(180);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1537,40 +1717,45 @@ void var_179_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_180_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(181);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_181_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(182);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_182_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(183);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_183_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(184);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_184_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(185);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1579,32 +1764,36 @@ void var_184_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_185_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(186);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_186_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(187);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_187_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(188);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_188_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(189);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1613,32 +1802,36 @@ void var_188_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_189_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(190);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_190_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(191);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_191_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(192);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_192_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(193);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1647,24 +1840,27 @@ void var_192_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_193_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(194);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_194_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(195);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_195_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(196);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1673,40 +1869,45 @@ void var_195_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_196_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(197);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_197_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(198);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_198_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(199);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_199_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(200);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_200_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(201);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1715,32 +1916,36 @@ void var_200_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_201_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(202);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_202_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(203);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_203_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(204);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_204_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(205);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1749,32 +1954,36 @@ void var_204_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_205_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(206);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_206_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(207);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_207_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(208);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_208_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(209);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1783,40 +1992,45 @@ void var_208_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_209_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(210);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_210_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(211);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_211_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(212);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_212_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(213);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_213_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(214);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1825,32 +2039,36 @@ void var_213_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_214_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(215);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_215_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(216);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_216_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(217);

   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_217_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(218);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1859,32 +2077,36 @@ void var_217_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_218_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(219);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_219_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(220);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_220_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(221);

   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_221_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(222);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -1893,56 +2115,63 @@ void var_221_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 void var_222_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
+  __hpvm__node_id(223);

   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_223_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(224);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_224_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(225);

   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_225_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(226);

   void *r = __hpvm__tensor_pool_mean(t1, 7, 7, 0, 0, 7, 7);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_226_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(227);

   void *r = __hpvm__tensor_mul(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_227_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
+  __hpvm__node_id(228);

   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_228_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::CUDNN_TARGET);
+  __hpvm__hint(hpvm::TENSOR_TARGET);
   __hpvm__attributes(1, t1, 0);
+  __hpvm__node_id(229);

   void *r = __hpvm__tensor_softmax(t1);
   __hpvm__return(2, r, (size_t)0);
@@ -4907,8 +5136,8 @@ int main() {

   std::string dir_prefix =
       std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/";
-  std::string input_path = dir_prefix + std::string("input.bin");
-  std::string labels_path = dir_prefix + std::string("labels.bin");
+  std::string input_path = dir_prefix + std::string("tune_input.bin");
+  std::string labels_path = dir_prefix + std::string("tune_labels.bin");
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void *conv2d_1_w =
       readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7);
@@ -6081,14 +6310,13 @@ int main() {
   std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin");
   void *dense_1_b =
       readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1);
-  void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224);
-  uint32_t *labels = readLabels3(labels_path.c_str(), 100);
+
+  // void* input = readTrainedWeights(input_path.c_str(), 0,100,3,224,224);
+  // uint32_t* labels = readLabelsBatch3(labels_path.c_str(),0,100);

   __hpvm__init();

   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));

-  args->input = input;
-  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -6730,14 +6958,39 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;

-  void *dfg = __hpvm__launch(0, root, (void *)args);
+  startMemTracking();
+  startProfiling();
+
+  unsigned int batch_size = 50;
+  unsigned int test_input_size = 1000;
+  unsigned int batch_count = test_input_size / batch_size;
+
+  for (int j = 0; j < 1; j++) {
+    for (int i = 0; i < batch_count; i++) {
+      int start = i * batch_size;
+      int end = (i + 1) * batch_size;
+
+      void *input =
+          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
+
+      args->input = input;
+      args->input_bytes = 0;
-  __hpvm__wait(dfg);
+      void *dfg = __hpvm__launch(0, root, (void *)args);

-  void *result = static_cast<RootIn *>(args)->r.tensor;
-  hpvm_request_tensor(result, 0);
+      __hpvm__wait(dfg);
+      void *result = static_cast<RootIn *>(args)->r.tensor;
+      hpvm_request_tensor(result, 0);
+
+      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
+
+      freeBatchMemory();
+    }
+  }
+
+  stopProfiling();

   __hpvm__cleanup();
-  computeAccuracy3(labels, result);
+
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
similarity index 94%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_loop.cpp
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
index 42bad74ac39511a64ee4fd20e589cec5caf14836..ab613983a0a57673a2575378b6a9a2a3fc04f941 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_loop.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
@@ -11,36 +11,32 @@
 #include <config.h>

 void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(1);

   void *r = __hpvm__tensor_convolution(t1, t2, 3, 3, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }

 void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
similarity index 94%
rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_loop.cpp
rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
index 42bad74ac39511a64ee4fd20e589cec5caf14836..ab613983a0a57673a2575378b6a9a2a3fc04f941 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_loop.cpp
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/resnet50_imagenet/resnet50_imagenet_cudnn.cpp
@@ -11,36 +11,32 @@
 #include <config.h>
 
 void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(1);
 
   void *r = __hpvm__tensor_convolution(t1, t2, 3, 3, 2, 2);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(2);
 
   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_2_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(1, t1, 0);
-  __hpvm__node_id(3);
 
   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_3_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(1, t1, 0);
-  __hpvm__node_id(4);
 
   void *r = __hpvm__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2);
   __hpvm__return(2, r, (size_t)0);
@@ -49,27 +45,24 @@ void var_3_node(void *t1, size_t bytes_t1) {
 
 void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
-  __hpvm__node_id(5);
 
   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_5_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(6);
 
   void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_6_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(7);
 
   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -78,36 +71,32 @@ void var_6_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 
 void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
-  __hpvm__node_id(8);
 
   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_8_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(1, t1, 0);
-  __hpvm__node_id(9);
 
   void *r = __hpvm__tensor_relu(t1);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_9_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(10);
 
   void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(2, t1, t2, 0);
-  __hpvm__node_id(11);
 
   void *r = __hpvm__tensor_add(t1, t2);
   __hpvm__return(2, r, (size_t)0);
@@ -116,36 +105,32 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
 
 void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2,
                 void *t3, size_t bytes_t3, void *t4, size_t bytes_t4,
                 void *t5, size_t bytes_t5) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+  __hpvm__hint(hpvm::CUDNN_TARGET);
   __hpvm__attributes(5, t1, t2, t3, t4, t5, 0);
-  __hpvm__node_id(12);
 
   void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001);
   __hpvm__return(2, r, (size_t)0);
 }
 
 void var_12_node(void *t1, size_t bytes_t1) {
-  __hpvm__hint(hpvm::TENSOR_TARGET);
+
__hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(13); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_13_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(14); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(15); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -154,27 +139,24 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(16); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_16_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(17); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(18); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -183,45 +165,40 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(19); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_19_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(20); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_20_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(21); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(22); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(23); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -230,36 +207,32 @@ void var_22_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_23_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t 
bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(24); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_24_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(25); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(26); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_26_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(27); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -268,36 +241,32 @@ void var_26_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(28); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_28_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(29); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_29_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(30); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(31); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -306,45 +275,40 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(32); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_32_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(33); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_33_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(34); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + 
__hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(35); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(36); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -353,36 +317,32 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_36_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(37); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_37_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(38); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(39); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_39_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(40); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -391,36 +351,32 @@ void var_39_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(41); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_41_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(42); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_42_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(43); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_43_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(44); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -429,45 +385,40 @@ void var_43_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(45); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void 
var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(46); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(47); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(48); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(49); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -476,36 +427,32 @@ void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_49_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(50); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_50_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(51); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_51_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(52); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_52_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(53); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -514,36 +461,32 @@ void var_52_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_53_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(54); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_54_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(55); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_55_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(56); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_56_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + 
__hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(57); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -552,27 +495,24 @@ void var_56_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_57_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(58); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_58_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(59); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_59_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(60); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -581,45 +521,40 @@ void var_59_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_60_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(61); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_61_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(62); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_62_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(63); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_63_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(64); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_64_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(65); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -628,36 +563,32 @@ void var_64_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_65_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(66); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_66_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(67); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_67_node(void 
*t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(68); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_68_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(69); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -666,36 +597,32 @@ void var_68_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_69_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(70); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_70_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(71); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_71_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(72); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_72_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(73); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -704,45 +631,40 @@ void var_72_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_73_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(74); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_74_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(75); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_75_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(76); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_76_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(77); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_77_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(78); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -751,36 +673,32 @@ void var_77_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_78_node(void *t1, size_t 
bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(79); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_79_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(80); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_80_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(81); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_81_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(82); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -789,36 +707,32 @@ void var_81_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_82_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(83); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_83_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(84); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_84_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(85); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_85_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(86); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -827,45 +741,40 @@ void var_85_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_86_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(87); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_87_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(88); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_88_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(89); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_89_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - 
__hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(90); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_90_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(91); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -874,36 +783,32 @@ void var_90_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_91_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(92); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_92_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(93); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_93_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(94); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_94_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(95); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -912,36 +817,32 @@ void var_94_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_95_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(96); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_96_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(97); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_97_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(98); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_98_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(99); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -950,45 +851,40 @@ void var_98_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_99_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(100); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); 
__hpvm__return(2, r, (size_t)0); } void var_100_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(101); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_101_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(102); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_102_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(103); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_103_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(104); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -997,36 +893,32 @@ void var_103_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_104_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(105); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_105_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(106); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_106_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(107); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_107_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(108); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1035,36 +927,32 @@ void var_107_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_108_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(109); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_109_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(110); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_110_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(111); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_111_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - 
__hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(112); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1073,27 +961,24 @@ void var_111_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_112_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(113); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_113_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(114); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_114_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(115); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1102,45 +987,40 @@ void var_114_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_115_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(116); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_116_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(117); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_117_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(118); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_118_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(119); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_119_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(120); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1149,36 +1029,32 @@ void var_119_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_120_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(121); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_121_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(122); void *r = 
__hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_122_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(123); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_123_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(124); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1187,36 +1063,32 @@ void var_123_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_124_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(125); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_125_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(126); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_126_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(127); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_127_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(128); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1225,45 +1097,40 @@ void var_127_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_128_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(129); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_129_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(130); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_130_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(131); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_131_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(132); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_132_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(133); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1272,36 
+1139,32 @@ void var_132_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_133_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(134); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_134_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(135); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_135_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(136); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_136_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(137); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1310,36 +1173,32 @@ void var_136_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_137_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(138); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_138_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(139); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_139_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(140); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_140_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(141); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1348,45 +1207,40 @@ void var_140_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_141_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(142); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_142_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(143); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_143_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - 
__hpvm__node_id(144); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_144_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(145); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_145_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(146); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1395,36 +1249,32 @@ void var_145_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_146_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(147); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_147_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(148); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_148_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(149); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_149_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(150); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1433,36 +1283,32 @@ void var_149_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_150_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(151); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_151_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(152); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_152_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(153); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_153_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(154); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1471,45 +1317,40 @@ void var_153_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_154_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - 
__hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(155); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_155_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(156); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_156_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(157); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_157_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(158); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_158_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(159); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1518,36 +1359,32 @@ void var_158_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_159_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(160); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_160_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(161); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_161_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(162); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_162_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(163); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1556,36 +1393,32 @@ void var_162_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_163_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(164); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_164_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(165); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_165_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, 
t1, t2, 0); - __hpvm__node_id(166); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_166_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(167); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1594,45 +1427,40 @@ void var_166_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_167_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(168); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_168_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(169); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_169_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(170); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_170_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(171); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_171_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(172); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1641,36 +1469,32 @@ void var_171_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_172_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(173); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_173_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(174); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_174_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(175); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_175_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(176); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1679,36 +1503,32 @@ void var_175_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_176_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t 
bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(177); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_177_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(178); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_178_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(179); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_179_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(180); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1717,45 +1537,40 @@ void var_179_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_180_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(181); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_181_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(182); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_182_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(183); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_183_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(184); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_184_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(185); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1764,36 +1579,32 @@ void var_184_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_185_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(186); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_186_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(187); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_187_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); 
__hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(188); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_188_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(189); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1802,36 +1613,32 @@ void var_188_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_189_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(190); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_190_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(191); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_191_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(192); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_192_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(193); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1840,27 +1647,24 @@ void var_192_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_193_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(194); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_194_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(195); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 2, 2); __hpvm__return(2, r, (size_t)0); } void var_195_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(196); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1869,45 +1673,40 @@ void var_195_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_196_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(197); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_197_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(198); void *r = __hpvm__tensor_add(t1, t2); 
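// Background note (an assumption about the backends, not something these
// hunks show): under an hpvm::CUDNN_TARGET hint, tensor intrinsics such as
// the __hpvm__tensor_add above are meant to lower to the cuDNN-backed tensor
// runtime, while hpvm::TENSOR_TARGET routes a node to the wrapper-API path
// where per-layer approximation choices can be applied.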
__hpvm__return(2, r, (size_t)0); } void var_198_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(199); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_199_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(200); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_200_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(201); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1916,36 +1715,32 @@ void var_200_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_201_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(202); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_202_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(203); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_203_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(204); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_204_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(205); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1954,36 +1749,32 @@ void var_204_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_205_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(206); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_206_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(207); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_207_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(208); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_208_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(209); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -1992,45 +1783,40 @@ void var_208_node(void *t1, size_t bytes_t1, void 
*t2, size_t bytes_t2) { void var_209_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(210); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_210_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(211); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_211_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(212); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_212_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(213); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_213_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(214); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -2039,36 +1825,32 @@ void var_213_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_214_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(215); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_215_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(216); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_216_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(217); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_217_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(218); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -2077,36 +1859,32 @@ void var_217_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_218_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(219); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_219_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(220); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, 
(size_t)0); } void var_220_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(221); void *r = __hpvm__tensor_convolution(t1, t2, 0, 0, 1, 1); __hpvm__return(2, r, (size_t)0); } void var_221_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(222); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); @@ -2115,63 +1893,56 @@ void var_221_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { void var_222_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2, void *t3, size_t bytes_t3, void *t4, size_t bytes_t4, void *t5, size_t bytes_t5) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(5, t1, t2, t3, t4, t5, 0); - __hpvm__node_id(223); void *r = __hpvm__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __hpvm__return(2, r, (size_t)0); } void var_223_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(224); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_224_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(225); void *r = __hpvm__tensor_relu(t1); __hpvm__return(2, r, (size_t)0); } void var_225_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(226); void *r = __hpvm__tensor_pool_mean(t1, 7, 7, 0, 0, 7, 7); __hpvm__return(2, r, (size_t)0); } void var_226_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(227); void *r = __hpvm__tensor_mul(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_227_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); - __hpvm__node_id(228); void *r = __hpvm__tensor_add(t1, t2); __hpvm__return(2, r, (size_t)0); } void var_228_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); - __hpvm__node_id(229); void *r = __hpvm__tensor_softmax(t1); __hpvm__return(2, r, (size_t)0); @@ -5136,8 +4907,8 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/resnet50_imagenet/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7); @@ -6310,13 +6081,14 @@ int main() { std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1); - - // void* input = readTrainedWeights(input_path.c_str(), 0,100,3,224,224); - // uint32_t* labels = readLabelsBatch3(labels_path.c_str(),0,100); 
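// Note on the main() rewrite that follows: the resnet50_imagenet driver stops
// reading input.bin/labels.bin in 50-image batches (1000 test inputs) under
// startProfiling()/llvm_hpvm_invokeRtControl() control, and instead loads the
// 100-image tune set (tune_input.bin / tune_labels.bin) once, launches the
// DFG a single time, and scores the result with computeAccuracy3() at exit.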
+  void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224);
+  uint32_t *labels = readLabels3(labels_path.c_str(), 100);
 
   __hpvm__init();
 
   RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn)));
+  args->input = input;
+  args->input_bytes = 0;
   args->conv2d_1_w = conv2d_1_w;
   args->conv2d_1_w_bytes = 0;
   args->conv2d_1_b = conv2d_1_b;
@@ -6958,39 +6730,14 @@ int main() {
   args->dense_1_b = dense_1_b;
   args->dense_1_b_bytes = 0;
 
-  startMemTracking();
-  startProfiling();
-
-  unsigned int batch_size = 50;
-  unsigned int test_input_size = 1000;
-  unsigned int batch_count = test_input_size / batch_size;
-
-  for (int j = 0; j < 1; j++) {
-    for (int i = 0; i < batch_count; i++) {
-      int start = i * batch_size;
-      int end = (i + 1) * batch_size;
-
-      void *input =
-          readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224);
-
-      args->input = input;
-      args->input_bytes = 0;
+  void *dfg = __hpvm__launch(0, root, (void *)args);
 
-      void *dfg = __hpvm__launch(0, root, (void *)args);
+  __hpvm__wait(dfg);
 
-      __hpvm__wait(dfg);
+  void *result = static_cast<RootIn *>(args)->r.tensor;
+  hpvm_request_tensor(result, 0);
 
-      void *result = static_cast<RootIn *>(args)->r.tensor;
-      hpvm_request_tensor(result, 0);
-
-      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
-
-      freeBatchMemory();
-    }
-  }
-
-  stopProfiling();
   __hpvm__cleanup();
-
+  computeAccuracy3(labels, result);
 
   return 0;
 }
diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/data/tuner_confs.txt
index c9a6612a5df150f58c69e1a7faeaf83ed5c7d605..2b325a9fe2d122e74cdd2b80e2768e68591313bf 100644
--- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/data/tuner_confs.txt
+++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/data/tuner_confs.txt
@@ -1,38 +1,913 @@
+3776.508929999999
 +++++
-conf1 1 0 90.19 0
-1 gpu conv fp32 1 add fp32 1 relu fp32 1
-2 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
-3 gpu conv fp32 1 add fp32 1 relu fp32 1
-4 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
-5 gpu conv fp32 1 add fp32 1 relu fp32 1
-6 gpu conv fp32 1 add fp32 1 relu fp32 1
-7 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
-8 gpu conv fp32 1 add fp32 1 relu fp32 1
-9 gpu conv fp32 1 add fp32 1 relu fp32 1
-10 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
-11 gpu conv fp32 1 add fp32 1 relu fp32 1
-12 gpu conv fp32 1 add fp32 1 relu fp32 1
-13 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1
-14 gpu mul fp32 1 add fp32 1 relu fp32 1
-15 gpu mul fp32 1 add fp32 1
-16 gpu softmax fp32 1
------
-+++++
-conf2 1.5 0 90.19 0
-1 gpu conv fp16 1 add fp16 1 relu fp16 1
-2 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
-3 gpu conv fp16 1 add fp16 1 relu fp16 1
-4 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
-5 gpu conv fp16 1 add fp16 1 relu fp16 1
-6 gpu conv fp16 1 add fp16 1 relu fp16 1
-7 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
-8 gpu conv fp16 1 add fp16 1 relu fp16 1
-9 gpu conv fp16 1 add fp16 1 relu fp16 1
-10 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
-11 gpu conv fp16 1 add fp16 1 relu fp16 1
-12 gpu conv fp16 1 add fp16 1 relu fp16 1
-13 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1
-14 gpu mul fp16 1 add fp16 1 relu fp16 1
-15 gpu mul fp16 1 add fp16 1
+conf1 1 1 89.96 0.0
+1 gpu conv fp32 11 add fp32 1 relu fp32 1
+2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1
+3 gpu conv fp32 11 add fp32 1 relu fp32 1
+4 gpu conv
fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 +16 gpu softmax fp32 1 +----- ++++++ +conf2 2.1225958306417145 1.9771056444390926 89.91 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf3 2.090180991844805 1.9532689756636086 89.82 0.14000000000000057 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf4 2.169931036393396 2.0048851858669283 89.53999999999999 0.4200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf5 2.1012179398201756 1.9325098819632314 89.42 0.539999999999992 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 
relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf6 2.2313002482945326 2.069581185407626 89.38000000000001 0.5799999999999841 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf7 2.143061101834193 1.9675759235961738 89.3 0.6599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf8 2.199379444387758 2.0314348091429677 89.2 0.7599999999999909 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf9 2.3236298452294624 2.156907976575644 89.03999999999999 0.9200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv 
samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf10 2.3224369486241603 2.1560351277882046 89.03999999999999 0.9200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf11 2.358467412507993 2.1904290636262784 89.02 0.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf12 2.3633503986583126 2.1980949050120437 88.88000000000001 1.079999999999984 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf13 2.4903388172036043 2.3063593441573564 88.82 1.1400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv 
samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf14 2.508156996742662 2.3204109539869595 88.78 1.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf15 2.4818531813049622 2.2910866330696744 88.75999999999999 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf16 2.4591564896606 2.272664410995804 88.74 1.2199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf17 2.5370582721089496 2.3464665753522405 88.72 1.2399999999999949 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv 
samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf18 2.438100014978735 2.257620696759345 88.7 1.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf19 2.4776935382337006 2.2949598026093168 88.7 1.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf20 2.4380041604279596 2.254330054479329 88.68 1.279999999999987 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf21 2.4745444350223327 2.2883888475386525 88.64 1.3199999999999932 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 
1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf22 2.4136652022060625 2.2360545757445407 88.52 1.4399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf23 2.510093966915115 2.316437144001897 88.52 1.4399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf24 2.475990790728594 2.28127562431577 88.5 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf25 2.4761929121466926 2.290365501363375 88.5 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv 
perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf26 2.4763575559033875 2.291312348847263 88.5 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf27 2.600249602991055 2.4123747341424644 88.06 1.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf28 2.596077615026303 2.4115375655840245 88.02 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf29 2.580888020555937 2.3840829703999833 87.88 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv 
perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf30 2.556352783745439 2.3641413704751537 87.8 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf31 2.5559756082494527 2.3677471703724575 87.78 2.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 11 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf32 2.597413373332546 2.4091972878097585 87.76 2.1999999999999886 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf33 2.4797467027434656 2.2874608793842612 87.74 2.219999999999999 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv 
perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf34 2.593675604602072 2.400513932866452 87.7 2.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf35 2.6300759173431336 2.432687374579977 87.62 2.339999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf36 2.5907083037103864 2.4042762580264356 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf37 2.6143261650366187 2.423427684623993 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv 
perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf38 2.6144436259117203 2.4231961521843344 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf39 2.662088796913144 2.4660859696742032 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf40 2.6210428708834517 2.423389791646294 87.58 2.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf41 2.6399924349243533 2.4443864221157914 87.58 2.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu 
conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf42 2.616443708384916 2.4217582570150697 87.58 2.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf43 2.6883473596205225 2.5036952786284137 87.5 2.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf44 2.6117356623585875 2.420771216556161 87.48 2.4799999999999898 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf45 2.6359174040106708 2.444231592562593 87.48 2.4799999999999898 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu 
fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf46 2.56504192294198 2.371871906722655 87.44 2.519999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf47 2.5652588453899727 2.3816996471861174 87.44 2.519999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf48 2.68806951500876 2.5007647690311425 87.14 2.819999999999993 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 16 gpu softmax fp32 1 ----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp index f1533c75b4b838f5b86dfbf915cfd359b9682636..13d150e7a946296e8ce5c7fb9e128a91dedbe534 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp +++ 
b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1) { } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); 
void *r = __hpvm__tensor_add(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1) { } void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 
2); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -219,7 +219,7 @@ void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_26_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -227,7 +227,7 @@ void var_26_node(void *t1, size_t bytes_t1) { } void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -235,7 +235,7 @@ void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -243,7 +243,7 @@ void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_29_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -251,7 +251,7 @@ void var_29_node(void *t1, size_t bytes_t1) { } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -259,7 +259,7 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -267,7 +267,7 @@ void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_32_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -275,7 +275,7 @@ void var_32_node(void *t1, size_t bytes_t1) { } void var_33_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -283,7 +283,7 @@ void var_33_node(void *t1, size_t bytes_t1) { } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -291,7 +291,7 @@ void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); 
void *r = __hpvm__tensor_add(t1, t2); @@ -299,7 +299,7 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_36_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -307,7 +307,7 @@ void var_36_node(void *t1, size_t bytes_t1) { } void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -315,7 +315,7 @@ void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -323,7 +323,7 @@ void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_39_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -331,7 +331,7 @@ void var_39_node(void *t1, size_t bytes_t1) { } void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -339,7 +339,7 @@ void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -347,7 +347,7 @@ void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_42_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -355,7 +355,7 @@ void var_42_node(void *t1, size_t bytes_t1) { } void var_43_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -363,7 +363,7 @@ void var_43_node(void *t1, size_t bytes_t1) { } void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -371,7 +371,7 @@ void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -379,7 +379,7 @@ void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -387,7 +387,7 @@ void var_46_node(void *t1, size_t bytes_t1) { } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ 
-395,7 +395,7 @@ void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -830,8 +830,8 @@ typedef struct __attribute__((__packed__)) { int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); @@ -920,14 +920,14 @@ int main() { readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); - void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32); - uint32_t *labels = readLabels3(labels_path.c_str(), 2000); + // void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32); + // uint32_t* labels = readLabels3(labels_path.c_str(),2000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - args->input = input; - args->input_bytes = 0; + // args->input = input; + // args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -989,27 +989,40 @@ int main() { args->dense_2_b = dense_2_b; args->dense_2_b_bytes = 0; + int batch_size = 500; + int test_input_size = 5000; + int batch_count = test_input_size / batch_size; + startMemTracking(); startProfiling(); - input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32); + for (int j = 0; j < 1; j++) { + for (int i = 0; i < batch_count; i++) { - args->input = input; - args->input_bytes = 0; + int start = i * batch_size; + int end = (i + 1) * batch_size; - void *dfg = __hpvm__launch(0, root, (void *)args); + // copyInputBatch(input_path.c_str(),start,end,3,32,32, input); + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); - __hpvm__wait(dfg); + args->input = input; + args->input_bytes = 0; - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); + void *dfg = __hpvm__launch(0, root, (void *)args); - computeAccuracy3(labels, result); + __hpvm__wait(dfg); - freeBatchMemory(); + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - stopProfiling(); + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); + + freeBatchMemory(); + } + } + stopProfiling(); __hpvm__cleanup(); return 0; diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp similarity index 91% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp index 3a853d3a0f5399057164594951a884222a02e105..c1cb38327dc94938934486f3022e4e9cb360f517 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_loop.cpp +++ 
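The hunks above convert main() in vgg16_cifar10.cpp from a single 2000-image run into a batched driver over a 5000-image tune set, and hand the accuracy check previously done by computeAccuracy3 to the runtime controller. Assembled from the added lines (dropping the outer single-iteration j loop), and assuming the declarations earlier in main(), the control flow is:

    int batch_size = 500;
    int test_input_size = 5000;
    int batch_count = test_input_size / batch_size; // 10 batches of 500

    startMemTracking();
    startProfiling();
    for (int i = 0; i < batch_count; i++) {
      int start = i * batch_size;
      int end = (i + 1) * batch_size;

      // Read only this batch of 3x32x32 images from tune_input.bin.
      void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
      args->input = input;
      args->input_bytes = 0;

      void *dfg = __hpvm__launch(0, root, (void *)args);
      __hpvm__wait(dfg);

      void *result = static_cast<RootIn *>(args)->r.tensor;
      hpvm_request_tensor(result, 0); // copy the output tensor back to the host

      // Runtime control replaces the explicit computeAccuracy3(labels, result)
      // call, scoring the batch against the labels for [start, end).
      llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);

      freeBatchMemory(); // release per-batch tensor allocations
    }
    stopProfiling();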
b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_cudnn.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1) { } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, 
t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1) { } void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 
0, 0, 2, 2); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -219,7 +219,7 @@ void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_26_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -227,7 +227,7 @@ void var_26_node(void *t1, size_t bytes_t1) { } void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -235,7 +235,7 @@ void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -243,7 +243,7 @@ void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_29_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -251,7 +251,7 @@ void var_29_node(void *t1, size_t bytes_t1) { } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -259,7 +259,7 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -267,7 +267,7 @@ void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_32_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -275,7 +275,7 @@ void var_32_node(void *t1, size_t bytes_t1) { } void var_33_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -283,7 +283,7 @@ void var_33_node(void *t1, size_t bytes_t1) { } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -291,7 +291,7 @@ void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, 
t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -299,7 +299,7 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_36_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -307,7 +307,7 @@ void var_36_node(void *t1, size_t bytes_t1) { } void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -315,7 +315,7 @@ void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -323,7 +323,7 @@ void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_39_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -331,7 +331,7 @@ void var_39_node(void *t1, size_t bytes_t1) { } void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -339,7 +339,7 @@ void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -347,7 +347,7 @@ void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_42_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -355,7 +355,7 @@ void var_42_node(void *t1, size_t bytes_t1) { } void var_43_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -363,7 +363,7 @@ void var_43_node(void *t1, size_t bytes_t1) { } void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -371,7 +371,7 @@ void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -379,7 +379,7 @@ void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -387,7 +387,7 @@ void var_46_node(void *t1, size_t bytes_t1) { } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, 
t2); @@ -395,7 +395,7 @@ void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -829,10 +829,9 @@ typedef struct __attribute__((__packed__)) { int main() { - std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/"; - - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/"; + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); @@ -918,18 +917,17 @@ int main() { void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); void *dense_2_w = - readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100); + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); - - // void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32); - // uint32_t* labels = readLabels3(labels_path.c_str(),2000); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); + void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32); + uint32_t *labels = readLabels3(labels_path.c_str(), 2000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - // args->input = input; - // args->input_bytes = 0; + args->input = input; + args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -991,39 +989,27 @@ int main() { args->dense_2_b = dense_2_b; args->dense_2_b_bytes = 0; - int batch_size = 500; - int test_input_size = 5000; - int batch_count = test_input_size / batch_size; - startMemTracking(); startProfiling(); - for (int j = 0; j < 14; j++) { - for (int i = 0; i < batch_count; i++) { + input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32); - int start = i * batch_size; - int end = (i + 1) * batch_size; + args->input = input; + args->input_bytes = 0; - void *input = - readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + void *dfg = __hpvm__launch(0, root, (void *)args); - args->input = input; - args->input_bytes = 0; + __hpvm__wait(dfg); - void *dfg = __hpvm__launch(0, root, (void *)args); + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - __hpvm__wait(dfg); + computeAccuracy3(labels, result); - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); - - llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - - freeBatchMemory(); - } - } + freeBatchMemory(); stopProfiling(); + __hpvm__cleanup(); return 0; diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/data/tuner_confs.txt index 2662b4ba78dc54686d61f45242fb38f4ca75402c..2c29bedd096aec2c7f66afbe729353e372fac403 100644 --- 
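The rename above turns vgg16_cifar100_loop.cpp into vgg16_cifar10_cudnn.cpp: every node hint flips from TENSOR_TARGET back to CUDNN_TARGET, the final dense layer shrinks from 100 to 10 outputs, and main() reverts to a single pass over 2000 images scored with computeAccuracy3. Per node, the hint is the only change; a sketch of one node follows (the trailing __hpvm__return is elided by the hunks and reconstructed here as the benchmarks' usual pattern, an assumption):

    void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) {
      __hpvm__hint(hpvm::CUDNN_TARGET); // TENSOR_TARGET in vgg16_cifar10.cpp
      __hpvm__attributes(2, t1, t2, 0);
      void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1);
      __hpvm__return(2, r, (size_t)0);
    }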
a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/data/tuner_confs.txt +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/data/tuner_confs.txt @@ -1,39 +1,970 @@ -2000 -+++++ -conf1 1 0 90.19 0 -1 gpu conv fp32 1 add fp32 1 relu fp32 1 -2 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -3 gpu conv fp32 1 add fp32 1 relu fp32 1 -4 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -5 gpu conv fp32 1 add fp32 1 relu fp32 1 -6 gpu conv fp32 1 add fp32 1 relu fp32 1 -7 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -8 gpu conv fp32 1 add fp32 1 relu fp32 1 -9 gpu conv fp32 1 add fp32 1 relu fp32 1 -10 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -11 gpu conv fp32 1 add fp32 1 relu fp32 1 -12 gpu conv fp32 1 add fp32 1 relu fp32 1 -13 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 -14 gpu mul fp32 1 add fp32 1 relu fp32 1 -15 gpu mul fp32 1 add fp32 1 -16 gpu softmax fp32 1 ------ -+++++ -conf2 1.5 0 90.19 0 -1 gpu conv fp16 1 add fp16 1 relu fp16 1 -2 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 -3 gpu conv fp16 1 add fp16 1 relu fp16 1 -4 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 -5 gpu conv fp16 1 add fp16 1 relu fp16 1 -6 gpu conv fp16 1 add fp16 1 relu fp16 1 -7 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 -8 gpu conv fp16 1 add fp16 1 relu fp16 1 -9 gpu conv fp16 1 add fp16 1 relu fp16 1 -10 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 -11 gpu conv fp16 1 add fp16 1 relu fp16 1 -12 gpu conv fp16 1 add fp16 1 relu fp16 1 -13 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 -14 gpu mul fp16 1 add fp16 1 relu fp16 1 -15 gpu mul fp16 1 add fp16 1 +3768.819777999999 ++++++ +conf1 1 1 66.5 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 +16 gpu softmax fp32 1 +----- ++++++ +conf2 2.2877724452131787 2.08025704453875 66.45 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf3 2.5314658805383816 2.30737681453141 66.45 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv 
fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf4 2.044123178914057 1.8616966918258782 66.32000000000001 0.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf5 2.231179358259141 2.0317825813373864 66.18 0.3199999999999932 +1 gpu conv fp16 11 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf6 2.2474834421641057 2.0338639876373272 65.88000000000001 0.6199999999999903 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf7 2.22281439516094 2.0205460706906377 65.88000000000001 0.6199999999999903 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 
1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf8 2.1625085012968484 1.94560449637282 65.88000000000001 0.6199999999999903 +1 gpu conv fp16 11 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv fp16 11 add fp16 1 relu fp16 1 +10 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf9 2.639337323402163 2.3960416499256825 65.8 0.7000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf10 2.672718090670276 2.4276905528801507 65.68 0.8199999999999932 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf11 2.699089631751789 2.446114054498494 65.68 0.8199999999999932 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 
1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf12 2.6003752638648767 2.3553067802112344 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf13 2.638763904718665 2.395072565223988 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf14 2.6003752638648767 2.3553067802112344 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf15 2.6003752638648767 2.3553067802112344 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 
gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf16 2.6732183804279006 2.4287517162140326 65.62 0.8799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf17 2.6728394017929027 2.428768169588016 65.60000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf18 2.4549989178389238 2.2406620346549433 65.56 0.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf19 2.673556689244081 2.429092581627209 65.52 0.980000000000004 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 
relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf20 2.6525635304451756 2.406830663552284 65.5 1.0
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 263 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf21 2.6692288605087553 2.423462800937785 65.5 1.0
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf22 2.583650505571873 2.3471533059252194 65.48 1.019999999999996
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 263 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf23 2.6474572655420125 2.400471260394867 65.48 1.019999999999996
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf24 2.4710116424304736 2.2555966923178996 65.46 1.0400000000000063
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 266 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf25 2.557911102074785 2.3292661683311526 65.46 1.0400000000000063
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf26 2.6032957018479532 2.367574146141511 65.44 1.0600000000000023
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 163 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf27 2.6029968728098916 2.3672068592437223 65.44 1.0600000000000023
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 164 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf28 2.602540311129756 2.3691028781436954 65.44 1.0600000000000023
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 167 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf29 2.602756708588441 2.3708111025211718 65.44 1.0600000000000023
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 168 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf30 2.603240857443844 2.3662875785790183 65.44 1.0600000000000023
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf31 2.602882717372841 2.368011704225619 65.44 1.0600000000000023
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf32 2.67999343314603 2.4305182001043826 65.4 1.0999999999999943
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf33 2.670314990364046 2.4275308713267485 65.38000000000001 1.1199999999999903
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf34 2.650982630033638 2.405821467700663 65.36 1.1400000000000006
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf35 2.6507266317871756 2.405938171802741 65.36 1.1400000000000006
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf36 2.6523068534836174 2.406695716686769 65.34 1.1599999999999966
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf37 2.6533198495191073 2.4077689394073865 65.34 1.1599999999999966
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf38 2.64630900155657 2.4073892305914986 65.32 1.1800000000000068
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf39 2.6725522534379413 2.42903505877629 65.32 1.1800000000000068
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf40 2.6435249267602225 2.403536258709464 65.3 1.2000000000000028
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 161 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf41 2.6442059720503557 2.4037376163252024 65.3 1.2000000000000028
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf42 2.6536933126724027 2.4077527693156053 65.3 1.2000000000000028
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf43 2.6442798101298948 2.4056031584129225 65.3 1.2000000000000028
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf44 2.603921271336049 2.3665955131107683 65.28 1.2199999999999989
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf45 2.4967248028856828 2.2748997625822716 65.25999999999999 1.240000000000009
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf46 2.4963953691980665 2.2764932409573166 65.25999999999999 1.240000000000009
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf47 2.678944927989822 2.4251978482969956 65.24 1.2600000000000051
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf48 2.6727135417173904 2.428897140422096 65.22 1.2800000000000011
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf49 2.600256135586627 2.355428067042657 65.16 1.3400000000000034
+1 gpu conv fp16 12 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1
+12 gpu conv fp16 11 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf50 2.264460006128871 2.058037581586567 64.9 1.5999999999999943
+1 gpu conv fp16 11 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv perf_fp16 165 add fp16 1 relu fp16 1
+4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+5 gpu conv fp16 12 add fp16 1 relu fp16 1
+6 gpu conv fp16 12 add fp16 1 relu fp16 1
+7 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1
+8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1
+10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1
+11 gpu conv perf_fp16 164 add fp16 1 relu fp16 1
+12 gpu conv samp_fp16 263 add fp16 1 relu fp16 1
+13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1
+14 gpu mul fp16 12 add fp16 1 relu fp16 1
+15 gpu mul fp16 12 add fp16 1
+16 gpu softmax fp32 1
+-----
++++++
+conf51 2.2817447204106736 2.0758846029697513 64.84 1.6599999999999966
+1 gpu conv fp16 11 add fp16 1 relu fp16 1
+2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1
+3 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 16 gpu softmax fp32 1 ----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp index 41fe9ae0f34c5c5086f8c16491f5035d5a382702..6e26f89b755db90853ce90180ab179b6df421827 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void 
var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1) { } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1) { } void var_20_node(void *t1, size_t 
bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -219,7 +219,7 @@ void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_26_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -227,7 +227,7 @@ void var_26_node(void *t1, size_t bytes_t1) { } void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -235,7 +235,7 @@ void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -243,7 +243,7 @@ void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_29_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -251,7 +251,7 @@ void var_29_node(void *t1, size_t bytes_t1) { } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -259,7 +259,7 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -267,7 +267,7 @@ void var_31_node(void *t1, size_t bytes_t1, void 
*t2, size_t bytes_t2) { } void var_32_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -275,7 +275,7 @@ void var_32_node(void *t1, size_t bytes_t1) { } void var_33_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -283,7 +283,7 @@ void var_33_node(void *t1, size_t bytes_t1) { } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -291,7 +291,7 @@ void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -299,7 +299,7 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_36_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -307,7 +307,7 @@ void var_36_node(void *t1, size_t bytes_t1) { } void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -315,7 +315,7 @@ void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -323,7 +323,7 @@ void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_39_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -331,7 +331,7 @@ void var_39_node(void *t1, size_t bytes_t1) { } void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -339,7 +339,7 @@ void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -347,7 +347,7 @@ void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_42_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -355,7 +355,7 @@ void var_42_node(void *t1, size_t bytes_t1) { } void var_43_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -363,7 +363,7 @@ void var_43_node(void *t1, size_t bytes_t1) { } void var_44_node(void *t1, size_t 
bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -371,7 +371,7 @@ void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -379,7 +379,7 @@ void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -387,7 +387,7 @@ void var_46_node(void *t1, size_t bytes_t1) { } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -395,7 +395,7 @@ void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -831,8 +831,8 @@ int main() { std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); @@ -922,14 +922,14 @@ int main() { std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); - void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32); - uint32_t *labels = readLabels3(labels_path.c_str(), 2000); + // void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32); + // uint32_t* labels = readLabels3(labels_path.c_str(),2000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - args->input = input; - args->input_bytes = 0; + // args->input = input; + // args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -991,14 +991,40 @@ int main() { args->dense_2_b = dense_2_b; args->dense_2_b_bytes = 0; - void *dfg = __hpvm__launch(0, root, (void *)args); + int batch_size = 500; + int test_input_size = 5000; + int batch_count = test_input_size / batch_size; + + startMemTracking(); + startProfiling(); + + for (int j = 0; j < 14; j++) { + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + args->input = input; + args->input_bytes = 0; + + void *dfg = __hpvm__launch(0, root, (void *)args); + + __hpvm__wait(dfg); + + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); + + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - __hpvm__wait(dfg); - void *result = static_cast<RootIn *>(args)->r.tensor; - 
hpvm_request_tensor(result, 0); + freeBatchMemory(); + } + } + stopProfiling(); __hpvm__cleanup(); - computeAccuracy3(labels, result); return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp similarity index 90% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp index 059bff6d22a51853090700072d4cf3915ed5f796..326542a03852d97dbce2dacf4da913005f9ef28a 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar10/vgg16_cifar10_loop.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_cifar100/vgg16_cifar100_cudnn.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t 
bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1) { } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1) { } void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_21_node(void *t1, size_t 
bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -219,7 +219,7 @@ void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_26_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -227,7 +227,7 @@ void var_26_node(void *t1, size_t bytes_t1) { } void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -235,7 +235,7 @@ void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -243,7 +243,7 @@ void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_29_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -251,7 +251,7 @@ void var_29_node(void *t1, size_t bytes_t1) { } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -259,7 +259,7 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -267,7 +267,7 @@ void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_32_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -275,7 +275,7 @@ void var_32_node(void *t1, size_t bytes_t1) { } void var_33_node(void *t1, size_t bytes_t1) { - 
__hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -283,7 +283,7 @@ void var_33_node(void *t1, size_t bytes_t1) { } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -291,7 +291,7 @@ void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -299,7 +299,7 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_36_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -307,7 +307,7 @@ void var_36_node(void *t1, size_t bytes_t1) { } void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -315,7 +315,7 @@ void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -323,7 +323,7 @@ void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_39_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -331,7 +331,7 @@ void var_39_node(void *t1, size_t bytes_t1) { } void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -339,7 +339,7 @@ void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -347,7 +347,7 @@ void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_42_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -355,7 +355,7 @@ void var_42_node(void *t1, size_t bytes_t1) { } void var_43_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -363,7 +363,7 @@ void var_43_node(void *t1, size_t bytes_t1) { } void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -371,7 +371,7 @@ void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_45_node(void *t1, size_t bytes_t1, 
void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -379,7 +379,7 @@ void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -387,7 +387,7 @@ void var_46_node(void *t1, size_t bytes_t1) { } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -395,7 +395,7 @@ void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -829,9 +829,10 @@ typedef struct __attribute__((__packed__)) { int main() { - std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar10/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_cifar100/"; + + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); @@ -917,17 +918,18 @@ int main() { void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); void *dense_2_w = - readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100); std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); - // void* input = readTrainedWeights(input_path.c_str(), 0,2000,3,32,32); - // uint32_t* labels = readLabels3(labels_path.c_str(),2000); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); + + void *input = readTrainedWeights(input_path.c_str(), 0, 2000, 3, 32, 32); + uint32_t *labels = readLabels3(labels_path.c_str(), 2000); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - // args->input = input; - // args->input_bytes = 0; + args->input = input; + args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -989,41 +991,14 @@ int main() { args->dense_2_b = dense_2_b; args->dense_2_b_bytes = 0; - int batch_size = 500; - int test_input_size = 5000; - int batch_count = test_input_size / batch_size; - - startMemTracking(); - startProfiling(); - - for (int j = 0; j < 1; j++) { - for (int i = 0; i < batch_count; i++) { - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - // copyInputBatch(input_path.c_str(),start,end,3,32,32, input); - void *input = - readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); - - args->input = input; - args->input_bytes = 0; - - void *dfg = __hpvm__launch(0, root, (void *)args); - - __hpvm__wait(dfg); - - void *result = static_cast<RootIn *>(args)->r.tensor; - 
hpvm_request_tensor(result, 0); - - llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); + void *dfg = __hpvm__launch(0, root, (void *)args); - freeBatchMemory(); - } - } + __hpvm__wait(dfg); + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - stopProfiling(); __hpvm__cleanup(); + computeAccuracy3(labels, result); return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/data/tuner_confs.txt b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/data/tuner_confs.txt index cf93cd1286cb6f1358a46cde5991d19ab451c78a..108a101c810f4ebe488e6f2029be4d970d7869a2 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/data/tuner_confs.txt +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/data/tuner_confs.txt @@ -19,3 +19,543 @@ conf1 1 1 72.84 0.0 16 gpu mul fp32 11 add fp32 1 17 gpu softmax fp32 1 ----- ++++++ +conf2 2.0787477568568082 1.7725701909562666 72.76 0.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf3 2.2877881266029436 1.9268677640464096 72.04 0.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf4 2.493698381711785 2.0336802939709626 72.02 0.8200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 
gpu softmax fp32 1 +----- ++++++ +conf5 2.164723960411776 1.8442442134020163 71.94 0.9000000000000057 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf6 2.53794461743687 2.069640641367895 71.67999999999999 1.1600000000000108 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf7 1.7943268128686711 1.6103705347377417 71.58 1.2600000000000051 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf8 1.8143284638396158 1.6288620764171362 71.5 1.3400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add 
fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf9 2.5462742331906263 2.076061630349781 71.48 1.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf10 2.526515422129153 2.063839193109964 71.39999999999999 1.440000000000012 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf11 2.1596661517243856 1.8351710968407349 71.34 1.5 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf12 2.3444383477958337 1.981259839350623 71.22 1.6200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 
+16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf13 1.8402020049200172 1.652343405000522 71.2 1.6400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf14 2.6420417968257306 2.167425635999969 71.12 1.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf15 2.543198098440602 2.0805826545876145 71.1 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf16 2.6224991911009328 2.1476958232678807 70.89999999999999 1.940000000000012 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 
+15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf17 2.5978010917593752 2.131515210392801 70.8 2.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf18 2.623210258119482 2.156636511928761 70.76 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf19 2.598187894495609 2.1322228990374104 70.76 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf20 2.640464221374653 2.1682626030871295 70.76 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max 
fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf21 2.659563405662692 2.1881035849678936 70.54 2.299999999999997 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf22 2.636584103560761 2.1652496021557557 70.39999999999999 2.440000000000012 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf23 2.6315080449303547 2.161259580137757 70.38 2.460000000000008 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf24 2.7367939789033153 2.263326406058847 70.34 2.5 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 
+13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf25 2.712182817327382 2.2404693918737233 70.24000000000001 2.5999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf26 2.660510795888948 2.187299344706456 70.22 2.6200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf27 2.457573203839654 2.0936930776435383 70.1 2.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf28 2.7452293174567757 2.2593302388139347 69.92 2.9200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 
+11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp index f269aa9091521809751cd2214a46d039379c0114..4fad931efc4988cebdf317dc0761c9146cebab0f 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + 
__hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1) { } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1) { } void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - 
__hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -219,7 +219,7 @@ void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_26_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -227,7 +227,7 @@ void var_26_node(void *t1, size_t bytes_t1) { } void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -235,7 +235,7 @@ void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -243,7 +243,7 @@ void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_29_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -251,7 +251,7 @@ void var_29_node(void *t1, size_t bytes_t1) { } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -259,7 +259,7 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -267,7 +267,7 @@ void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_32_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -275,7 +275,7 @@ void var_32_node(void *t1, size_t bytes_t1) { } void var_33_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + 
__hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -283,7 +283,7 @@ void var_33_node(void *t1, size_t bytes_t1) { } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -291,7 +291,7 @@ void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -299,7 +299,7 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_36_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -307,7 +307,7 @@ void var_36_node(void *t1, size_t bytes_t1) { } void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -315,7 +315,7 @@ void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -323,7 +323,7 @@ void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_39_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -331,7 +331,7 @@ void var_39_node(void *t1, size_t bytes_t1) { } void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -339,7 +339,7 @@ void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -347,7 +347,7 @@ void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_42_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -355,7 +355,7 @@ void var_42_node(void *t1, size_t bytes_t1) { } void var_43_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -363,7 +363,7 @@ void var_43_node(void *t1, size_t bytes_t1) { } void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -371,7 +371,7 @@ void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - 
__hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -379,7 +379,7 @@ void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -387,7 +387,7 @@ void var_46_node(void *t1, size_t bytes_t1) { } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -395,7 +395,7 @@ void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -403,7 +403,7 @@ void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_49_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -411,7 +411,7 @@ void var_49_node(void *t1, size_t bytes_t1) { } void var_50_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -419,7 +419,7 @@ void var_50_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_51_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::CUDNN_TARGET); + __hpvm__hint(hpvm::TENSOR_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -877,9 +877,10 @@ typedef struct __attribute__((__packed__)) { int main() { - std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string dir_prefix = + std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/"; + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); @@ -976,14 +977,10 @@ int main() { std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); void *dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); - void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224); - uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, 100); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); - args->input = input; - args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -1049,14 +1046,40 @@ int main() { args->dense_3_b = dense_3_b; args->dense_3_b_bytes = 0; - void *dfg = __hpvm__launch(0, root, (void *)args); + startMemTracking(); + startProfiling(); + + unsigned int batch_size = 50; + unsigned int test_input_size = 1000; + unsigned int batch_count = test_input_size / batch_size; + + for (int j = 0; j < 1; j++) { + for (int i = 0; i < batch_count; i++) { - __hpvm__wait(dfg); + int start = i * 
batch_size; + int end = (i + 1) * batch_size; - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + args->input = input; + args->input_bytes = 0; + + void *dfg = __hpvm__launch(0, root, (void *)args); + + __hpvm__wait(dfg); + + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); + + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); + + freeBatchMemory(); + } + } + + stopProfiling(); __hpvm__cleanup(); - computeAccuracy3(labels, result); + return 0; } diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_loop.cpp b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp similarity index 91% rename from hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_loop.cpp rename to hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp index 2bd129300adc5ffb609df1e46c951630d682b883..16f145efad6a783cd78557c871ff1348bb6689f5 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_loop.cpp +++ b/hpvm/test/dnn_benchmarks/hpvm-c/benchmarks/vgg16_imagenet/vgg16_imagenet_cudnn.cpp @@ -11,7 +11,7 @@ #include <config.h> void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -19,7 +19,7 @@ void var_0_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -27,7 +27,7 @@ void var_1_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_2_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -35,7 +35,7 @@ void var_2_node(void *t1, size_t bytes_t1) { } void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -43,7 +43,7 @@ void var_3_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -51,7 +51,7 @@ void var_4_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_5_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -59,7 +59,7 @@ void var_5_node(void *t1, size_t bytes_t1) { } void var_6_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -67,7 +67,7 @@ void var_6_node(void *t1, size_t bytes_t1) { } void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void 
*r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -75,7 +75,7 @@ void var_7_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -83,7 +83,7 @@ void var_8_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_9_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -91,7 +91,7 @@ void var_9_node(void *t1, size_t bytes_t1) { } void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -99,7 +99,7 @@ void var_10_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -107,7 +107,7 @@ void var_11_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_12_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -115,7 +115,7 @@ void var_12_node(void *t1, size_t bytes_t1) { } void var_13_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -123,7 +123,7 @@ void var_13_node(void *t1, size_t bytes_t1) { } void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -131,7 +131,7 @@ void var_14_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -139,7 +139,7 @@ void var_15_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_16_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -147,7 +147,7 @@ void var_16_node(void *t1, size_t bytes_t1) { } void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -155,7 +155,7 @@ void var_17_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -163,7 +163,7 @@ void var_18_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_19_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); 
__hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -171,7 +171,7 @@ void var_19_node(void *t1, size_t bytes_t1) { } void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -179,7 +179,7 @@ void var_20_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -187,7 +187,7 @@ void var_21_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_22_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -195,7 +195,7 @@ void var_22_node(void *t1, size_t bytes_t1) { } void var_23_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -203,7 +203,7 @@ void var_23_node(void *t1, size_t bytes_t1) { } void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -211,7 +211,7 @@ void var_24_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -219,7 +219,7 @@ void var_25_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_26_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -227,7 +227,7 @@ void var_26_node(void *t1, size_t bytes_t1) { } void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -235,7 +235,7 @@ void var_27_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -243,7 +243,7 @@ void var_28_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_29_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -251,7 +251,7 @@ void var_29_node(void *t1, size_t bytes_t1) { } void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -259,7 +259,7 @@ void var_30_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + 
__hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -267,7 +267,7 @@ void var_31_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_32_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -275,7 +275,7 @@ void var_32_node(void *t1, size_t bytes_t1) { } void var_33_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -283,7 +283,7 @@ void var_33_node(void *t1, size_t bytes_t1) { } void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -291,7 +291,7 @@ void var_34_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -299,7 +299,7 @@ void var_35_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_36_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -307,7 +307,7 @@ void var_36_node(void *t1, size_t bytes_t1) { } void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -315,7 +315,7 @@ void var_37_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -323,7 +323,7 @@ void var_38_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_39_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -331,7 +331,7 @@ void var_39_node(void *t1, size_t bytes_t1) { } void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_convolution(t1, t2, 1, 1, 1, 1); @@ -339,7 +339,7 @@ void var_40_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -347,7 +347,7 @@ void var_41_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_42_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -355,7 +355,7 @@ void var_42_node(void *t1, size_t bytes_t1) { } void var_43_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); 
__hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); @@ -363,7 +363,7 @@ void var_43_node(void *t1, size_t bytes_t1) { } void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -371,7 +371,7 @@ void var_44_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -379,7 +379,7 @@ void var_45_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_46_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -387,7 +387,7 @@ void var_46_node(void *t1, size_t bytes_t1) { } void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -395,7 +395,7 @@ void var_47_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -403,7 +403,7 @@ void var_48_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_49_node(void *t1, size_t bytes_t1) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(1, t1, 0); void *r = __hpvm__tensor_relu(t1); @@ -411,7 +411,7 @@ void var_49_node(void *t1, size_t bytes_t1) { } void var_50_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_mul(t1, t2); @@ -419,7 +419,7 @@ void var_50_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { } void var_51_node(void *t1, size_t bytes_t1, void *t2, size_t bytes_t2) { - __hpvm__hint(hpvm::TENSOR_TARGET); + __hpvm__hint(hpvm::CUDNN_TARGET); __hpvm__attributes(2, t1, t2, 0); void *r = __hpvm__tensor_add(t1, t2); @@ -877,10 +877,9 @@ typedef struct __attribute__((__packed__)) { int main() { - std::string dir_prefix = - std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/"; - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string dir_prefix = std::string(MODEL_PARAMS_DIR) + "/vgg16_imagenet/"; + std::string input_path = dir_prefix + std::string("tune_input.bin"); + std::string labels_path = dir_prefix + std::string("tune_labels.bin"); std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); void *conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); @@ -977,10 +976,14 @@ int main() { std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); void *dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); + void *input = readTrainedWeights(input_path.c_str(), 0, 100, 3, 224, 224); + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), 0, 100); __hpvm__init(); RootIn *args = static_cast<RootIn *>(malloc(sizeof(RootIn))); + args->input = input; + 
args->input_bytes = 0; args->conv2d_1_w = conv2d_1_w; args->conv2d_1_w_bytes = 0; args->conv2d_1_b = conv2d_1_b; @@ -1046,40 +1049,14 @@ int main() { args->dense_3_b = dense_3_b; args->dense_3_b_bytes = 0; - startMemTracking(); - startProfiling(); - - unsigned int batch_size = 50; - unsigned int test_input_size = 1000; - unsigned int batch_count = test_input_size / batch_size; - - for (int j = 0; j < 1; j++) { - for (int i = 0; i < batch_count; i++) { + void *dfg = __hpvm__launch(0, root, (void *)args); - int start = i * batch_size; - int end = (i + 1) * batch_size; + __hpvm__wait(dfg); - void *input = - readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + void *result = static_cast<RootIn *>(args)->r.tensor; + hpvm_request_tensor(result, 0); - args->input = input; - args->input_bytes = 0; - - void *dfg = __hpvm__launch(0, root, (void *)args); - - __hpvm__wait(dfg); - - void *result = static_cast<RootIn *>(args)->r.tensor; - hpvm_request_tensor(result, 0); - - llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); - - freeBatchMemory(); - } - } - - stopProfiling(); __hpvm__cleanup(); - + computeAccuracy3(labels, result); return 0; } diff --git a/hpvm/test/dnn_benchmarks/scripts/run_dnn.py b/hpvm/test/dnn_benchmarks/scripts/run_dnn.py new file mode 100644 index 0000000000000000000000000000000000000000..2eed6739a76c7251ea60ed77df9730b0be9ac034 --- /dev/null +++ b/hpvm/test/dnn_benchmarks/scripts/run_dnn.py @@ -0,0 +1,212 @@ +import os.path +from os import path +import sys +#import matplotlib.pyplot as plt + + +binary_dir = "../../../build/tools/hpvm/test/dnn_benchmarks/" + +accuracy_file = "final_accuracy" +profile_file = "profile_data.txt" +profile_file_prefix = "profile_info_" + +temp_file_name = "temp.txt" +pred_binary_prefix = "test_" +pred_binary_suffix = "_pred" + +rt_binary_suffix = "_rt_pred" +max_num_runs = 20 + + +def max_num_configs (config_file): + num_configs = 0 + with open(config_file, "r") as f: + for line in f: + if "conf" in line: + num_configs = num_configs + 1 + return (num_configs + 1) + + +def read_and_write_config (config_file, config_num, temp_file): + config = "" + print("--CONFIG FILE: " + config_file) + print("--CONFIG NUM: " + str(config_num)) + print("--TEMP FILE: " + temp_file) + with open(config_file, "r") as f: + conf = "conf" + str(config_num) + read_config = False + read_first_line = False + for line in f: + if read_first_line == False: + config = config + line + read_first_line = True + continue + if "-----" in line and read_config == True: + read_config = False + config = config + line + break + if read_config == True: + config = config + line + continue + if conf in line: + read_config = True + config = config + "+++++\n" + config = config + line + print("config: ") + print(config) + with open(temp_file, "w") as f: + f.write(config) + + +def get_avg_exec_time(profile_file_path, config_num): + prof_file = profile_file_path + profile_file_prefix + str(config_num) + ".txt" + print("PROFILE FILE: " + prof_file) + with open(prof_file, "r") as f: + for line in f: + if "Total Time" in line: + print("LINE: " + line) + time = line.strip().split() [3] + print("TIME: " + time) + return float(time) + print("ERROR") + sys.exit() + return float(-1) + +def get_exec_time(config_file): + print("CONFIG FILE: " + config_file) + with open(config_file, "r") as f: + for line in f: + if "conf" in line: + print("LINE: " + line) + time = line.strip().split() [1] + print("TIME: " + time) + return float(time) + print("ERROR") + sys.exit() + return 
+
+
+# Reads the measured accuracy (first token of the first line) from the
+# accuracy file written by the benchmark binary.
+def get_avg_exec_accuracy(file_name):
+    with open(file_name, "r") as f:
+        for line in f:
+            accuracy = line.strip().split()[0]
+            print("ACCURACY: " + accuracy)
+            return float(accuracy)
+    print("ERROR: empty accuracy file " + file_name)
+    sys.exit(1)
+
+
+# Reads the predicted accuracy (fifth token of the "conf" line) from a
+# single-configuration file.
+def get_exec_accuracy(config_file):
+    with open(config_file, "r") as f:
+        for line in f:
+            if "conf" in line:
+                print("LINE: " + line)
+                acc = line.strip().split()[4]
+                print("ACCURACY: " + acc)
+                return float(acc)
+    print("ERROR: no 'conf' entry in " + config_file)
+    sys.exit(1)
+
+
+# Runs every configuration of one network through its predictive binary and
+# reports measured speedup and accuracy loss against the baseline (conf1),
+# alongside the predicted time and accuracy from the configuration file.
+def predictive_tuning_exec(dnn_name):
+    dnn_dir = "../benchmarks/" + dnn_name
+    binary_name = binary_dir + pred_binary_prefix + dnn_name + pred_binary_suffix
+    pred_dir = dnn_dir + "/predictive/"
+    config_file = pred_dir + dnn_name + ".txt"
+    temp_file = pred_dir + temp_file_name
+    print("dnn_dir: " + dnn_dir)
+    print("binary name: " + binary_name)
+    print("pred_dir: " + pred_dir)
+    print("config_file: " + config_file)
+    print("temp_file: " + temp_file)
+    # Clear leftovers from a previous run.
+    exec_command = ("rm " + temp_file + " " + accuracy_file + " " +
+                    profile_file + " " + pred_dir + "profile*")
+    print(exec_command)
+    os.system(exec_command)
+    config_num = 1
+    max_configs = max_num_configs(config_file)
+    baseline_time = 0
+    baseline_acc = 0
+    print("MAX CONFIGS: " + str(max_configs))
+    perf_list = list()
+    acc_list = list()
+    while config_num < max_configs:
+        read_and_write_config(config_file, config_num, temp_file)
+        exec_command = binary_name
+        print(exec_command)
+        os.system(exec_command)
+        time = get_avg_exec_time(pred_dir, config_num - 1)
+        acc = get_avg_exec_accuracy(accuracy_file)
+        config_time = get_exec_time(temp_file)
+        config_acc = get_exec_accuracy(temp_file)
+        if config_num == 1:
+            baseline_time = time
+            baseline_acc = acc
+        else:
+            print("SPEEDUP: ")
+            print(baseline_time / time)
+            #perf_list.append(baseline_time / time)
+            print("CONFIG TIME: ")
+            print(config_time)
+            print("ACC LOSS: ")
+            print(baseline_acc - acc)
+            #acc_list.append(baseline_acc - acc)
+            print("CONFIG ACC: ")
+            print(config_acc)
+        config_num += 1
+    #plt.plot(perf_list, acc_list)
+    #plt.xlabel("Speedups")
+    #plt.ylabel("Accuracy loss")
+    #plt.savefig(pred_dir + "tradeoff.pdf")
+
+
+# Runs each runtime-tuning binary named on the command line max_num_runs
+# times, invoking the on-board polling helper between runs and renaming the
+# per-run profile output so successive runs do not overwrite it.
+def runtime_tuning_exec():
+    binary_files = sys.argv[2:]
+    for dnn_name in binary_files:
+        # Note: shadows the global binary_dir; the runtime-tuning binaries
+        # live under ../benchmarks/, not under the build tree.
+        binary_dir = "../benchmarks/" + dnn_name
+        binary_name = binary_dir + rt_binary_suffix
+        conf_dir = binary_dir + "/data"
+        print("binary_dir: " + binary_dir)
+        print("binary name: " + binary_name)
+        run = 0
+        while run < max_num_runs:
+            exec_command = binary_name
+            print(exec_command)
+            os.system(exec_command)
+            exec_command = "/home/nvidia/poll 13"
+            print(exec_command)
+            os.system(exec_command)
+            exec_command = ("mv " + conf_dir + "/profile_info_0.txt " +
+                            conf_dir + "/profile_info_out-run-" + str(run) + ".txt")
+            print(exec_command)
+            os.system(exec_command)
+            run += 1
+        exec_command = "rm -rf " + conf_dir + "/run_data"
+        print(exec_command)
+        os.system(exec_command)
+        exec_command = "mkdir " + conf_dir + "/run_data"
+        print(exec_command)
+        os.system(exec_command)
+
+
+if __name__ == "__main__":
+    if sys.argv[1] == 
"--runtime_tuning": + runtime_tuning_exec() + else: + predictive_tuning_exec(sys.argv[1]) + diff --git a/hpvm/test/dnn_benchmarks/scripts/run_dnns.py b/hpvm/test/dnn_benchmarks/scripts/run_dnns.py new file mode 100644 index 0000000000000000000000000000000000000000..0de85c7847309532db985d95aedbba02f2715059 --- /dev/null +++ b/hpvm/test/dnn_benchmarks/scripts/run_dnns.py @@ -0,0 +1,17 @@ +import os +import sys + +dnns = ["alexnet", "alexnet2", "vgg16_cifar10", "vgg16_cifar100", "resnet18", "mobilenet_cifar10", "alexnet_imagenet", "resnet50_imagenet", "vgg16_imagenet", "lenet_mnist"] +#dnns = ["resnet50_imagenet","alexnet"] + +#if sys.argv[1] == "--runtime": +# exec_command = "python3 run_dnn.py" + " --runtime_tuning " + dnns +# print(exec_command) +# os.system(exec_command) +#else: +if __name__ == "__main__": + for dnn in dnns: + exec_command = "python3 run_dnn.py " + dnn + print(exec_command) + os.system(exec_command) +