diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..f7d3d37cde7f947d3d7d7f4f9d8d7879b60e33e6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "hpvm/projects/predtuner"] + path = hpvm/projects/predtuner + url = ../predtuner.git + branch = hpvm diff --git a/README.md b/README.md index 4280373aa5f5c7d239ee18989edf1f6219c360a1..6b185aab532fb00e94dcad3d735954e61b0a883f 100644 --- a/README.md +++ b/README.md @@ -15,8 +15,8 @@ HPVM is currently at **version 1.0**. For more about what HPVM is, see [our webs [PPoPP'21 paper](https://dl.acm.org/doi/10.1145/3437801.3446108) +## Resources -## Docs [HPVM IR Specification](/hpvm/docs/hpvm-specification.md) [HPVM-C Language Specification](/hpvm/docs/hpvm-c.md) @@ -24,6 +24,7 @@ HPVM is currently at **version 1.0**. For more about what HPVM is, see [our webs [HPVM Compilation Process](/hpvm/docs/compilation.md) ## Dependencies + The following components are required to be installed on your machine to build HPVM. * GCC (>=5.1) @@ -36,29 +37,39 @@ The following components are required to be installed on your machine to build H * CUDA (>=9.1) ## Supported Targets + Supported/tested CPU architectures: + * Intel Xeon E5-2640 * Intel Xeon W-2135 * ARM Cortex A-57 Supported/tested GPU architectures for OpenCL backend: + * Nvidia Quadro P1000 * Nvidia GeForce GTX 1080 Supported/tested GPU architectures for Tensor Backend: -* Nvidia Jetson TX2 + +* Nvidia Jetson TX2 * Nvidia GeForce GTX 1080 HPVM has not been tested but might work on other CPUs supported by LLVM Backend, and GPUs supported by OpenCL such as Intel, AMD, etc. -**NOTE: Approximations are tuned for Jetson TX2 and same speedups may not exist for other architectures ** +**NOTE**: Approximations are tuned for Jetson TX2 and same speedups may not exist for other architectures. 
+ +## Getting Started + +### Getting source code and setting up environment -## Getting source code and building HPVM +Check out HPVM and go to the `./hpvm` directory under the project root: -Checkout HPVM: ```shell -git clone --recursive https://gitlab.engr.illinois.edu/llvm/hpvm-release.git/ -cd hpvm-release/hpvm +git clone --recursive https://gitlab.engr.illinois.edu/llvm/hpvm.git +cd hpvm/ +git checkout approx_hpvm_reorg_keras +git submodule update --init --recursive +cd hpvm/ ``` HPVM needs to be able to find CUDA. @@ -68,58 +79,77 @@ Otherwise, some environment variables are required: * `CUDA_TOOLKIT_PATH` --- Path to the CUDA toolkit * `CUDA_INCLUDE_PATH` --- Path to the CUDA headers -* `CUDA_LIB_PATH` --- Path to CUDA libraries +* `CUDA_LIB_PATH` --- Path to CUDA libraries + +`set_paths.sh` can be used for this. +Modify the values of these variables in `set_paths.sh` according to your system, and source the script: -`hpvm/set_paths.sh` can be used for this. Modify the values of these variables in set_paths.sh and source the script: ```shell source set_paths.sh ``` -HPVM installer script can be used to download, configure and build HPVM along with LLVM and Clang. +The HPVM installer script can be used to download, configure and build HPVM along with LLVM and Clang. + ```shell bash install.sh ``` -Specifically, the HPVM installer downloads LLVM, and Clang, copies HPVM source into -llvm/tools and builds the entire tree. It also builds a modified LLVM C-Backend, based on the one maintained by [Julia Computing](https://github.com/JuliaComputing/llvm-cbe), as a part of HPVM and is currently used -to generate OpenCL kernels for GPUs. -In the beginning of the building process, the installer provides users the choice of automatically or manually building HPVM. -If HPVM is selected to be built automatically, the installer allows users to type in the number of threads they want to use. -The default number of threads used to build HPVM is two. 
+On launch, the installer asks whether it should also build HPVM. +If HPVM is to be built, the installer asks for the number of threads to use. +The default number of threads used to build HPVM is two (2). + +If you use this automatic build, skip the next section. + +* Specifically, the HPVM installer downloads LLVM and Clang, copies the HPVM source into + llvm/tools and builds the entire tree. It also builds a modified LLVM C-Backend, + based on the one maintained by [Julia Computing](https://github.com/JuliaComputing/llvm-cbe), + as a part of HPVM, which is currently used to generate OpenCL kernels for GPUs. + +### Manually Build HPVM + +Alternatively, you can manually build HPVM with CMake. +Please note that in this case, +the installer script still *must* be executed to obtain some required components, +but without the build step. + +In the current directory (`hpvm/`), run: -Alternatively, CMake can be run manually using the following steps in ./hpvm-release/hpvm directory. ```shell mkdir build cd build cmake ../llvm [options] +export PATH=$(realpath ./bin):$PATH ``` -**Note** that if the installer script was not used, -you must _manually add `build/bin` directory to your $PATH variable_ (as absolute path). Some common options that can be used with CMake are: * -DCMAKE_INSTALL_PREFIX=directory --- Specify for directory the full pathname of where you want the HPVM tools and libraries to be installed. - * -DCMAKE_BUILD_TYPE=type --- Valid options for type are Debug, Release, RelWithDebInfo, and MinSizeRel. Default is Debug. - * -DLLVM_ENABLE_ASSERTIONS=On --- Compile with assertion checks enabled (default is Yes for Debug builds, No for all other build types). -In order to manually build and install HPVM, GNU Make can be run using the following in the build directory. +**Note** that if the installer script was not used, +you must _manually add the `build/bin` directory to your $PATH variable_ as an absolute path (as shown above). 
+ +Now, compile the HPVM Compilation Tool `approxhpvm.py` using: + ```shell -make -j<number of threads> -make install +make -j<number of threads> approxhpvm.py ``` -In the end of the installation process, the installer automatically runs all the regression tests to ensure that the installation is -successful. If HPVM is built and installed manually, the tests can be automatically run by executing the following step from the ./hpvm-release/hpvm directory. +With all the aforementioned steps, HPVM should be built, installed, tested and ready to use. +In particular, `approxhpvm.py` should be an executable command from your command line. + +When not using the installer, you may want to run the regression tests using this script (outside of build directory): + ```shell +cd .. bash scripts/automate_tests.sh ``` -With all the aforementioned steps, HPVM should be built, installed, tested and ready to use. - ## Benchmarks and Tests + We are providing the following [HPVM benchmarks](/hpvm/test/benchmarks): + * Select benchmarks from the [Parboil](http://impact.crhc.illinois.edu/parboil/parboil.aspx) benchmark suite, located under [test/benchmarks/parboil](/hpvm/test/benchmarks/parboil). * An edge detection pipeline benchmark, located under [test/benchmarks/pipeline](/hpvm/test/benchmarks/pipeline). * A Camera ISP pipeline, located under [test/benchmarks/hpvm-cava](/hpvm/test/benchmarks/hpvm-cava), adapted from C code provided from our collaborators at [Harvard](http://vlsiarch.eecs.harvard.edu). @@ -129,4 +159,5 @@ Benchmark descriptions and instructions on how to compile and run them are [here We are also providing [unit tests](/hpvm/test/unitTests) and [regression tests](/hpvm/test/regressionTests). ## Support + All questions can be directed to [hpvm-dev@lists.cs.illinois.edu](mailto:hpvm-dev@lists.cs.illinois.edu). 
diff --git a/hpvm/CMakeLists.txt b/hpvm/CMakeLists.txt index d63675b34275c3f83c10ca83005bbfe563777554..b6985d0a100f38a7712580a30d3ba91e59dd248c 100644 --- a/hpvm/CMakeLists.txt +++ b/hpvm/CMakeLists.txt @@ -1,3 +1,13 @@ +cmake_minimum_required(VERSION 3.17) +project(hpvm CUDA CXX) +get_filename_component( + CUDA_TOOLKIT_ROOT_DIR "${CMAKE_CUDA_COMPILER}/../.." ABSOLUTE +) # Set CUDA_TOOLKIT_ROOT_DIR ourselves, to the parent folder of cuda nvcc + +# find_package will use the auxiliary cmake/Find*.cmake we provide +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +find_package(CUDNN 7 EXACT REQUIRED) # CUDNN_INCLUDE_PATH, CUDNN_LIBRARY_PATH + include_directories(./include/) # Generate TENSOR_RT_PREFIX into config.h diff --git a/hpvm/cmake/FindCUDNN.cmake b/hpvm/cmake/FindCUDNN.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e5a427f0317a6f3b8f7e7b2cc89fd176fd4362dc --- /dev/null +++ b/hpvm/cmake/FindCUDNN.cmake @@ -0,0 +1,83 @@ +# Obtained from PyTorch repo: https://github.com/pytorch/pytorch/blob/master/cmake/Modules_CUDA_fix/FindCUDNN.cmake +# Find the CUDNN libraries +# +# The following variables are optionally searched for defaults +# CUDNN_ROOT: Base directory where CUDNN is found +# CUDNN_INCLUDE_DIR: Directory where CUDNN header is searched for +# CUDNN_LIBRARY: Directory where CUDNN library is searched for +# CUDNN_STATIC: Are we looking for a static library? (default: no) +# +# The following are set after configuration is done: +# CUDNN_FOUND +# CUDNN_INCLUDE_PATH +# CUDNN_LIBRARY_PATH +# + +include(FindPackageHandleStandardArgs) + +set(CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} CACHE PATH "Folder containing NVIDIA cuDNN") +if (DEFINED ENV{CUDNN_ROOT_DIR}) + message(WARNING "CUDNN_ROOT_DIR is deprecated. Please set CUDNN_ROOT instead.") +endif() +list(APPEND CUDNN_ROOT $ENV{CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) + +# Compatible layer for CMake <3.12. 
CUDNN_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${CUDNN_ROOT}) + +set(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR} CACHE PATH "Folder containing NVIDIA cuDNN header files") + +find_path(CUDNN_INCLUDE_PATH cudnn.h + HINTS ${CUDNN_INCLUDE_DIR} + PATH_SUFFIXES cuda/include cuda include) + +option(CUDNN_STATIC "Look for static CUDNN" OFF) +if (CUDNN_STATIC) + set(CUDNN_LIBNAME "libcudnn_static.a") +else() + set(CUDNN_LIBNAME "cudnn") +endif() + +set(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY} CACHE PATH "Path to the cudnn library file (e.g., libcudnn.so)") +if (CUDNN_LIBRARY MATCHES ".*cudnn_static.a" AND NOT CUDNN_STATIC) + message(WARNING "CUDNN_LIBRARY points to a static library (${CUDNN_LIBRARY}) but CUDNN_STATIC is OFF.") +endif() + +find_library(CUDNN_LIBRARY_PATH ${CUDNN_LIBNAME} + PATHS ${CUDNN_LIBRARY} + PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) +# Get directory from filename ${CUDNN_LIBRARY_PATH}; stays empty (false) when the library was not found +get_filename_component( + CUDNN_LIBRARY_PATH + "${CUDNN_LIBRARY_PATH}"
DIRECTORY +) + +# This version check is from OpenCV repo: https://github.com/opencv/opencv/blob/master/cmake/FindCUDNN.cmake +# extract version from the include +if(CUDNN_INCLUDE_PATH) + if(EXISTS "${CUDNN_INCLUDE_PATH}/cudnn_version.h") + file(READ "${CUDNN_INCLUDE_PATH}/cudnn_version.h" CUDNN_H_CONTENTS) + else() + file(READ "${CUDNN_INCLUDE_PATH}/cudnn.h" CUDNN_H_CONTENTS) + endif() + + string(REGEX MATCH "define CUDNN_MAJOR ([0-9]+)" _ "${CUDNN_H_CONTENTS}") + set(CUDNN_VERSION_MAJOR ${CMAKE_MATCH_1}) + string(REGEX MATCH "define CUDNN_MINOR ([0-9]+)" _ "${CUDNN_H_CONTENTS}") + set(CUDNN_VERSION_MINOR ${CMAKE_MATCH_1}) + string(REGEX MATCH "define CUDNN_PATCHLEVEL ([0-9]+)" _ "${CUDNN_H_CONTENTS}") + set(CUDNN_VERSION_PATCH ${CMAKE_MATCH_1}) + + set(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}") + unset(CUDNN_H_CONTENTS) +endif() + +find_package_handle_standard_args( + CUDNN + FOUND_VAR CUDNN_FOUND + REQUIRED_VARS + CUDNN_LIBRARY_PATH + CUDNN_INCLUDE_PATH + VERSION_VAR CUDNN_VERSION +) + +mark_as_advanced(CUDNN_ROOT CUDNN_INCLUDE_DIR CUDNN_LIBRARY) diff --git a/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt index b4e129ba01837cf328912f7787b861f843f4f581..83ec877b0675f0b2a841e24d15126932c812bbd9 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt +++ b/hpvm/lib/Transforms/DFG2LLVM_CPU/CMakeLists.txt @@ -2,7 +2,7 @@ if(WIN32 OR CYGWIN) set(LLVM_LINK_COMPONENTS Core Support) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${PROJECT_BINARY_DIR}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}") add_llvm_library( LLVMDFG2LLVM_CPU MODULE diff --git a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt index 00c651eaa250fc114f229f30e0cb7c121154ff96..4041df11ce8d79e39d6f72bdf0a1068eae449300 100644 --- a/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt +++ 
b/hpvm/lib/Transforms/DFG2LLVM_OpenCL/CMakeLists.txt @@ -2,7 +2,7 @@ if(WIN32 OR CYGWIN) set(LLVM_LINK_COMPONENTS Core Support) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${PROJECT_BINARY_DIR}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}") add_llvm_library( LLVMDFG2LLVM_OpenCL MODULE diff --git a/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt index fc4c9fc5a98007dd700973c598b6731edcd61e14..fbf5881480ce11745b0d4de00b90c0812a6db356 100644 --- a/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt +++ b/hpvm/lib/Transforms/GenHPVM/CMakeLists.txt @@ -2,7 +2,7 @@ if(WIN32 OR CYGWIN) set(LLVM_LINK_COMPONENTS Core Support) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${PROJECT_BINARY_DIR}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_BUILD_DIR=${CMAKE_BINARY_DIR}") add_llvm_library( LLVMGenHPVM MODULE diff --git a/hpvm/projects/CMakeLists.txt b/hpvm/projects/CMakeLists.txt index b46164b8d07de77ba9feb570b976e19ae9fdf4b2..2a51c0b09e672e8508a8a13d189d05eb3ccc2e48 100644 --- a/hpvm/projects/CMakeLists.txt +++ b/hpvm/projects/CMakeLists.txt @@ -10,7 +10,6 @@ foreach(entry ${entries}) (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/parallel-libs) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests)) - set(LLVM_BUILD_DIR ${PROJECT_BINARY_DIR}) get_filename_component(entry_name "${entry}" NAME) add_llvm_external_project(${entry_name}) endif() diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt index 02ab62fca57f66155ffafff0686634b3efe4f861..6efd8d3d0a9d86236adc87657fb68b782f3daaa0 100644 --- a/hpvm/projects/hpvm-rt/CMakeLists.txt +++ b/hpvm/projects/hpvm-rt/CMakeLists.txt @@ -1,7 +1,7 @@ add_definitions(-DNUM_CORES=8) -SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang) -SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++) +SET(CMAKE_C_COMPILER 
${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang) +SET(CMAKE_CXX_COMPILER ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/clang++) SET(CMAKE_CXX_STANDARD 11) # Defines ${OpenCL_INCLUDE_DIRS} and ${OpenCL_LIBRARY} if found find_package(OpenCL REQUIRED) diff --git a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt index 2f8cfc27e5280e7d18a830cc6083841a2cc3590b..5c04604406eb81571c0a87539fb0568aad3c4e4d 100644 --- a/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/hpvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -1,50 +1,15 @@ -cmake_minimum_required(VERSION 3.17) -project(hpvm-tensor-rt) -find_package(CUDA 9.1 REQUIRED) -set(CUDA_SEPARABLE_COMPILATION ON CACHE BOOL "") -set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - # gcc > 8 are not supported - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8) - message(FATAL_ERROR "GCC versions later than 8 are not supported") - endif() -elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - # clang < 3.2 || clang >= 9 unsupported - set(clang_v ${CMAKE_CXX_COMPILER_VERSION}) - if (clang_v VERSION_GREATER_EQUAL 9 OR clang_v VERSION_LESS_EQUAL 3.2) - message(FATAL_ERROR "Clang<3.2 or clang>=9 are not supported") - endif() -endif() -# Addresses a bug where code is not compiled as C++11 in non-CUDA code and older g++ versions -# Edit: using c++14 now +project(hpvm-tensor-rt CUDA CXX) set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -I/") -set( - CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; - -gencode;arch=compute_60,code=sm_60; - -gencode;arch=compute_60,code=compute_60; - -std=c++14 --expt-relaxed-constexpr -maxrregcount 32 # These are for image ops -) -if(CMAKE_BUILD_TYPE STREQUAL "Debug") - message("Debug mode") - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-g;-lineinfo;-Xcompiler;-ggdb) -else() - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-DNDEBUG;-Xcompiler;-DNDEBUG) -endif() -# Default options -if(USE_GFLAGS) - add_definitions(-DUSE_GFLAGS) -endif() 
-if(USE_AUTOTUNER) - remove_definitions(-DNO_INJECTION) +if("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") # We are the top-level project, so we're NOT compiling in HPVM + set(INDEP_BUILD True) + message(STATUS "Compiling hpvm-tensor-rt independently") +else() + set(INDEP_BUILD False) + message(STATUS "Compiling hpvm-tensor-rt inside HPVM") endif() -add_definitions(-DNO_INJECTION) -add_definitions(-DPROMISE_TUNER_ENABLED) -add_definitions(-DSIMULATION_MODE=true) -# Config path configuration file +# -- Configure path configuration file if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/global_knobs.txt) message(FATAL_ERROR "global_knobs.txt not found") endif() @@ -56,202 +21,176 @@ configure_file( ${CMAKE_CURRENT_BINARY_DIR}/tensor_runtime/include/config.h ) -# Default include/link directories +# -- Default include directories set( INCLUDES - $ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/include - ${CUDA_INCLUDE_DIRS} + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${CUDNN_INCLUDE_PATH} ./tensor_runtime/include ${CMAKE_CURRENT_BINARY_DIR}/tensor_runtime/include ./dnn_sources/include ../gpu_profiler/include ../soc_simulator/include ) -set( - LINK_DIRS - ${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} - $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64 -) -include_directories(${INCLUDES}) -link_directories(${LINK_DIRS}) -# Source files of runtime +# -- Link libraries +find_package(OpenMP REQUIRED) # Provides ${OpenMP_CXX_FLAGS} +# Configure gpu_profiler and soc_simulator, and setup all libs to link to +# Conditionally add gpu_profiler project if we're building independently +# (not building the whole hpvm) +if(INDEP_BUILD) + message(STATUS "Also compiling gpu_profiler and soc_simulator") + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../gpu_profiler ${CMAKE_CURRENT_BINARY_DIR}/gpu_profiler) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../soc_simulator ${CMAKE_CURRENT_BINARY_DIR}/soc_simulator) +endif() +set(LINK_DIR ${CUDNN_LIBRARY_PATH}) +set(LINK_LIBS gpu_profiler promise_profiler stdc++fs cudnn curand cublas) 
+if(USE_GFLAGS) + list(APPEND LINK_LIBS gflags) +endif() + +# -- Definitions +set(DEFS -DPROMISE_TUNER_ENABLED -DSIMULATION_MODE=true) +if(USE_GFLAGS) + list(APPEND DEFS -DUSE_GFLAGS) +endif() + +# -- Sources of runtime set( RUNTIME_SRCS_FILENAME - approx_simulation.cu - group_conv.cu - approx_techniques.cu - common.cpp + approx_knobs_utils.cc approx_simulation.cu approx_techniques.cu configuration.cpp - debug.cc - debug.cpp - device_math.cu + debug.cpp device_math.cu error.cu - tensor_cpu_runtime.cc - fp16_gemm.cu - global_data.cc - half_precision_api.cu - hpvm-rt-controller.cpp - img_tensor_runtime.cu - img_tensor_utils.cpp + fp16_gemm.cu freq_utils.cc + global_data.cc group_conv.cu + half_precision_api.cu hpvm-rt-controller.cpp + init_api.cc op_overheads.cc profiling.cc - tensor_runtime.cu - tensor_utils.cu + tensor_cpu_runtime.cc tensor_runtime.cu tensor_utils.cu wrapper_runtime.cu - approx_knobs_utils.cc - init_api.cc ) foreach(FILE ${RUNTIME_SRCS_FILENAME}) list(APPEND RUNTIME_SRCS "tensor_runtime/src/${FILE}") + # Some files doesn't end in .cu or .cuh, but we know they are still CUDA files + set_source_files_properties("tensor_runtime/src/${FILE}" PROPERTIES LANGUAGE CUDA) endforeach() -# Compile gpu_profiler and soc_simulator -# Conditionally add gpu_profiler project if we're building independently -# (not building the whole hpvm) -get_filename_component(root_dir ${CMAKE_SOURCE_DIR} REALPATH) -get_filename_component(our_dir ${CMAKE_CURRENT_SOURCE_DIR} REALPATH) -if(${root_dir} STREQUAL ${our_dir}) - message(STATUS "Compiling hpvm-tensor-rt independently") - message(STATUS "Also compiling gpu_profiler and soc_simulator") - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../gpu_profiler ${CMAKE_CURRENT_BINARY_DIR}/gpu_profiler) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../soc_simulator ${CMAKE_CURRENT_BINARY_DIR}/soc_simulator) -endif() -set(LINK_LIBS gpu_profiler promise_profiler cudnn cufft stdc++fs curand) -if(USE_GFLAGS) - list(APPEND LINK_LIBS gflags) 
-endif() +# -- Adding tensor_runtime targets +function(add_tensor_runtime target_name) + add_library(${target_name} ${RUNTIME_SRCS}) + set_property(TARGET ${target_name} PROPERTY CUDA_ARCHITECTURES 60) + target_compile_options( + ${target_name} PRIVATE + $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr -maxrregcount 32> + $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:DEBUG>>:-lineinfo -Xcompiler -ggdb> + $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}> + ) + target_include_directories(${target_name} PUBLIC ${INCLUDES}) + target_link_directories(${target_name} PUBLIC ${LINK_DIR}) + target_link_libraries(${target_name} PUBLIC ${LINK_LIBS}) + target_compile_definitions(${target_name} PRIVATE ${DEFS} ${ARGN}) +endfunction(add_tensor_runtime) # Adding new rule for building a cuDNN runtime library # Offline version -find_package(OpenMP REQUIRED) -cuda_add_library(tensor_runtime ${RUNTIME_SRCS}) -cuda_add_cublas_to_target(tensor_runtime ${OpenMP_CXX_FLAGS}) -target_compile_options(tensor_runtime PRIVATE ${OpenMP_CXX_FLAGS}) -target_link_libraries(tensor_runtime ${LINK_LIBS} ${OpenMP_CXX_FLAGS}) -target_compile_definitions(tensor_runtime PRIVATE -DONLINE_PROFILING=false -DFP16_tuning=true) +add_tensor_runtime(tensor_runtime -DONLINE_PROFILING=false -DFP16_tuning=true) -if(LLVM_BUILD_DIR) # Defined in ../CMakeLists.txt. This means we're compiling in LLVM - get_filename_component(LLVM_CLANG_XX ${LLVM_BUILD_DIR}/bin/clang++ REALPATH) - # It's important that tensor_runtime.ll goes here if we're compiling with LLVM - # Some HPVM passes look for tensor_runtime.ll in this folder (which is usually build/lib) - set(TENSOR_RT_LL_PREFIX ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) - add_dependencies(tensor_runtime clang) -else() - # Surely if we're compiling outside of hpvm, then we need the system-wide clang. 
- # Use it but check version 9 first - execute_process(COMMAND clang++ --version OUTPUT_VARIABLE clang_full_version_string ERROR_QUIET) - string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1" CLANG_VERSION_STRING ${clang_full_version_string}) - if(CLANG_VERSION_STRING VERSION_EQUAL 9) - set(LLVM_CLANG_XX clang++) +# Online version +add_tensor_runtime(tensor_runtime_online -DONLINE_PROFILING=true -DFP16_tuning=false) +# tensor_runtime_online is built AFTER tensor_runtime because of a nvcc bug (bug?) +# that doesn't allow compiling the same file from multiple targets at once. +# Same for tensor_runtime_online. +add_dependencies(tensor_runtime_online tensor_runtime) + +# Adding rule for the debugging source +add_executable(unit_tests dnn_sources/src/unit_tests.cc) +target_link_libraries(unit_tests tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) + +# -- Compile tensor_runtime.ll if possible +if(INDEP_BUILD) + # Surely if we're compiling outside of hpvm, then we need the system-wide clang -- a clang 9. 
+ execute_process(COMMAND clang-9 --version OUTPUT_VARIABLE clang_stdout ERROR_QUIET) + if(clang_stdout) + set(CLANG_NAME clang-9) set(TENSOR_RT_LL_PREFIX ${CMAKE_CURRENT_SOURCE_DIR}/lib) else() message(WARNING "System clang++ of version 9 not found; skipping tensor_runtime.ll generation") endif() +else() + # It's important that tensor_runtime.ll goes here if we're compiling with LLVM + # Some HPVM passes look for tensor_runtime.ll in this folder (which is usually build/lib) + set(TENSOR_RT_LL_PREFIX ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) + # Per cmake documentation, if we're building in LLVM, then in add_custom_command + # the command "clang" will be auto resolved to the path to clang we're building + set(CLANG_NAME clang) + add_dependencies(tensor_runtime clang) endif() # If some clang-9 is found, create a tensor_runtime.ll from tensor_signatures.cc -if(LLVM_CLANG_XX) +if(CLANG_NAME) message(STATUS "Creating tensor_runtime.ll in ${TENSOR_RT_LL_PREFIX}") foreach(dir ${INCLUDES}) list(APPEND INCLUDE_COMPILER_STRINGS "-I${dir}") endforeach() add_custom_command( TARGET tensor_runtime POST_BUILD - COMMAND ${LLVM_CLANG_XX} ${INCLUDE_COMPILER_STRINGS} -S -emit-llvm + COMMAND ${CLANG_NAME} -x c++ ${INCLUDE_COMPILER_STRINGS} -S -emit-llvm ${CMAKE_CURRENT_SOURCE_DIR}/tensor_runtime/include/tensor_signatures.cc -o ${TENSOR_RT_LL_PREFIX}/tensor_runtime.ll ) endif() -# Install version (also offline) -cuda_add_library(tensor_runtime_install ${RUNTIME_SRCS}) -cuda_add_cublas_to_target(tensor_runtime_install) -# tensor_runtime_install is built AFTER tensor_runtime because of a nvcc bug (bug?) -# that doesn't allow compiling the same file from multiple targets at once. -# Same for tensor_runtime_online. 
-add_dependencies(tensor_runtime_install tensor_runtime) -target_link_libraries(tensor_runtime_install ${LINK_LIBS}) -target_compile_definitions(tensor_runtime_install PRIVATE -DONLINE_PROFILING=false -DFP16_tuning=true) - -# Online version -cuda_add_library(tensor_runtime_online ${RUNTIME_SRCS}) -cuda_add_cublas_to_target(tensor_runtime_online ${OpenMP_CXX_FLAGS}) -target_compile_options(tensor_runtime_online PRIVATE ${OpenMP_CXX_FLAGS}) -add_dependencies(tensor_runtime_online tensor_runtime) -target_link_libraries(tensor_runtime_online ${LINK_LIBS} ${OpenMP_CXX_FLAGS}) -target_compile_definitions(tensor_runtime_online PRIVATE -DONLINE_PROFILING=true -DFP16_tuning=false) - - - -# -------------- Unit Test Source ---------------- - -add_executable(unit_tests dnn_sources/src/unit_tests.cc) -target_link_libraries(unit_tests tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - #**************** FP32 TensorRT Source Builds *********** -add_executable(lenet_mnist_fp32 dnn_sources/src/fp32/lenet_mnist.cc) -target_link_libraries(lenet_mnist_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(lenet_mnist_fp32 dnn_sources/src/fp32/lenet_mnist.cc) +target_link_libraries(lenet_mnist_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(alexnet_cifar10_fp32 dnn_sources/src/fp32/alexnet_cifar10.cc) -target_link_libraries(alexnet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(alexnet_cifar10_fp32 dnn_sources/src/fp32/alexnet_cifar10.cc) +target_link_libraries(alexnet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(alexnet2_cifar10_fp32 dnn_sources/src/fp32/alexnet2_cifar10.cc) -target_link_libraries(alexnet2_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(alexnet2_cifar10_fp32 dnn_sources/src/fp32/alexnet2_cifar10.cc) +target_link_libraries(alexnet2_cifar10_fp32 
tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(vgg16_cifar10_fp32 dnn_sources/src/fp32/vgg16_cifar10.cc) -target_link_libraries(vgg16_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(vgg16_cifar10_fp32 dnn_sources/src/fp32/vgg16_cifar10.cc) +target_link_libraries(vgg16_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(resnet18_cifar10_fp32 dnn_sources/src/fp32/resnet18_cifar10.cc) -target_link_libraries(resnet18_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(resnet18_cifar10_fp32 dnn_sources/src/fp32/resnet18_cifar10.cc) +target_link_libraries(resnet18_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(vgg16_cifar100_fp32 dnn_sources/src/fp32/vgg16_cifar100.cc) -target_link_libraries(vgg16_cifar100_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(vgg16_cifar100_fp32 dnn_sources/src/fp32/vgg16_cifar100.cc) +target_link_libraries(vgg16_cifar100_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(mobilenet_cifar10_fp32 dnn_sources/src/fp32/mobilenet.cc) -target_link_libraries(mobilenet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - -add_executable(alexnet_imagenet_fp32 dnn_sources/src/fp32/alexnet_imagenet.cc) -target_link_libraries(alexnet_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - -add_executable(vgg16_imagenet_fp32 dnn_sources/src/fp32/vgg16_imagenet.cc) -target_link_libraries(vgg16_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - -add_executable(resnet50_imagenet_fp32 dnn_sources/src/fp32/resnet50_imagenet.cc) -target_link_libraries(resnet50_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(mobilenet_cifar10_fp32 dnn_sources/src/fp32/mobilenet.cc) 
+target_link_libraries(mobilenet_cifar10_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(alexnet_imagenet_fp32 dnn_sources/src/fp32/alexnet_imagenet.cc) +target_link_libraries(alexnet_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(vgg16_imagenet_fp32 dnn_sources/src/fp32/vgg16_imagenet.cc) +target_link_libraries(vgg16_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(resnet50_imagenet_fp32 dnn_sources/src/fp32/resnet50_imagenet.cc) +target_link_libraries(resnet50_imagenet_fp32 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) #********* FP16 TensorRT Source Builds ****** -add_executable(lenet_mnist_fp16 dnn_sources/src/fp16/lenet_mnist_half.cc) -target_link_libraries(lenet_mnist_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - -add_executable(alexnet_cifar10_fp16 dnn_sources/src/fp16/alexnet_cifar10_half.cc) -target_link_libraries(alexnet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - -add_executable(alexnet2_cifar10_fp16 dnn_sources/src/fp16/alexnet2_cifar10_half.cc) -target_link_libraries(alexnet2_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(lenet_mnist_fp16 dnn_sources/src/fp16/lenet_mnist_half.cc) +target_link_libraries(lenet_mnist_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(resnet18_cifar10_fp16 dnn_sources/src/fp16/resnet18_cifar10_half.cc) -target_link_libraries(resnet18_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(alexnet_cifar10_fp16 dnn_sources/src/fp16/alexnet_cifar10_half.cc) +target_link_libraries(alexnet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(vgg16_cifar10_fp16 dnn_sources/src/fp16/vgg16_cifar10_half.cc) -target_link_libraries(vgg16_cifar10_fp16 tensor_runtime_online 
${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(alexnet2_cifar10_fp16 dnn_sources/src/fp16/alexnet2_cifar10_half.cc) +target_link_libraries(alexnet2_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(vgg16_cifar100_fp16 dnn_sources/src/fp16/vgg16_cifar100_half.cc) -target_link_libraries(vgg16_cifar100_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(resnet18_cifar10_fp16 dnn_sources/src/fp16/resnet18_cifar10_half.cc) +target_link_libraries(resnet18_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -add_executable(mobilenet_cifar10_fp16 dnn_sources/src/fp16/mobilenet_half.cc) -target_link_libraries(mobilenet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(vgg16_cifar10_fp16 dnn_sources/src/fp16/vgg16_cifar10_half.cc) +target_link_libraries(vgg16_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) +add_executable(vgg16_cifar100_fp16 dnn_sources/src/fp16/vgg16_cifar100_half.cc) +target_link_libraries(vgg16_cifar100_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) - - -file(GLOB files "dnn_sources/src/dynamic/*.cpp") -foreach(file ${files}) - get_filename_component(stem ${file} NAME_WE) - add_executable(${stem} ${file}) - target_link_libraries(${stem} tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) -endforeach() - +add_executable(mobilenet_cifar10_fp16 dnn_sources/src/fp16/mobilenet_half.cc) +target_link_libraries(mobilenet_cifar10_fp16 tensor_runtime_online ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB}) diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h deleted file mode 100644 index 4eaf88e6d613c51a5a75ef8ce73b55a3410f1dbd..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h +++ /dev/null @@ -1,148 +0,0 @@ - - 
-#ifndef OP_OVERHEADS_HEADER -#define OP_OVERHEADS_HEADER - - -#include <sstream> -#include "../../tensor_runtime/include/tensor.h" -#include "types.h" - - -float scale_down_factor = 10000.0; -float error_factor = 0.1; -std::string result_str = ""; - - -// TODO: Every routine needs testing - - -// private function -static float getScaledComps(double total_comps, int error_scale){ - - total_comps = total_comps / scale_down_factor; - float comp_scale = 1.0 + (error_factor * error_scale); - total_comps = total_comps / comp_scale; - - return total_comps; -} - - -static void addNormToResult(float comps){ - - std::ostringstream ss; - ss << std::fixed << comps; - - result_str.append( std::string(ss.str()) ); - result_str.append("\t"); -} - - - -static void addCompsToResult(float comps){ - - std::ostringstream ss; - ss << std::fixed << comps; - - result_str.append( std::string(ss.str()) ); - result_str.append("\n"); -} - - -void add_conv_overheads(void* input_ptr, void* filter_ptr, - int strideA, int strideB, int error_scale){ - - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; - -} - - -void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale){ - - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; - - int m = lhs->dims.dim_sizes[0]; - // The rhs last dimension must contain the neurons - int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons - int k = 1; - - // Flattening the dimensions after the batch dimension - for (int j = 1 ; j < lhs->dims.num_dims; j++){ - k = k * lhs->dims.dim_sizes[j]; // input neurons - } - - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; - // Dimension-note: Check if k is same across the two tensors - printf("m = %d, n = %d, k = %d \n", m, n, k); - - if(rhs_k != k){ - printf("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); - abort(); - } - - double total_comps = m * n * rhs_k * 1.0; - float scaled_comps = getScaledComps(total_comps, error_scale); - - 
printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n", - error_scale, total_comps, scaled_comps); - - addCompsToResult(scaled_comps); - -} - - -void add_bias_overheads(void* input_ptr, int error_scale){ - - Tensor* input = (Tensor*) input_ptr; - - double total_comps = input->num_elems; - float scaled_comps = getScaledComps(total_comps, error_scale); - - printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n", - error_scale, total_comps, scaled_comps); - - addCompsToResult(scaled_comps); - -} - - -void add_relu_overheads(void* input_ptr, int error_scale){ - - Tensor* input = (Tensor*) input_ptr; - - double total_comps = input->num_elems; - float scaled_comps = getScaledComps(total_comps, error_scale); - - printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n", - error_scale, total_comps, scaled_comps); - - addCompsToResult(scaled_comps); - -} - -float add_pool_overheads(void* input_ptr, int kernel_size, - int stride_size, int error_scale){ - -} - - -void add_norms(void* norms_ptr){ - - Norm_t* norms = (Norm_t*) norms_ptr; - - addNormToResult(norms->l1_norm); - addNormToResult(norms->l2_norm); - addNormToResult(norms->inf_norm); - -} - -void dump_result(char* file_name){ - - FILE* fp = fopen(file_name, "w+"); - fwrite(result_str.c_str(), 1, result_str.length(), fp); - fclose(fp); -} - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h deleted file mode 100644 index 3e4f64610da64fb04b6270035da8557e940eb7e2..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h +++ /dev/null @@ -1,39 +0,0 @@ - -#ifndef TYPES_HEADER -#define TYPES_HEADER - -/* -struct Dimension_t{ - int num_dims; - size_t* dim_sizes; -}; - - -struct Tensor_t{ - int tensor_id; // used for indexing (in the tensor runtime) - int data_type; // {float_type, double_type, half_type, int_type} - int data_format; // {nchw, nhwc} - void* 
host_data; - size_t num_elems; // Total elements - size_t size_in_bytes; // Total size in bytes - struct Dimension_t dims; -}; - - - -enum Tensor_type_t{ - float_type, - double_type, - half_type, - int_type -}; - - -// NOTE: Currently only NCHW is supported due to limited cuDNN support -enum Tensor_format_t{ - nchw, - nhwc -}; -*/ - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h index 5d1e0e66ad1a3402981682ed97e664ddcc173787..7bcfda70080688387e9bb74e8d25a1174a3e7337 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h @@ -4,9 +4,9 @@ #define UTILS_HEADER #include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> #include <sstream> #include <vector> #include <bits/stdc++.h> @@ -15,737 +15,341 @@ #include <cmath> #include <string.h> - std::vector<float> run_accuracies; std::string model_params_path = "../../../build/model_params/"; - -void printTensorInfo(void* tensor_ptr){ - - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - - if(tensor->gpu_data != NULL){ - printf("Successful cudaMalloc \n"); - } - - printf("tensor dims = %d \n", tensor->dims.num_dims); - printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]); - printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]); - printf("num_elems = %lu \n", tensor->num_elems); -} - - // FIXIT: Move this to debug.h and include in all files -void dumpWeightsToFile(const char* file_name, void* weights_ptr){ +void dumpWeightsToFile(const char *file_name, void *weights_ptr) { - struct Tensor* weights = (Tensor*) weights_ptr; + struct Tensor *weights = (Tensor *)weights_ptr; // Move data back to host hpvm_request_tensor(weights, 0); - - FILE* fp = fopen(file_name, "wb"); - if(fp == NULL){ - printf("File %s could not be created. 
Check if directory exists \n", file_name); + + FILE *fp = fopen(file_name, "wb"); + if (fp == NULL) { + printf("File %s could not be created. Check if directory exists \n", + file_name); abort(); } - //printf("size_in_bytes = %lu \n", weights->size_in_bytes); - size_t bytes_written = fwrite(weights->host_data, 1, weights->size_in_bytes, fp); - //printf("bytes_written = %lu \n", bytes_written); fclose(fp); } +void fillTensorWithOnes(void *tensor_ptr) { + struct Tensor *tensor = (struct Tensor *)tensor_ptr; -void fillTensorWithOnes(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = 1.0; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = 1.0; } } } +void fillWithOnesAndTwos(void *tensor_ptr) { -void fillWithOnesAndTwos(void* tensor_ptr){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ + for (unsigned int i = 0; i < tensor->num_elems; i++) { if (i % 2 == 0) data_arr[i] = 1.0; else - data_arr[i] = 2.0; + data_arr[i] = 2.0; } /*for(unsigned int i = 0; i < tensor->num_elems/2; i++){ - data_arr[i] = 1.0; + data_arr[i] = 1.0; } for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){ - data_arr[i] = 2.0; + data_arr[i] = 2.0; }*/ - } } +void fillTensorWithVal(void 
*tensor_ptr, float target_value) { -void fillTensorWithVal(void* tensor_ptr, float target_value){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = target_value; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = target_value; } } } +void fillTensorWithNegOnes(void *tensor_ptr) { -void fillTensorWithNegOnes(void* tensor_ptr){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - - // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = -1.0; - } - } -} - -void fillTensorVals(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = i + 1; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = -1.0; } } } +void printTensorValues(void *tensor_ptr) { -void printTensorValues(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; hpvm_request_tensor(tensor, 0); - + // printing is specific to the floating point type - if(tensor->data_type != CUDNN_DATA_FLOAT){ - 
//printf("\n WARNING: The tensor is non-float type tensor \n\n"); - } + if (tensor->data_type != CUDNN_DATA_FLOAT) { + // printf("\n WARNING: The tensor is non-float type tensor \n\n"); + } - float* data_arr = (float*) tensor->host_data; + float *data_arr = (float *)tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - printf("%f,", data_arr[i]); + for (unsigned int i = 0; i < tensor->num_elems; i++) { + printf("%f,", data_arr[i]); } - printf("\n"); } +void printTensorDims(void *tensor_ptr) { -void printTensorDims(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; printf("Num_elems = %lu \n", tensor->num_elems); - for (int i = 0; i < tensor->dims.num_dims; i++){ + for (int i = 0; i < tensor->dims.num_dims; i++) { printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]); } } - - -void compareTensors(void* tensor1_ptr, void* tensor2_ptr){ - - struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr; - struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr; - - hpvm_request_tensor(tensor1, 0); - hpvm_request_tensor(tensor2, 0); - - float* tensor_data1 = (float*) tensor1->host_data; - float* tensor_data2 = (float*) tensor2->host_data; - - for(unsigned int i = 0; i < tensor1->num_elems; i++){ - if(tensor_data1[i] != tensor_data2[i]){ - printf("Tensor data mismatch at index %d \n", i); - abort(); - } - } -} - - - -void compareValues(void* tensor_ptr, float* data, size_t num_elems){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - - hpvm_request_tensor(tensor, 0); - - float* tensor_data = (float*) tensor->host_data; - for(unsigned int i = 0; i < num_elems; i++){ - if(tensor_data[i] != data[i]){ - printf("Tensor data mismatch"); - abort(); - } - } -} - - -void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ - - int type_size = 4; // NOTE: Assuming floating point tensors - int num_elems = dim1_size 
* dim2_size * dim3_size * dim4_size; - int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - uint8_t* file_data = (uint8_t*) malloc(sizeof(char) * num_elems); - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - int file_header_size = 16; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting... \n", file_name); - abort(); - } - - - fseek(file, file_header_size, SEEK_CUR); // Skipping the file header - size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file); - - fclose(file); - - for (size_t i = 0; i < num_elems; ++i){ - tensor_data[i] = (float) file_data[i] / 255.0f; - } - - // NOTE: Using NCHW format - struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - - initTensorData(input, tensor_data, size_in_bytes); - // compareValues(input, tensor_data, num_elems); - - return input; -} - - -//*** FIXIT: Move this to CPU-only -struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type, - int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ +struct Tensor *readTrainedWeights(const char *file_name, int data_type, + long int dim1_size, long int dim2_size, + long int dim3_size, long int dim4_size) { // FIXIT: Don't assume floating point types int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - int file_header_size = 0; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting... 
\n", file_name); - abort(); - } - - fseek(file, file_header_size, SEEK_CUR); // Skipping the file header - size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); - - //printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read); - - fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - - initTensorData(weights, tensor_data, size_in_bytes); - //compareValues(weights, tensor_data, num_elems); - free(tensor_data); - - return weights; -} + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); + // printf("size_in_bytes = %lu \n", size_in_bytes); - -struct Tensor* readTrainedWeights(const char* file_name, int data_type, - long int dim1_size, long int dim2_size, - long int dim3_size, long int dim4_size){ - - // FIXIT: Don't assume floating point types - int type_size = 4; // NOTE: Assuming floating point tensors - long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - //printf("size_in_bytes = %lu \n", size_in_bytes); - int file_header_size = 0; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... 
\n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_CUR); // Skipping the file header - size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); + fread(tensor_data, 1, size_in_bytes, file); + fclose(file); - // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read); + struct Tensor *weights = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); - fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - initTensorData(weights, tensor_data, size_in_bytes); - //compareValues(weights, tensor_data, num_elems); + // compareValues(weights, tensor_data, num_elems); free(tensor_data); return weights; } - - - -struct Tensor* readInputBatch(const char* file_name, int data_type, - long int start, long int end, - long int dim2_size, long int dim3_size, long int dim4_size){ +struct Tensor *readInputBatch(const char *file_name, int data_type, + long int start, long int end, long int dim2_size, + long int dim3_size, long int dim4_size) { long int dim1_size = end - start; // FIXIT: Don't assume floating point types long int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - long int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); + long int file_header_size = + type_size * start * dim2_size * dim3_size * dim4_size; + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... 
\n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header - size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); + fread(tensor_data, 1, size_in_bytes, file); + fclose(file); + struct Tensor *weights = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); - fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - initTensorData(weights, tensor_data, size_in_bytes); free(tensor_data); return weights; } - - -void* copyInputBatch(const char* file_name, - int start, int end, - long int dim2_size, long int dim3_size, long int dim4_size, - void* inputTensor_ptr){ - - struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr; - - long int dim1_size = end - start; - // FIXIT: Don't assume floating point types - int type_size = 4; // NOTE: Assuming floating point tensors - long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting... 
\n", file_name); - abort(); - } - - fseek(file, file_header_size, SEEK_SET); // Skipping the file header - size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); - - fclose(file); - - - initTensorData(inputTensor, tensor_data, size_in_bytes); - free(tensor_data); - - printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims); - if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL) - printf("ERROR: NULL data pointers \n"); - - - // Chaning Tensor Placement to HOST - changeTensorPlacement(inputTensor, HOST); - - - return inputTensor; -} - - - -uint8_t* readLabels(const char* labels_file, int num_labels){ - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting...\n", labels_file); - abort(); - } - - size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); - - fclose(file); - - return labels; -} - - - -uint32_t* readLabels3(const char* labels_file, int num_labels){ - - uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting...\n", labels_file); - abort(); - } - - size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file); - - fclose(file); - - return labels; -} - - -uint8_t* readLabelsBatch(const char* labels_file, int start, int end){ +uint8_t *readLabelsBatch(const char *labels_file, int start, int end) { int num_labels = end - start; int file_header_size = sizeof(uint8_t) * start; - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + + uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. 
Aborting...\n", labels_file); abort(); } - - fseek(file, file_header_size, SEEK_SET); // Skipping the file header - - size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header + fread(labels, 1, sizeof(uint8_t) * num_labels, file); fclose(file); - + // printf("--labels bytes_read = %lu \n", bytes_read); return labels; } - -uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){ +uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) { int num_labels = end - start; int file_header_size = sizeof(uint32_t) * start; - - uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + + uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting...\n", labels_file); abort(); } - - fseek(file, file_header_size, SEEK_SET); // Skipping the file header - - size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file); - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header + fread(labels, 1, sizeof(uint32_t) * num_labels, file); fclose(file); - - return labels; -} - - - -void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){ - - struct Tensor* result = (struct Tensor*) result_ptr; - - uint8_t* labels = readLabels(labels_file, num_labels); - size_t batch_dim = result->dims.dim_sizes[0]; - size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - int num_errors = 0; - - for(int i = 0; i < batch_dim; i++){ - int chosen = 0; - for (int id = 1; id < 10; ++id){ - if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; - } - - //printf("chosen = %d, label = %d \n", chosen, labels[i]); - if(chosen != labels[i]) - num_errors++; - } - - float accuracy = ((batch_dim - num_errors) 
* 1.0 / batch_dim * 1.0) * 100.0; - printf("****** Accuracy = %f \n\n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ - - std::ostringstream ss; - ss << std::fixed << accuracy; - std::string print_str = ss.str(); - - fwrite(print_str.c_str(), 1, print_str.length(), fp); - fclose(fp); - } - + return labels; } +// NOTE: batch_size and num_classes are Unused arguments +float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr, + size_t num_classes = 10) { + struct Tensor *result = (struct Tensor *)result_ptr; - -// NOTE: batch_size and num_classes are Unused arguments -float computeAccuracy2(uint8_t* labels, int batch_size, - void* result_ptr, size_t num_classes = 10){ - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes); - - for(unsigned int i = 0; i < batch_dim; i++){ - - int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; - } - - if(chosen != labels[i]) - num_errors++; - - } - - float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; - printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + for (unsigned int i = 0; i < batch_dim; i++) { - std::ostringstream ss; - ss << std::fixed << accuracy; - std::string print_str = ss.str(); - - fwrite(print_str.c_str(), 1, print_str.length(), fp); - } - - fclose(fp); - - return accuracy; -} - - - -float computeAccuracy3(uint32_t* labels, void* result_ptr){ - - struct Tensor* result = (struct Tensor*) result_ptr; - - size_t batch_dim = result->dims.dim_sizes[0]; - size_t num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - int 
num_errors = 0; - - printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes); - - for(int i = 0; i < batch_dim; i++){ - int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; + for (size_t id = 1; id < num_classes; ++id) { + if (data[i * num_classes + chosen] < data[i * num_classes + id]) + chosen = id; } - - if(chosen != labels[i]) + + if (chosen != labels[i]) num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); - return accuracy; + return accuracy; } +float computeAccuracy3(uint32_t *labels, void *result_ptr) { + struct Tensor *result = (struct Tensor *)result_ptr; -struct ClassProb{ - float prob; - int index; -}; - - -bool descendFloatComp(ClassProb obj1, ClassProb obj2){ - return obj1.prob > obj2.prob; -} - - -float computeTop5Accuracy(uint8_t* labels, int num_labels, - void* result_ptr, unsigned num_classes = 10){ - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; - size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + size_t num_classes = result->dims.dim_sizes[1]; + float *data = (float *)result->host_data; int num_errors = 0; - printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); - - for(int i = 0; i < num_labels; i++){ + printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes); - std::vector<ClassProb> elem_probs; - for (int id = 0; id < num_classes; ++id){ - ClassProb cProb; - cProb.prob = data[i * channels + id]; - cProb.index = id; - elem_probs.push_back(cProb); - } + for 
(size_t i = 0; i < batch_dim; i++) { - std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); - // Check if any of top-5 predictions matches - bool matched = false; - for(int j = 0; j < 5; j++){ - ClassProb cProb = elem_probs[j]; - if(cProb.index == labels[i]) - matched = true; + uint32_t chosen = 0; + for (size_t id = 1; id < num_classes; ++id) { + if (data[i * num_classes + chosen] < data[i * num_classes + id]) + chosen = id; } - if(!matched) - num_errors +=1; + if (chosen != labels[i]) + num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); - return accuracy; + return accuracy; } - - - -void dumpFinalAccuracy(float accuracy){ +void dumpFinalAccuracy(float accuracy) { printf("\n\n **** Final Accuracy = %f \n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - - fwrite(print_str.c_str(), 1, print_str.length(), fp); - } - - fclose(fp); - - run_accuracies.push_back(accuracy); -} - - -void dumpAvgPSNR(float avg_psnr){ - - FILE* fp = fopen("avg_psnr", "w+"); - if(fp != NULL){ - std::ostringstream ss; - ss << std::fixed << avg_psnr; - std::string print_str = ss.str(); fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); -} - - -void dumpPSNRStd(float psnr_std){ - FILE* fp = fopen("psnr_std.txt", "w+"); - if(fp != NULL){ - std::ostringstream ss; - ss << std::fixed << psnr_std; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), fp); - } - - fclose(fp); + 
run_accuracies.push_back(accuracy); } +void dumpExecutionAccuracies() { - - - -void dumpExecutionAccuracies(){ - - FILE* fp = fopen("run_accuracies.txt", "w+"); - if(fp != NULL){ - for (int i = 0; i < run_accuracies.size(); i++){ + FILE *fp = fopen("run_accuracies.txt", "w+"); + if (fp != NULL) { + for (size_t i = 0; i < run_accuracies.size(); i++) { float accuracy = run_accuracies[i]; std::ostringstream ss; ss << std::fixed << accuracy; @@ -753,242 +357,8 @@ void dumpExecutionAccuracies(){ fwrite(print_str.c_str(), 1, print_str.length(), fp); fwrite("\n", 1, 1, fp); } - - } - - fclose(fp); -} - - -float readPSNRFromFile(const char* file_name){ - - float psnr; - FILE* pFile = fopen(file_name, "r"); - if(pFile == NULL){ - printf("ERROR: psnr.txt not found! \n"); - abort(); - } - - fscanf(pFile, "%f", &psnr); - printf("**** PSNR read = %f \n\n", psnr); - return psnr; -} - - -float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){ - - - PSNR_threshold = readPSNRFromFile("psnr.txt"); - std::vector<float> psnr_list; - - struct Tensor* gold_tensor = (struct Tensor*) gold_ptr; - struct Tensor* approx_tensor = (struct Tensor*) approx_ptr; - - size_t* dim_sizes = gold_tensor->dims.dim_sizes; - size_t batch_dim = dim_sizes[0]; - size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; - - printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size); - - float* gold_data = (float*) gold_tensor->host_data; - float* approx_data = (float*) approx_tensor->host_data; - - FILE* fp = fopen("img_psnr.txt", "w+"); - - float sum_psnr = 0.0; - int num_errors = 0; - for(size_t i = 0; i < batch_dim; i++){ - float mse_sum = 0.0; - float max_val = -999999; - size_t offset = i * image_size; - - for(size_t j = 0; j < image_size; j++){ - float diff = gold_data[offset + j] - approx_data[offset + j]; - float diff_square = diff * diff; - mse_sum += diff_square; - - if(max_val < gold_data[offset + j]){ - max_val = gold_data[offset + j]; - } - } - - 
mse_sum = mse_sum / image_size; - float psnr = 20 * log10(255 / sqrt(mse_sum)); - - sum_psnr += psnr; - if (psnr < PSNR_threshold) - num_errors += 1; - - printf("PSNR value = %f \n", psnr); - psnr_list.push_back(psnr); - - std::ostringstream ss; - ss << std::fixed << psnr; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), fp); - fwrite("\n", 1, 1, fp); } - float violation_rate = (num_errors * 1.0) / batch_dim * 100.0; - printf("*** violation_rate= %f \n\n", violation_rate); - - float avg_psnr = sum_psnr / batch_dim; - printf("*** avg_psnr = %f \n\n", avg_psnr); - dumpAvgPSNR(avg_psnr); - - float success_rate = 100.0 - violation_rate; - dumpFinalAccuracy(success_rate); - fclose(fp); - - - float var = 0.0; - for(size_t i = 0; i < batch_dim; i++){ - var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); - } - - var /= batch_dim; - float std = sqrt(var); - - dumpPSNRStd(std); - - return violation_rate; -} - - -void dumpOutput(void* output_ptr, const char* file_name){ - - struct Tensor* out_tensor = (struct Tensor*) output_ptr; - size_t size_in_bytes = out_tensor->size_in_bytes; - printf ("** Output size = %lu \n", size_in_bytes); - - float* host_data = (float*) out_tensor->host_data; - FILE* fd = fopen(file_name, "w+"); - fwrite(host_data, 1, size_in_bytes, fd); - fclose(fd); -} - - - - - -void copyClassConfsAndLabels(void* result_ptr, - float* classConfs, - int* predictedLabels, - int start, int end){ - - - struct Tensor* result = (struct Tensor*) result_ptr; - - size_t num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - - - int it_count = end - start; - for(int i = 0; i < it_count; i++){ - - int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; - } - - predictedLabels[start + i] = chosen; - classConfs[start + i] = data[i * num_classes + chosen]; - } - - -} - - -void dumpClassConfsAndLabels(float* 
classConfs, - int* predictedLabels, - uint32_t* goldLabels, - int test_input_size){ - - FILE* labels_fp = fopen("predicted_confs_labels.txt", "w+"); - - for (int i = 0; i < test_input_size; i++){ - - int label = predictedLabels[i]; - int gold_label = (int) goldLabels[i]; - float conf = classConfs[i]; - - std::ostringstream ss; - ss << std::fixed << conf; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), labels_fp); - fwrite(" ", 1, 1, labels_fp); - - - std::ostringstream label_ss; - label_ss << label; - std::string label_str = label_ss.str(); - fwrite(label_str.c_str(), 1, label_str.length(), labels_fp); - fwrite(" ", 1, 1, labels_fp); - - - std::ostringstream gold_ss; - gold_ss << gold_label; - std::string gold_str = gold_ss.str(); - fwrite(gold_str.c_str(), 1, gold_str.length(), labels_fp); - fwrite("\n", 1, 1, labels_fp); - - - } - - fclose(labels_fp); } - - - - - -/**** Routines for Handling Piped Execution ***/ -void stallOnOpenTunerSignal(){ - - const char* myfifo = "/tmp/opentuner_fifo"; - int fd = open(myfifo, O_RDONLY); - if (fd == -1){ - printf("OpenTuner pipe could not be opened \n"); - abort(); - } - - int ret_val = fcntl(fd, F_GETFD); - if(ret_val == -1){ - printf("Invalid descriptor \n"); - abort(); - } - - char str[100]; - read(fd, str, 100); - readOpenTunerFlags("promise_flags"); - - - if(strcmp(str, "stop_run") == 0){ - abort(); - } - - close(fd); -} - - - -void signalPipeToOpenTuner(){ - - const char* myfifo = "/tmp/opentuner_fifo"; - int fd_out = open(myfifo, O_WRONLY); - int ret_val = fcntl(fd_out, F_GETFD); - if(ret_val == -1){ - printf("Invalid descriptor \n"); - abort(); - } - - const char* str = "completed***!\n\0"; - write(fd_out, str, 80); - close(fd_out); -} - - - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h deleted file mode 100644 index 
45ef7211a4c04f15d1763fde729b4ca550851008..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h +++ /dev/null @@ -1,467 +0,0 @@ - -// Header guards -#ifndef UTILS_HEADER -#define UTILS_HEADER - - -#include <sstream> -#include <vector> -#include <bits/stdc++.h> -#include "../../tensor_runtime/include/tensor_cpu.h" -#include "../../tensor_runtime/include/tensor_cpu_runtime.h" -//#include "types.h" -#include <cmath> -#include <stdint.h> - - -std::vector<float> run_accuracies; - - -void printTensorInfo(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - - if(tensor->gpu_data != NULL){ - printf("Successful cudaMalloc \n"); - } - - printf("tensor dims = %d \n", tensor->dims.num_dims); - printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]); - printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]); - printf("num_elems = %lu \n", tensor->num_elems); -} - - - -void printTensorDims(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - - printf("Num_elems = %lu \n", tensor->num_elems); - for (int i = 0; i < tensor->dims.num_dims; i++){ - printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]); - } -} - - - -void compareTensors(void* tensor1_ptr, void* tensor2_ptr){ - - struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr; - struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr; - - //hpvm_request_tensor(tensor1, 0); - //hpvm_request_tensor(tensor2, 0); - - float* tensor_data1 = (float*) tensor1->host_data; - float* tensor_data2 = (float*) tensor2->host_data; - - for(unsigned int i = 0; i < tensor1->num_elems; i++){ - if(tensor_data1[i] != tensor_data2[i]){ - printf("Tensor data mismatch at index %d \n", i); - abort(); - } - } -} - - - -//*** FIXIT: Move this to CPU-only -struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type, - int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ - - // FIXIT: Don't assume floating point types - int 
type_size = 4; // NOTE: Assuming floating point tensors - long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - int file_header_size = 0; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting... \n", file_name); - abort(); - } - - fseek(file, file_header_size, SEEK_CUR); // Skipping the file header - size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); - - printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read); - - fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - - initTensorData(weights, tensor_data, size_in_bytes); - //compareValues(weights, tensor_data, num_elems); - free(tensor_data); - - return weights; -} - - -struct Tensor* readTrainedWeights(const char* file_name, int data_type, - int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ - - return readTrainedWeightsCPU(file_name, data_type, dim1_size, dim2_size, dim3_size, dim4_size); -} - - - -uint8_t* readLabels(const char* labels_file, int num_labels){ - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ - printf("Data file %s is not found. Aborting...\n", labels_file); - abort(); - } - - size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); - - fclose(file); - - return labels; -} - - -uint8_t* readLabelsBatch(const char* labels_file, int start, int end){ - - int num_labels = end - start; - int file_header_size = sizeof(uint8_t) * start; - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ - printf("Data file %s is not found. 
Aborting...\n", labels_file); - abort(); - } - - fseek(file, file_header_size, SEEK_SET); // Skipping the file header - - size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); - - - fclose(file); - - return labels; -} - - - -void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){ - - struct Tensor* result = (struct Tensor*) result_ptr; - - uint8_t* labels = readLabels(labels_file, num_labels); - size_t batch_dim = result->dims.dim_sizes[0]; - size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - int num_errors = 0; - - for(int i = 0; i < batch_dim; i++){ - int chosen = 0; - for (int id = 1; id < 10; ++id){ - if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; - } - - if(chosen != labels[i]) - num_errors++; - } - - float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; - printf("****** Accuracy = %f \n\n", accuracy); - - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ - fprintf(fp, "%f", accuracy); - fclose(fp); - } - -} - - - -float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){ - - unsigned num_zeros = 0; - - struct Tensor* result = (struct Tensor*) result_ptr; - - size_t batch_dim = result->dims.dim_sizes[0]; - size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - int num_errors = 0; - - printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); - - for(int i = 0; i < num_labels; i++){ - int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; - } - - if(labels[i] == 0) - num_zeros++; - - if(chosen != labels[i]) - num_errors++; - } - - float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; - printf("****** Accuracy = %f \n\n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ - fprintf(fp, "%f", accuracy); - } - - 
fclose(fp); - - return accuracy; -} - - -struct ClassProb{ - float prob; - int index; -}; - - -bool descendFloatComp(ClassProb obj1, ClassProb obj2){ - return obj1.prob > obj2.prob; -} - - -float computeTop5Accuracy(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){ - - struct Tensor* result = (struct Tensor*) result_ptr; - - size_t batch_dim = result->dims.dim_sizes[0]; - size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; - int num_errors = 0; - - printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); - - for(int i = 0; i < num_labels; i++){ - - std::vector<ClassProb> elem_probs; - for (int id = 0; id < num_classes; ++id){ - ClassProb cProb; - cProb.prob = data[i * channels + id]; - cProb.index = id; - elem_probs.push_back(cProb); - } - - std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); - // Check if any of top-5 predictions matches - bool matched = false; - for(int j = 0; j < 5; j++){ - ClassProb cProb = elem_probs[j]; - if(cProb.index == labels[i]) - matched = true; - } - - if(!matched) - num_errors +=1; - } - - float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; - printf("****** Accuracy = %f \n\n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ - fprintf(fp, "%f", accuracy); - } - - fclose(fp); - - return accuracy; -} - - - - -void dumpFinalAccuracy(float accuracy){ - - printf("\n\n **** Final Accuracy = %f \n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ - fprintf(fp, "%f", accuracy); - } - - fclose(fp); - - run_accuracies.push_back(accuracy); -} - - - -/*void dumpAvgPSNR(float avg_psnr){ - - FILE* fp = fopen("avg_psnr", "w+"); - if(fp != NULL){ - std::ostringstream ss; - ss << std::fixed << avg_psnr; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), fp); - } - - fclose(fp); -} -*/ - -/*void dumpPSNRStd(float psnr_std){ - - FILE* fp = 
fopen("psnr_std.txt", "w+"); - if(fp != NULL){ - std::ostringstream ss; - ss << std::fixed << psnr_std; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), fp); - } - - fclose(fp); -}*/ - - - - -/* -void dumpExecutionAccuracies(){ - - FILE* fp = fopen("run_accuracies.txt", "w+"); - if(fp != NULL){ - for (int i = 0; i < run_accuracies.size(); i++){ - float accuracy = run_accuracies[i]; - std::ostringstream ss; - ss << std::fixed << accuracy; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), fp); - fwrite("\n", 1, 1, fp); - } - - } - - fclose(fp); -} -*/ - -float readPSNRFromFile(const char* file_name){ - - float psnr; - FILE* pFile = fopen(file_name, "r"); - if(pFile == NULL){ - printf("ERROR: psnr.txt not found! \n"); - abort(); - } - - fscanf(pFile, "%f", &psnr); - printf("**** PSNR read = %f \n\n", psnr); - return psnr; -} - - -/*float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){ - - - PSNR_threshold = readPSNRFromFile("psnr.txt"); - std::vector<float> psnr_list; - - struct Tensor* gold_tensor = (struct Tensor*) gold_ptr; - struct Tensor* approx_tensor = (struct Tensor*) approx_ptr; - - size_t* dim_sizes = gold_tensor->dims.dim_sizes; - size_t batch_dim = dim_sizes[0]; - size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; - - printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size); - - float* gold_data = (float*) gold_tensor->host_data; - float* approx_data = (float*) approx_tensor->host_data; - - FILE* fp = fopen("img_psnr.txt", "w+"); - - float sum_psnr = 0.0; - int num_errors = 0; - for(size_t i = 0; i < batch_dim; i++){ - float mse_sum = 0.0; - float max_val = -999999; - size_t offset = i * image_size; - - for(size_t j = 0; j < image_size; j++){ - float diff = gold_data[offset + j] - approx_data[offset + j]; - float diff_square = diff * diff; - mse_sum += diff_square; - - if(max_val < gold_data[offset + j]){ - max_val = 
gold_data[offset + j]; - } - } - - mse_sum = mse_sum / image_size; - float psnr = 20 * log10(255 / sqrt(mse_sum)); - - sum_psnr += psnr; - if (psnr < PSNR_threshold) - num_errors += 1; - - printf("PSNR value = %f \n", psnr); - psnr_list.push_back(psnr); - - std::ostringstream ss; - ss << std::fixed << psnr; - std::string print_str = ss.str(); - fwrite(print_str.c_str(), 1, print_str.length(), fp); - fwrite("\n", 1, 1, fp); - } - - float violation_rate = (num_errors * 1.0) / batch_dim * 100.0; - printf("*** violation_rate= %f \n\n", violation_rate); - - float avg_psnr = sum_psnr / batch_dim; - printf("*** avg_psnr = %f \n\n", avg_psnr); - dumpAvgPSNR(avg_psnr); - - float success_rate = 100.0 - violation_rate; - dumpFinalAccuracy(success_rate); - - fclose(fp); - - - float var = 0.0; - for(size_t i = 0; i < batch_dim; i++){ - var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); - } - - var /= batch_dim; - float std = sqrt(var); - - //dumpPSNRStd(std); - - return violation_rate; -}*/ - - -void dumpOutput(void* output_ptr, const char* file_name){ - - struct Tensor* out_tensor = (struct Tensor*) output_ptr; - size_t size_in_bytes = out_tensor->size_in_bytes; - printf ("** Output size = %lu \n", size_in_bytes); - - float* host_data = (float*) out_tensor->host_data; - FILE* fd = fopen(file_name, "w+"); - fwrite(host_data, 1, size_in_bytes, fd); - fclose(fd); -} - - - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc index d93110945b1d1a70ec29c7788d9133dc16551ee5..8133e86ef9735932607b5548cec5910a907f7b3c 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc @@ -1,60 +1,64 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> - 
#include "../../../tensor_runtime/include/tensor_runtime.h" #include "../../include/utils.h" - - /* NOTE: Reference Architecture to use for profiling */ -void testCifarNet(){ +void testCifarNet() { printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); - - std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_5_b_path = dir_prefix + 
std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + std::string dir_prefix = + model_params_path + std::string("/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_3_b_path = 
dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = + 0; // NOTE: using Float as compute precision. 
FIXIT: use enum startMemTracking(); @@ -65,61 +69,61 @@ void testCifarNet(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv1out, conv2d_1_b); - void* conv1_tanh = tensorHalfTanh(conv1out); - + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv1out, conv2d_1_b); + void *conv1_tanh = tensorHalfTanh(conv1out); + // 2nd Layer - void* conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv2out, conv2d_2_b); - void* conv2_tanh = tensorHalfTanh(conv2out); - void* pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv2out, conv2d_2_b); + void *conv2_tanh = tensorHalfTanh(conv2out); + void *pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + // 3rd Layer - void* conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv3out, conv2d_3_b); - void* conv3_tanh = tensorHalfTanh(conv3out); + void *conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv3out, conv2d_3_b); + void *conv3_tanh = tensorHalfTanh(conv3out); // 4th Layer - void* conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv4out, conv2d_4_b); - void* conv4_tanh = tensorHalfTanh(conv4out); - void* pool4out = 
tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv4out, conv2d_4_b); + void *conv4_tanh = tensorHalfTanh(conv4out); + void *pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + // 5th Layer - void* conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv5out, conv2d_5_b); - void* conv5_tanh = tensorHalfTanh(conv5out); + void *conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv5out, conv2d_5_b); + void *conv5_tanh = tensorHalfTanh(conv5out); // 6th Layer - void* conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, - conv_mode, conv_precision); + void *conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, + conv_mode, conv_precision); tensorHalfAdd(conv6out, conv2d_6_b); - void* conv6_tanh = tensorHalfTanh(conv6out); - void* pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv6_tanh = tensorHalfTanh(conv6out); + void *pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + // final FC Layer - void* gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w); - void* gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b); - void* result = tensorSoftmax(gemm1biasout); + void *gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w); + void *gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b); + void *result = tensorSoftmax(gemm1biasout); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); - float accuracy = computeAccuracy2(labels, batch_size, result); + float accuracy = computeAccuracy2(labels, batch_size, result); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -127,11 +131,9 @@ void testCifarNet(){ final_accuracy = final_accuracy / batch_count; 
dumpFinalAccuracy(final_accuracy); - } - -int main(int argc, char* argv[]){ +int main(int argc, char *argv[]) { llvm_hpvm_initTensorRt(0); @@ -141,4 +143,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc index b7695bbd7a24712e335f0cf8bbd25290f3261dea..020ad6d578bea8acae8cce5373bdf37ec7df1fd9 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc @@ -1,49 +1,52 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../../tensor_runtime/include/tensor_runtime.h" -#include "../../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); - - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 
0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 
192, 64, 5, 5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -54,40 +57,40 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); - void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); - void* var_2 = tensorHalfTanh(var_1); - void* var_3 = 
tensorHalfPooling(var_2,0,2,2,0,0,2,2); - void* var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); - void* var_6 = tensorHalfAdd(var_5, conv2d_2_b); - void* var_7 = tensorHalfTanh(var_6); - void* var_8 = tensorHalfPooling(var_7,0,2,2,0,0,2,2); - void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); - void* var_12 = tensorHalfTanh(var_11); - void* var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_14 = tensorHalfAdd(var_13, conv2d_4_b); - void* var_15 = tensorHalfTanh(var_14); - void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); - void* var_18 = tensorHalfTanh(var_17); - void* var_19 = tensorHalfPooling(var_18,0,2,2,0,0,2,2); - void* var_22 = tensorHalfGemmGPU(var_19, dense_1_w); - void* var_23 = tensorHalfAdd(var_22, dense_1_b); - void* var_24 = tensorSoftmax(var_23); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_24); + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); + void *var_1 = tensorHalfAdd(var_0, conv2d_1_b); + void *var_2 = tensorHalfTanh(var_1); + void *var_3 = tensorHalfPooling(var_2, 0, 2, 2, 0, 0, 2, 2); + void *var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); + void *var_6 = tensorHalfAdd(var_5, conv2d_2_b); + void *var_7 = tensorHalfTanh(var_6); + void *var_8 = tensorHalfPooling(var_7, 0, 2, 2, 0, 0, 2, 2); + void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorHalfAdd(var_10, conv2d_3_b); + void *var_12 = tensorHalfTanh(var_11); + void *var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_14 = tensorHalfAdd(var_13, conv2d_4_b); + void *var_15 = tensorHalfTanh(var_14); + void 
*var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorHalfAdd(var_16, conv2d_5_b); + void *var_18 = tensorHalfTanh(var_17); + void *var_19 = tensorHalfPooling(var_18, 0, 2, 2, 0, 0, 2, 2); + void *var_22 = tensorHalfGemmGPU(var_19, dense_1_w); + void *var_23 = tensorHalfAdd(var_22, dense_1_b); + void *var_24 = tensorSoftmax(var_23); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_24); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -96,9 +99,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); + llvm_hpvm_cleanupTensorRt(); - llvm_hpvm_cleanupTensorRt(); - - return 0; - + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc index 29f392c630a36a6044c5f804e5d3a7b252591831..0fb39cbe84af998ad42c9c14915e272aa3dab88d 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc @@ -1,115 +1,101 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> - - #include "tensor_runtime.h" #include "utils.h" - /* NOTE: Reference Architecture to use for profiling */ -void testLenetTanh(){ +void testLenetTanh() { int total_runs = 1; printf("********* Lenet-2 Architecture ********** \n"); // FIXIT: Extend this to batch of images - currently 5 images int test_batch_size = 5000; - std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + 
std::string("labels32.bin"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - // Loading Input Batch - void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); - - void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin", - float_type, 32, 1, 5, 5); - void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin", - float_type, 1, 32, 1, 1); - void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin", - float_type, 64, 32, 5, 5); - void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin", - float_type, 1, 64, 1, 1); - void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", - float_type, 1, 1, 7*7*64, 1024); - void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin", - float_type, 1, 1024, 1, 1); - void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", - float_type, 1, 1, 1024, 10); - void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin", - float_type, 1, 10, 1, 1); - - - + void *input = + readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size); + + void *conv1_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5); + void *conv1_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1); + void *conv2_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5); + void *conv2_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1); + void *fc1_weights = 
readTrainedWeights("../model_params/lenet_mnist/fc1.bin", + float_type, 1, 1, 7 * 7 * 64, 1024); + void *fc1_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1); + void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", + float_type, 1, 1, 1024, 10); + void *fc2_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1); + clearTensorMap(); - - for(int i = 0; i < total_runs; i++){ + + for (int i = 0; i < total_runs; i++) { readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters - // Start power and performnce profiling + // Start power and performnce profiling startProfiling(); - + int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + int conv_precision = + 0; // NOTE: using Float as compute precision. FIXIT: use enum // NOTE: 'SAME' convolution - void* conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + void *conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1, + conv_mode, conv_precision); - // NOTE: For tensorAdd, the only dimension that MUST match is channels + // NOTE: For tensorAdd, the only dimension that MUST match is channels tensorHalfAdd(conv1out, conv1_bias); // NOTE: In place operation - void* pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); + void *pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); - void* conv1_tanh = tensorHalfTanh(pool1out); + void *conv1_tanh = tensorHalfTanh(pool1out); - // NOTE: input channels have to match between tensor op inputs and outputs - void* conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + // NOTE: input channels have to match between tensor op inputs and outputs + void *conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, + conv_mode, conv_precision); 
tensorHalfAdd(conv2out, conv2_bias); // NOTE: In place operation - void* pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + void *pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + + void *conv2_tanh = tensorHalfTanh(pool2out); - void* conv2_tanh = tensorHalfTanh(pool2out); + void *gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights); - void* gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights); + void *gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias); - void* gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias); + void *tanh1out = tensorHalfTanh(gemm1biasout); - void* tanh1out = tensorHalfTanh(gemm1biasout); - - void* gemm2out = tensorHalfGemm(tanh1out, fc2_weights); - - void* gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias); + void *gemm2out = tensorHalfGemm(tanh1out, fc2_weights); - void* tanh2out = tensorHalfTanh(gemm2_biasout); - - void* result = tensorSoftmax(tanh2out); + void *gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias); + + void *tanh2out = tensorHalfTanh(gemm2_biasout); + + void *result = tensorSoftmax(tanh2out); // End profiling and dump output to profile.txt stopProfiling(); - + computeAccuracy2(labels, test_batch_size, result); - + dumpAccuracyNorms(); freeOutputTensors(); } - - - } - -int main(int argc, char* argv[]){ +int main(int argc, char *argv[]) { llvm_hpvm_initTensorRt(0); testLenetTanh(); @@ -118,4 +104,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc index d662dc1584c7810d8d3631d5ac16c427c3ff8b02..7722447047aaac6dc679fb02c16e6b2c20c2c049 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc @@ -1,411 +1,725 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> + #include 
"../../../tensor_runtime/include/tensor_runtime.h" #include "../../include/utils.h" -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = model_params_path + std::string("/mobilenet/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); - void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); - void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); - void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); - void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); - std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin"); - void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); - std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); - void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); - void* batch_normalization_2_beta = 
readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); - void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); - void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); - std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); - void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); - void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); - void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); - void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); - std::string depthwise_conv2d_2_w_path = dir_prefix + std::string("depthwise_conv2d_2_w.bin"); - void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); - std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); - void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); - 
std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); - void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); - void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); - void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); - std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin"); - void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); - void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); - void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); - void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin"); - void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_6_gamma_path = dir_prefix + 
std::string("batch_normalization_6_gamma.bin"); - void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); - void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); - void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); - void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); - std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); - void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); - void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); - void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); - void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); - void* 
depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); - void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); - void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); - void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); - void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); - std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); - void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); - void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); - void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); - void* batch_normalization_9_variance = 
readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); - void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); - void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); - void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); - void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); - void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); - std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); - void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); - void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); - void* batch_normalization_11_mean = 
readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); - void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_6_w_path = dir_prefix + std::string("depthwise_conv2d_6_w.bin"); - void* depthwise_conv2d_6_w = readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); - void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); - void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); - void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); - void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); - std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin"); - void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); - void* batch_normalization_13_beta = 
readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); - void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin"); - void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_7_w_path = dir_prefix + std::string("depthwise_conv2d_7_w.bin"); - void* depthwise_conv2d_7_w = readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); - void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); - void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); - void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_variance_path = dir_prefix + std::string("batch_normalization_14_variance.bin"); - void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_15_gamma_path = dir_prefix + std::string("batch_normalization_15_gamma.bin"); - void* batch_normalization_15_gamma = 
readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); - void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); - void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); - void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_8_w_path = dir_prefix + std::string("depthwise_conv2d_8_w.bin"); - void* depthwise_conv2d_8_w = readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); - void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); - void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); - void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_variance_path = dir_prefix + std::string("batch_normalization_16_variance.bin"); - void* batch_normalization_16_variance = readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = 
readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); - void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); - void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); - void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); - void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_9_w_path = dir_prefix + std::string("depthwise_conv2d_9_w.bin"); - void* depthwise_conv2d_9_w = readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); - void* batch_normalization_18_gamma = readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); - void* batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); - void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); - void* batch_normalization_18_variance = 
readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); - void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); - void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); - void* batch_normalization_19_mean = readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); - void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_10_w_path = dir_prefix + std::string("depthwise_conv2d_10_w.bin"); - void* depthwise_conv2d_10_w = readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); - void* batch_normalization_20_gamma = readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); - void* batch_normalization_20_beta = readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); - void* batch_normalization_20_mean = 
readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); - void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); - void* batch_normalization_21_gamma = readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); - void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); - void* batch_normalization_21_mean = readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); - void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_11_w_path = dir_prefix + std::string("depthwise_conv2d_11_w.bin"); - void* depthwise_conv2d_11_w = readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); - void* batch_normalization_22_gamma = readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); - void* batch_normalization_22_beta = 
readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); - void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); - void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); - void* batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); - void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); - void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); - void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_12_w_path = dir_prefix + std::string("depthwise_conv2d_12_w.bin"); - void* depthwise_conv2d_12_w = readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); - void* batch_normalization_24_gamma = 
readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); - void* batch_normalization_24_beta = readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); - void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); - void* batch_normalization_24_variance = readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); - std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); - void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_beta_path = dir_prefix + std::string("batch_normalization_25_beta.bin"); - void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); - void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); - void* batch_normalization_25_variance = readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); - std::string depthwise_conv2d_13_w_path = dir_prefix + std::string("depthwise_conv2d_13_w.bin"); - void* depthwise_conv2d_13_w = 
readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); - std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); - void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); - void* batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); - void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); - void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); - std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); - void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); - void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); - void* batch_normalization_27_mean = readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_variance_path = dir_prefix + std::string("batch_normalization_27_variance.bin"); - void* batch_normalization_27_variance = 
readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - +int main() { + llvm_hpvm_initTensorRt(0); - startMemTracking(); + std::string dir_prefix = model_params_path + std::string("/mobilenet/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string batch_normalization_1_gamma_path = + dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void *batch_normalization_1_gamma = readTrainedWeights( + batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_beta_path = + dir_prefix + std::string("batch_normalization_1_beta.bin"); + void *batch_normalization_1_beta = readTrainedWeights( + batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_mean_path = + dir_prefix + std::string("batch_normalization_1_mean.bin"); + void *batch_normalization_1_mean = readTrainedWeights( + batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_variance_path = + dir_prefix + std::string("batch_normalization_1_variance.bin"); + void *batch_normalization_1_variance = readTrainedWeights( + batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string depthwise_conv2d_1_w_path = + dir_prefix + std::string("depthwise_conv2d_1_w.bin"); + void *depthwise_conv2d_1_w = + readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3); + std::string batch_normalization_2_gamma_path = + 
dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void *batch_normalization_2_gamma = readTrainedWeights( + batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_beta_path = + dir_prefix + std::string("batch_normalization_2_beta.bin"); + void *batch_normalization_2_beta = readTrainedWeights( + batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_mean_path = + dir_prefix + std::string("batch_normalization_2_mean.bin"); + void *batch_normalization_2_mean = readTrainedWeights( + batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_variance_path = + dir_prefix + std::string("batch_normalization_2_variance.bin"); + void *batch_normalization_2_variance = readTrainedWeights( + batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1); + std::string batch_normalization_3_gamma_path = + dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void *batch_normalization_3_gamma = readTrainedWeights( + batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_beta_path = + dir_prefix + std::string("batch_normalization_3_beta.bin"); + void *batch_normalization_3_beta = readTrainedWeights( + batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_mean_path = + dir_prefix + std::string("batch_normalization_3_mean.bin"); + void *batch_normalization_3_mean = readTrainedWeights( + batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_variance_path = + dir_prefix + std::string("batch_normalization_3_variance.bin"); + void *batch_normalization_3_variance = readTrainedWeights( + batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string 
depthwise_conv2d_2_w_path = + dir_prefix + std::string("depthwise_conv2d_2_w.bin"); + void *depthwise_conv2d_2_w = + readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3); + std::string batch_normalization_4_gamma_path = + dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void *batch_normalization_4_gamma = readTrainedWeights( + batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_beta_path = + dir_prefix + std::string("batch_normalization_4_beta.bin"); + void *batch_normalization_4_beta = readTrainedWeights( + batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_mean_path = + dir_prefix + std::string("batch_normalization_4_mean.bin"); + void *batch_normalization_4_mean = readTrainedWeights( + batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_variance_path = + dir_prefix + std::string("batch_normalization_4_variance.bin"); + void *batch_normalization_4_variance = readTrainedWeights( + batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1); + std::string batch_normalization_5_gamma_path = + dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void *batch_normalization_5_gamma = readTrainedWeights( + batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_beta_path = + dir_prefix + std::string("batch_normalization_5_beta.bin"); + void *batch_normalization_5_beta = readTrainedWeights( + batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_mean_path = + dir_prefix + std::string("batch_normalization_5_mean.bin"); + void *batch_normalization_5_mean = readTrainedWeights( + batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string 
batch_normalization_5_variance_path = + dir_prefix + std::string("batch_normalization_5_variance.bin"); + void *batch_normalization_5_variance = readTrainedWeights( + batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_3_w_path = + dir_prefix + std::string("depthwise_conv2d_3_w.bin"); + void *depthwise_conv2d_3_w = + readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_6_gamma_path = + dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void *batch_normalization_6_gamma = readTrainedWeights( + batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_beta_path = + dir_prefix + std::string("batch_normalization_6_beta.bin"); + void *batch_normalization_6_beta = readTrainedWeights( + batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_mean_path = + dir_prefix + std::string("batch_normalization_6_mean.bin"); + void *batch_normalization_6_mean = readTrainedWeights( + batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_variance_path = + dir_prefix + std::string("batch_normalization_6_variance.bin"); + void *batch_normalization_6_variance = readTrainedWeights( + batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1); + std::string batch_normalization_7_gamma_path = + dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void *batch_normalization_7_gamma = readTrainedWeights( + batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_beta_path = + dir_prefix + std::string("batch_normalization_7_beta.bin"); + void *batch_normalization_7_beta = readTrainedWeights( + batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1); + 
std::string batch_normalization_7_mean_path = + dir_prefix + std::string("batch_normalization_7_mean.bin"); + void *batch_normalization_7_mean = readTrainedWeights( + batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_variance_path = + dir_prefix + std::string("batch_normalization_7_variance.bin"); + void *batch_normalization_7_variance = readTrainedWeights( + batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_4_w_path = + dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void *depthwise_conv2d_4_w = + readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_8_gamma_path = + dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void *batch_normalization_8_gamma = readTrainedWeights( + batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_beta_path = + dir_prefix + std::string("batch_normalization_8_beta.bin"); + void *batch_normalization_8_beta = readTrainedWeights( + batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_mean_path = + dir_prefix + std::string("batch_normalization_8_mean.bin"); + void *batch_normalization_8_mean = readTrainedWeights( + batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_variance_path = + dir_prefix + std::string("batch_normalization_8_variance.bin"); + void *batch_normalization_8_variance = readTrainedWeights( + batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1); + std::string batch_normalization_9_gamma_path = + dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void *batch_normalization_9_gamma = readTrainedWeights( + batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1); 
+ std::string batch_normalization_9_beta_path = + dir_prefix + std::string("batch_normalization_9_beta.bin"); + void *batch_normalization_9_beta = readTrainedWeights( + batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_mean_path = + dir_prefix + std::string("batch_normalization_9_mean.bin"); + void *batch_normalization_9_mean = readTrainedWeights( + batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_variance_path = + dir_prefix + std::string("batch_normalization_9_variance.bin"); + void *batch_normalization_9_variance = readTrainedWeights( + batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_5_w_path = + dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void *depthwise_conv2d_5_w = + readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_10_gamma_path = + dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void *batch_normalization_10_gamma = readTrainedWeights( + batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_beta_path = + dir_prefix + std::string("batch_normalization_10_beta.bin"); + void *batch_normalization_10_beta = readTrainedWeights( + batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_mean_path = + dir_prefix + std::string("batch_normalization_10_mean.bin"); + void *batch_normalization_10_mean = readTrainedWeights( + batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_variance_path = + dir_prefix + std::string("batch_normalization_10_variance.bin"); + void *batch_normalization_10_variance = readTrainedWeights( + batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 
256, 256, 1, 1); + std::string batch_normalization_11_gamma_path = + dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void *batch_normalization_11_gamma = readTrainedWeights( + batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_beta_path = + dir_prefix + std::string("batch_normalization_11_beta.bin"); + void *batch_normalization_11_beta = readTrainedWeights( + batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_mean_path = + dir_prefix + std::string("batch_normalization_11_mean.bin"); + void *batch_normalization_11_mean = readTrainedWeights( + batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_variance_path = + dir_prefix + std::string("batch_normalization_11_variance.bin"); + void *batch_normalization_11_variance = readTrainedWeights( + batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_6_w_path = + dir_prefix + std::string("depthwise_conv2d_6_w.bin"); + void *depthwise_conv2d_6_w = + readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_12_gamma_path = + dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void *batch_normalization_12_gamma = readTrainedWeights( + batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_beta_path = + dir_prefix + std::string("batch_normalization_12_beta.bin"); + void *batch_normalization_12_beta = readTrainedWeights( + batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_mean_path = + dir_prefix + std::string("batch_normalization_12_mean.bin"); + void *batch_normalization_12_mean = readTrainedWeights( + batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_variance_path = + dir_prefix + std::string("batch_normalization_12_variance.bin"); + void 
*batch_normalization_12_variance = readTrainedWeights( + batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1); + std::string batch_normalization_13_gamma_path = + dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void *batch_normalization_13_gamma = readTrainedWeights( + batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_beta_path = + dir_prefix + std::string("batch_normalization_13_beta.bin"); + void *batch_normalization_13_beta = readTrainedWeights( + batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_mean_path = + dir_prefix + std::string("batch_normalization_13_mean.bin"); + void *batch_normalization_13_mean = readTrainedWeights( + batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_variance_path = + dir_prefix + std::string("batch_normalization_13_variance.bin"); + void *batch_normalization_13_variance = readTrainedWeights( + batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_7_w_path = + dir_prefix + std::string("depthwise_conv2d_7_w.bin"); + void *depthwise_conv2d_7_w = + readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_14_gamma_path = + dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void *batch_normalization_14_gamma = readTrainedWeights( + batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_beta_path = + dir_prefix + std::string("batch_normalization_14_beta.bin"); + void *batch_normalization_14_beta = readTrainedWeights( + batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_mean_path = + dir_prefix + 
std::string("batch_normalization_14_mean.bin"); + void *batch_normalization_14_mean = readTrainedWeights( + batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_variance_path = + dir_prefix + std::string("batch_normalization_14_variance.bin"); + void *batch_normalization_14_variance = readTrainedWeights( + batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_15_gamma_path = + dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void *batch_normalization_15_gamma = readTrainedWeights( + batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_beta_path = + dir_prefix + std::string("batch_normalization_15_beta.bin"); + void *batch_normalization_15_beta = readTrainedWeights( + batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_mean_path = + dir_prefix + std::string("batch_normalization_15_mean.bin"); + void *batch_normalization_15_mean = readTrainedWeights( + batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_variance_path = + dir_prefix + std::string("batch_normalization_15_variance.bin"); + void *batch_normalization_15_variance = readTrainedWeights( + batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_8_w_path = + dir_prefix + std::string("depthwise_conv2d_8_w.bin"); + void *depthwise_conv2d_8_w = + readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_16_gamma_path = + dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void *batch_normalization_16_gamma = readTrainedWeights( + batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string 
batch_normalization_16_beta_path = + dir_prefix + std::string("batch_normalization_16_beta.bin"); + void *batch_normalization_16_beta = readTrainedWeights( + batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_mean_path = + dir_prefix + std::string("batch_normalization_16_mean.bin"); + void *batch_normalization_16_mean = readTrainedWeights( + batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_variance_path = + dir_prefix + std::string("batch_normalization_16_variance.bin"); + void *batch_normalization_16_variance = readTrainedWeights( + batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_17_gamma_path = + dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void *batch_normalization_17_gamma = readTrainedWeights( + batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_beta_path = + dir_prefix + std::string("batch_normalization_17_beta.bin"); + void *batch_normalization_17_beta = readTrainedWeights( + batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_mean_path = + dir_prefix + std::string("batch_normalization_17_mean.bin"); + void *batch_normalization_17_mean = readTrainedWeights( + batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_variance_path = + dir_prefix + std::string("batch_normalization_17_variance.bin"); + void *batch_normalization_17_variance = readTrainedWeights( + batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_9_w_path = + dir_prefix + std::string("depthwise_conv2d_9_w.bin"); + void *depthwise_conv2d_9_w = + readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 
512, 1, 3, 3); + std::string batch_normalization_18_gamma_path = + dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void *batch_normalization_18_gamma = readTrainedWeights( + batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_beta_path = + dir_prefix + std::string("batch_normalization_18_beta.bin"); + void *batch_normalization_18_beta = readTrainedWeights( + batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_mean_path = + dir_prefix + std::string("batch_normalization_18_mean.bin"); + void *batch_normalization_18_mean = readTrainedWeights( + batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_variance_path = + dir_prefix + std::string("batch_normalization_18_variance.bin"); + void *batch_normalization_18_variance = readTrainedWeights( + batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_19_gamma_path = + dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void *batch_normalization_19_gamma = readTrainedWeights( + batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_beta_path = + dir_prefix + std::string("batch_normalization_19_beta.bin"); + void *batch_normalization_19_beta = readTrainedWeights( + batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_mean_path = + dir_prefix + std::string("batch_normalization_19_mean.bin"); + void *batch_normalization_19_mean = readTrainedWeights( + batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_variance_path = + dir_prefix + std::string("batch_normalization_19_variance.bin"); + void *batch_normalization_19_variance = 
readTrainedWeights( + batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_10_w_path = + dir_prefix + std::string("depthwise_conv2d_10_w.bin"); + void *depthwise_conv2d_10_w = + readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_20_gamma_path = + dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void *batch_normalization_20_gamma = readTrainedWeights( + batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_beta_path = + dir_prefix + std::string("batch_normalization_20_beta.bin"); + void *batch_normalization_20_beta = readTrainedWeights( + batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_mean_path = + dir_prefix + std::string("batch_normalization_20_mean.bin"); + void *batch_normalization_20_mean = readTrainedWeights( + batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_variance_path = + dir_prefix + std::string("batch_normalization_20_variance.bin"); + void *batch_normalization_20_variance = readTrainedWeights( + batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_21_gamma_path = + dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void *batch_normalization_21_gamma = readTrainedWeights( + batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_beta_path = + dir_prefix + std::string("batch_normalization_21_beta.bin"); + void *batch_normalization_21_beta = readTrainedWeights( + batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_mean_path = + dir_prefix + std::string("batch_normalization_21_mean.bin"); + void 
*batch_normalization_21_mean = readTrainedWeights( + batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_variance_path = + dir_prefix + std::string("batch_normalization_21_variance.bin"); + void *batch_normalization_21_variance = readTrainedWeights( + batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_11_w_path = + dir_prefix + std::string("depthwise_conv2d_11_w.bin"); + void *depthwise_conv2d_11_w = + readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_22_gamma_path = + dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void *batch_normalization_22_gamma = readTrainedWeights( + batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_beta_path = + dir_prefix + std::string("batch_normalization_22_beta.bin"); + void *batch_normalization_22_beta = readTrainedWeights( + batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_mean_path = + dir_prefix + std::string("batch_normalization_22_mean.bin"); + void *batch_normalization_22_mean = readTrainedWeights( + batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_variance_path = + dir_prefix + std::string("batch_normalization_22_variance.bin"); + void *batch_normalization_22_variance = readTrainedWeights( + batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_23_gamma_path = + dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void *batch_normalization_23_gamma = readTrainedWeights( + batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_beta_path = + dir_prefix + 
std::string("batch_normalization_23_beta.bin"); + void *batch_normalization_23_beta = readTrainedWeights( + batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_mean_path = + dir_prefix + std::string("batch_normalization_23_mean.bin"); + void *batch_normalization_23_mean = readTrainedWeights( + batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_variance_path = + dir_prefix + std::string("batch_normalization_23_variance.bin"); + void *batch_normalization_23_variance = readTrainedWeights( + batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_12_w_path = + dir_prefix + std::string("depthwise_conv2d_12_w.bin"); + void *depthwise_conv2d_12_w = + readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_24_gamma_path = + dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void *batch_normalization_24_gamma = readTrainedWeights( + batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_beta_path = + dir_prefix + std::string("batch_normalization_24_beta.bin"); + void *batch_normalization_24_beta = readTrainedWeights( + batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_mean_path = + dir_prefix + std::string("batch_normalization_24_mean.bin"); + void *batch_normalization_24_mean = readTrainedWeights( + batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_variance_path = + dir_prefix + std::string("batch_normalization_24_variance.bin"); + void *batch_normalization_24_variance = readTrainedWeights( + batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1); + std::string 
batch_normalization_25_gamma_path = + dir_prefix + std::string("batch_normalization_25_gamma.bin"); + void *batch_normalization_25_gamma = readTrainedWeights( + batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_beta_path = + dir_prefix + std::string("batch_normalization_25_beta.bin"); + void *batch_normalization_25_beta = readTrainedWeights( + batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_mean_path = + dir_prefix + std::string("batch_normalization_25_mean.bin"); + void *batch_normalization_25_mean = readTrainedWeights( + batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_variance_path = + dir_prefix + std::string("batch_normalization_25_variance.bin"); + void *batch_normalization_25_variance = readTrainedWeights( + batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string depthwise_conv2d_13_w_path = + dir_prefix + std::string("depthwise_conv2d_13_w.bin"); + void *depthwise_conv2d_13_w = + readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3); + std::string batch_normalization_26_gamma_path = + dir_prefix + std::string("batch_normalization_26_gamma.bin"); + void *batch_normalization_26_gamma = readTrainedWeights( + batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_beta_path = + dir_prefix + std::string("batch_normalization_26_beta.bin"); + void *batch_normalization_26_beta = readTrainedWeights( + batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_mean_path = + dir_prefix + std::string("batch_normalization_26_mean.bin"); + void *batch_normalization_26_mean = readTrainedWeights( + batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_variance_path = + dir_prefix + std::string("batch_normalization_26_variance.bin"); + void 
*batch_normalization_26_variance = readTrainedWeights( + batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1); + std::string batch_normalization_27_gamma_path = + dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void *batch_normalization_27_gamma = readTrainedWeights( + batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_beta_path = + dir_prefix + std::string("batch_normalization_27_beta.bin"); + void *batch_normalization_27_beta = readTrainedWeights( + batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_mean_path = + dir_prefix + std::string("batch_normalization_27_mean.bin"); + void *batch_normalization_27_mean = readTrainedWeights( + batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_variance_path = + dir_prefix + std::string("batch_normalization_27_variance.bin"); + void *batch_normalization_27_variance = readTrainedWeights( + batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); - int test_input_size = 2000; - int batch_size = 1000; - int batch_count = test_input_size / batch_size; + startMemTracking(); - float final_accuracy = 0.0; + int test_input_size = 2000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; - for(int i = 0; i < batch_count; i++){ + float final_accuracy = 0.0; - int start = i * batch_size; - int end = (i + 1) * batch_size; + for (int i = 0; i < batch_count; i++) { - 
void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); + int start = i * batch_size; + int end = (i + 1) * batch_size; - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); - void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); - void* var_2 = tensorHalfRelu(var_1); - void* var_4 = tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); - void* var_5 = tensorHalfBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); - void* var_6 = tensorHalfRelu(var_5); - void* var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); - void* var_8 = tensorHalfBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); - void* var_9 = tensorHalfRelu(var_8); - void* var_11 = tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); - void* var_12 = tensorHalfBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); - void* var_13 = tensorHalfRelu(var_12); - void* var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); - void* var_15 = tensorHalfBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); - void* var_16 = tensorHalfRelu(var_15); - void* var_18 = tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); - void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); - void* var_20 = tensorHalfRelu(var_19); - void* var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); - void* var_22 = 
tensorHalfBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); - void* var_23 = tensorHalfRelu(var_22); - void* var_26 = tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); - void* var_27 = tensorHalfBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); - void* var_28 = tensorHalfRelu(var_27); - void* var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); - void* var_30 = tensorHalfBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); - void* var_31 = tensorHalfRelu(var_30); - void* var_33 = tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); - void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); - void* var_35 = tensorHalfRelu(var_34); - void* var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); - void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); - void* var_38 = tensorHalfRelu(var_37); - void* var_41 = tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); - void* var_42 = tensorHalfBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); - void* var_43 = tensorHalfRelu(var_42); - void* var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); - void* var_45 = tensorHalfBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); - void* var_46 = tensorHalfRelu(var_45); - void* var_48 = 
tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); - void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); - void* var_50 = tensorHalfRelu(var_49); - void* var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); - void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); - void* var_53 = tensorHalfRelu(var_52); - void* var_55 = tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); - void* var_56 = tensorHalfBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); - void* var_57 = tensorHalfRelu(var_56); - void* var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); - void* var_59 = tensorHalfBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); - void* var_60 = tensorHalfRelu(var_59); - void* var_63 = tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); - void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); - void* var_65 = tensorHalfRelu(var_64); - void* var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); - void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); - void* var_68 = tensorHalfRelu(var_67); - void* var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); - void* var_71 = tensorHalfBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, 
batch_normalization_20_mean, batch_normalization_20_variance, 0.001); - void* var_72 = tensorHalfRelu(var_71); - void* var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); - void* var_74 = tensorHalfBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); - void* var_75 = tensorHalfRelu(var_74); - void* var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); - void* var_78 = tensorHalfBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); - void* var_79 = tensorHalfRelu(var_78); - void* var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); - void* var_81 = tensorHalfBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); - void* var_82 = tensorHalfRelu(var_81); - void* var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); - void* var_86 = tensorHalfBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_87 = tensorHalfRelu(var_86); - void* var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); - void* var_89 = tensorHalfBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_90 = tensorHalfRelu(var_89); - void* var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); - void* var_93 = tensorHalfBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); - void* var_94 = tensorHalfRelu(var_93); - void* var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); - void* 
var_96 = tensorHalfBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_97 = tensorHalfRelu(var_96); - void* var_99 = tensorHalfPooling(var_97,1,2,2,0,0,2,2); - void* var_101 = tensorHalfGemmGPU(var_99, dense_1_w); - void* var_102 = tensorHalfAdd(var_101, dense_1_b); - void* var_103 = tensorSoftmax(var_102); + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void *var_1 = tensorHalfBatchNorm( + var_0, batch_normalization_1_gamma, batch_normalization_1_beta, + batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void *var_2 = tensorHalfRelu(var_1); + void *var_4 = + tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); + void *var_5 = tensorHalfBatchNorm( + var_4, batch_normalization_2_gamma, batch_normalization_2_beta, + batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void *var_6 = tensorHalfRelu(var_5); + void *var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void *var_8 = tensorHalfBatchNorm( + var_7, batch_normalization_3_gamma, batch_normalization_3_beta, + batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void *var_9 = tensorHalfRelu(var_8); + void *var_11 = + tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); + void *var_12 = tensorHalfBatchNorm( + var_11, batch_normalization_4_gamma, batch_normalization_4_beta, + batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void *var_13 = tensorHalfRelu(var_12); + void *var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); + void *var_15 = tensorHalfBatchNorm( + var_14, batch_normalization_5_gamma, batch_normalization_5_beta, + batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + 
void *var_16 = tensorHalfRelu(var_15); + void *var_18 = + tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); + void *var_19 = tensorHalfBatchNorm( + var_18, batch_normalization_6_gamma, batch_normalization_6_beta, + batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void *var_20 = tensorHalfRelu(var_19); + void *var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void *var_22 = tensorHalfBatchNorm( + var_21, batch_normalization_7_gamma, batch_normalization_7_beta, + batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void *var_23 = tensorHalfRelu(var_22); + void *var_26 = + tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); + void *var_27 = tensorHalfBatchNorm( + var_26, batch_normalization_8_gamma, batch_normalization_8_beta, + batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void *var_28 = tensorHalfRelu(var_27); + void *var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void *var_30 = tensorHalfBatchNorm( + var_29, batch_normalization_9_gamma, batch_normalization_9_beta, + batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void *var_31 = tensorHalfRelu(var_30); + void *var_33 = + tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); + void *var_34 = tensorHalfBatchNorm( + var_33, batch_normalization_10_gamma, batch_normalization_10_beta, + batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void *var_35 = tensorHalfRelu(var_34); + void *var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void *var_37 = tensorHalfBatchNorm( + var_36, batch_normalization_11_gamma, batch_normalization_11_beta, + batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void *var_38 = tensorHalfRelu(var_37); + void *var_41 = + tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); + void *var_42 = tensorHalfBatchNorm( + var_41, 
batch_normalization_12_gamma, batch_normalization_12_beta, + batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void *var_43 = tensorHalfRelu(var_42); + void *var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); + void *var_45 = tensorHalfBatchNorm( + var_44, batch_normalization_13_gamma, batch_normalization_13_beta, + batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void *var_46 = tensorHalfRelu(var_45); + void *var_48 = + tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); + void *var_49 = tensorHalfBatchNorm( + var_48, batch_normalization_14_gamma, batch_normalization_14_beta, + batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void *var_50 = tensorHalfRelu(var_49); + void *var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void *var_52 = tensorHalfBatchNorm( + var_51, batch_normalization_15_gamma, batch_normalization_15_beta, + batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void *var_53 = tensorHalfRelu(var_52); + void *var_55 = + tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); + void *var_56 = tensorHalfBatchNorm( + var_55, batch_normalization_16_gamma, batch_normalization_16_beta, + batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void *var_57 = tensorHalfRelu(var_56); + void *var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void *var_59 = tensorHalfBatchNorm( + var_58, batch_normalization_17_gamma, batch_normalization_17_beta, + batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void *var_60 = tensorHalfRelu(var_59); + void *var_63 = + tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); + void *var_64 = tensorHalfBatchNorm( + var_63, batch_normalization_18_gamma, batch_normalization_18_beta, + batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void *var_65 = 
tensorHalfRelu(var_64); + void *var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); + void *var_67 = tensorHalfBatchNorm( + var_66, batch_normalization_19_gamma, batch_normalization_19_beta, + batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void *var_68 = tensorHalfRelu(var_67); + void *var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, + 1, 1, 512); + void *var_71 = tensorHalfBatchNorm( + var_70, batch_normalization_20_gamma, batch_normalization_20_beta, + batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void *var_72 = tensorHalfRelu(var_71); + void *var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void *var_74 = tensorHalfBatchNorm( + var_73, batch_normalization_21_gamma, batch_normalization_21_beta, + batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void *var_75 = tensorHalfRelu(var_74); + void *var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, + 1, 1, 512); + void *var_78 = tensorHalfBatchNorm( + var_77, batch_normalization_22_gamma, batch_normalization_22_beta, + batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void *var_79 = tensorHalfRelu(var_78); + void *var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); + void *var_81 = tensorHalfBatchNorm( + var_80, batch_normalization_23_gamma, batch_normalization_23_beta, + batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void *var_82 = tensorHalfRelu(var_81); + void *var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, + 2, 1, 512); + void *var_86 = tensorHalfBatchNorm( + var_85, batch_normalization_24_gamma, batch_normalization_24_beta, + batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void *var_87 = tensorHalfRelu(var_86); + void *var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); + void *var_89 = tensorHalfBatchNorm( + var_88, 
batch_normalization_25_gamma, batch_normalization_25_beta, + batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void *var_90 = tensorHalfRelu(var_89); + void *var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, + 1, 1, 1024); + void *var_93 = tensorHalfBatchNorm( + var_92, batch_normalization_26_gamma, batch_normalization_26_beta, + batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void *var_94 = tensorHalfRelu(var_93); + void *var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void *var_96 = tensorHalfBatchNorm( + var_95, batch_normalization_27_gamma, batch_normalization_27_beta, + batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void *var_97 = tensorHalfRelu(var_96); + void *var_99 = tensorHalfPooling(var_97, 1, 2, 2, 0, 0, 2, 2); + void *var_101 = tensorHalfGemmGPU(var_99, dense_1_w); + void *var_102 = tensorHalfAdd(var_101, dense_1_b); + void *var_103 = tensorSoftmax(var_102); - float accuracy = computeAccuracy2(labels, batch_size, var_103); - final_accuracy += accuracy; - freeBatchMemory(); - } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); - llvm_hpvm_cleanupTensorRt(); + float accuracy = computeAccuracy2(labels, batch_size, var_103); + final_accuracy += accuracy; + freeBatchMemory(); + } + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc index 741c4a443cc9a56c443ec5858aaed5a7d5705268..db8081c6b06e3529d76b13d64f3d25691184024c 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc @@ 
-1,112 +1,149 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../../tensor_runtime/include/tensor_runtime.h" -#include "../../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); - std::string labels_path = dir_prefix + std::string("labels.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* 
conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_11_b_path = dir_prefix + 
std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); - void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); - void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); - void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); - void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); - std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); - void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); - void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); - void* conv2d_16_b = 
readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); - void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); - void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); - void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); - void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); - void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); - void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); - void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); - void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = + model_params_path + std::string("/resnet18_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + // void* input = readTrainedWeights(input_path.c_str(), 0, + // 
batch_size,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + // uint8_t* labels = readLabels(labels_path.c_str(), batch_size); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + 
void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + 
readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void *conv2d_14_b = + readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void *conv2d_15_w = + readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void *conv2d_15_b = + readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void *conv2d_17_w = + readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void *conv2d_17_b = + readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void *conv2d_16_w = + readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void *conv2d_16_b = + readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void *conv2d_18_w = + readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void *conv2d_18_b = + 
readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void *conv2d_19_w = + readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void *conv2d_19_b = + readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void *conv2d_20_w = + readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void *conv2d_20_b = + readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void *conv2d_21_w = + readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void *conv2d_21_b = + readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -117,94 +154,94 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_3 = tensorHalfAdd(var_2, conv2d_1_b); - void* var_4 = tensorHalfRelu(var_3); - void* var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_7 = tensorHalfAdd(var_6, conv2d_2_b); - void* var_8 = 
tensorHalfRelu(var_7); - void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); - void* var_12 = tensorHalfAdd(var_4, var_11); - void* var_13 = tensorHalfRelu(var_12); - void* var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_16 = tensorHalfAdd(var_15, conv2d_4_b); - void* var_17 = tensorHalfRelu(var_16); - void* var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_20 = tensorHalfAdd(var_19, conv2d_5_b); - void* var_21 = tensorHalfAdd(var_13, var_20); - void* var_22 = tensorHalfRelu(var_21); - void* var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorHalfAdd(var_24, conv2d_6_b); - void* var_26 = tensorHalfRelu(var_25); - void* var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorHalfAdd(var_28, conv2d_7_b); - void* var_30 = tensorHalfAdd(var_22, var_29); - void* var_31 = tensorHalfRelu(var_30); - void* var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); - void* var_34 = tensorHalfAdd(var_33, conv2d_8_b); - void* var_35 = tensorHalfRelu(var_34); - void* var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_38 = tensorHalfAdd(var_37, conv2d_9_b); - void* var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); - void* var_41 = tensorHalfAdd(var_40, conv2d_10_b); - void* var_42 = tensorHalfAdd(var_41, var_38); - void* var_43 = tensorHalfRelu(var_42); - void* var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_46 = tensorHalfAdd(var_45, conv2d_11_b); - void* var_47 = tensorHalfRelu(var_46); - void* var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_50 = tensorHalfAdd(var_49, conv2d_12_b); - void* var_51 = tensorHalfAdd(var_43, var_50); - void* var_52 = tensorHalfRelu(var_51); - void* var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 
1, 1, 1, 1, 0); - void* var_55 = tensorHalfAdd(var_54, conv2d_13_b); - void* var_56 = tensorHalfRelu(var_55); - void* var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); - void* var_59 = tensorHalfAdd(var_58, conv2d_14_b); - void* var_60 = tensorHalfAdd(var_52, var_59); - void* var_61 = tensorHalfRelu(var_60); - void* var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); - void* var_64 = tensorHalfAdd(var_63, conv2d_15_b); - void* var_65 = tensorHalfRelu(var_64); - void* var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); - void* var_68 = tensorHalfAdd(var_67, conv2d_16_b); - void* var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); - void* var_71 = tensorHalfAdd(var_70, conv2d_17_b); - void* var_72 = tensorHalfAdd(var_71, var_68); - void* var_73 = tensorHalfRelu(var_72); - void* var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); - void* var_76 = tensorHalfAdd(var_75, conv2d_18_b); - void* var_77 = tensorHalfRelu(var_76); - void* var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); - void* var_80 = tensorHalfAdd(var_79, conv2d_19_b); - void* var_81 = tensorHalfAdd(var_73, var_80); - void* var_82 = tensorHalfRelu(var_81); - void* var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); - void* var_85 = tensorHalfAdd(var_84, conv2d_20_b); - void* var_86 = tensorHalfRelu(var_85); - void* var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); - void* var_89 = tensorHalfAdd(var_88, conv2d_21_b); - void* var_90 = tensorHalfAdd(var_82, var_89); - void* var_91 = tensorHalfRelu(var_90); - void* var_92 = tensorHalfPooling(var_91,1,8,8,0,0,8,8); - void* var_94 = tensorHalfGemmGPU(var_92, dense_1_w); - void* var_95 = tensorHalfAdd(var_94, dense_1_b); - void* var_96 = tensorSoftmax(var_95); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_96); + + void 
*input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_3 = tensorHalfAdd(var_2, conv2d_1_b); + void *var_4 = tensorHalfRelu(var_3); + void *var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_7 = tensorHalfAdd(var_6, conv2d_2_b); + void *var_8 = tensorHalfRelu(var_7); + void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorHalfAdd(var_10, conv2d_3_b); + void *var_12 = tensorHalfAdd(var_4, var_11); + void *var_13 = tensorHalfRelu(var_12); + void *var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_16 = tensorHalfAdd(var_15, conv2d_4_b); + void *var_17 = tensorHalfRelu(var_16); + void *var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_20 = tensorHalfAdd(var_19, conv2d_5_b); + void *var_21 = tensorHalfAdd(var_13, var_20); + void *var_22 = tensorHalfRelu(var_21); + void *var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorHalfAdd(var_24, conv2d_6_b); + void *var_26 = tensorHalfRelu(var_25); + void *var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorHalfAdd(var_28, conv2d_7_b); + void *var_30 = tensorHalfAdd(var_22, var_29); + void *var_31 = tensorHalfRelu(var_30); + void *var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); + void *var_34 = tensorHalfAdd(var_33, conv2d_8_b); + void *var_35 = tensorHalfRelu(var_34); + void *var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_38 = tensorHalfAdd(var_37, conv2d_9_b); + void *var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); + void *var_41 = tensorHalfAdd(var_40, conv2d_10_b); + void *var_42 = tensorHalfAdd(var_41, var_38); + void *var_43 = tensorHalfRelu(var_42); + void *var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 
0); + void *var_46 = tensorHalfAdd(var_45, conv2d_11_b); + void *var_47 = tensorHalfRelu(var_46); + void *var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_50 = tensorHalfAdd(var_49, conv2d_12_b); + void *var_51 = tensorHalfAdd(var_43, var_50); + void *var_52 = tensorHalfRelu(var_51); + void *var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_55 = tensorHalfAdd(var_54, conv2d_13_b); + void *var_56 = tensorHalfRelu(var_55); + void *var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); + void *var_59 = tensorHalfAdd(var_58, conv2d_14_b); + void *var_60 = tensorHalfAdd(var_52, var_59); + void *var_61 = tensorHalfRelu(var_60); + void *var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); + void *var_64 = tensorHalfAdd(var_63, conv2d_15_b); + void *var_65 = tensorHalfRelu(var_64); + void *var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); + void *var_68 = tensorHalfAdd(var_67, conv2d_16_b); + void *var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); + void *var_71 = tensorHalfAdd(var_70, conv2d_17_b); + void *var_72 = tensorHalfAdd(var_71, var_68); + void *var_73 = tensorHalfRelu(var_72); + void *var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); + void *var_76 = tensorHalfAdd(var_75, conv2d_18_b); + void *var_77 = tensorHalfRelu(var_76); + void *var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); + void *var_80 = tensorHalfAdd(var_79, conv2d_19_b); + void *var_81 = tensorHalfAdd(var_73, var_80); + void *var_82 = tensorHalfRelu(var_81); + void *var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); + void *var_85 = tensorHalfAdd(var_84, conv2d_20_b); + void *var_86 = tensorHalfRelu(var_85); + void *var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); + void *var_89 = tensorHalfAdd(var_88, conv2d_21_b); + void *var_90 = tensorHalfAdd(var_82, var_89); + void 
*var_91 = tensorHalfRelu(var_90); + void *var_92 = tensorHalfPooling(var_91, 1, 8, 8, 0, 0, 8, 8); + void *var_94 = tensorHalfGemmGPU(var_92, dense_1_w); + void *var_95 = tensorHalfAdd(var_94, dense_1_b); + void *var_96 = tensorSoftmax(var_95); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_96); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -213,9 +250,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc index 9ac1deea68c693f8baf2df2d9f2b626b3597ad7f..1bd79c7fb71400edd900bceb42413cf4320005fe 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc @@ -1,160 +1,180 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> + #include "../../../tensor_runtime/include/tensor_runtime.h" #include "../../include/utils.h" -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = 
readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + 
std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 
0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); - - - startMemTracking(); - - int test_input_size = 2000; - int batch_size = 1000; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); - void* var_2 = tensorHalfRelu(var_1); - void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); - void* var_6 = tensorHalfRelu(var_5); - void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); - void* var_10 = tensorHalfRelu(var_9); - void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); - void* var_14 = tensorHalfRelu(var_13); - void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); - void* var_18 = tensorHalfRelu(var_17); - void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); - void* var_22 = tensorHalfRelu(var_21); - void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); - void* var_26 = tensorHalfRelu(var_25); - void* var_27 = 
tensorHalfPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); - void* var_30 = tensorHalfRelu(var_29); - void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); - void* var_34 = tensorHalfRelu(var_33); - void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); - void* var_38 = tensorHalfRelu(var_37); - void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); - void* var_42 = tensorHalfRelu(var_41); - void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); - void* var_46 = tensorHalfRelu(var_45); - void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); - void* var_50 = tensorHalfRelu(var_49); - void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); - void* var_55 = tensorHalfAdd(var_54, dense_1_b); - void* var_56 = tensorHalfRelu(var_55); - void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); - void* var_59 = tensorHalfAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); - final_accuracy += accuracy; - freeBatchMemory(); - - } - - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); + std::string input_path = dir_prefix + 
std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + 
readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + 
readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); + + startMemTracking(); + + int test_input_size = 2000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorHalfAdd(var_0, conv2d_1_b); + void *var_2 = tensorHalfRelu(var_1); + void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorHalfAdd(var_4, conv2d_2_b); + void *var_6 = tensorHalfRelu(var_5); + void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorHalfAdd(var_8, conv2d_3_b); + void *var_10 = tensorHalfRelu(var_9); + void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 
1, 1, 1, 0); + void *var_13 = tensorHalfAdd(var_12, conv2d_4_b); + void *var_14 = tensorHalfRelu(var_13); + void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorHalfAdd(var_16, conv2d_5_b); + void *var_18 = tensorHalfRelu(var_17); + void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorHalfAdd(var_20, conv2d_6_b); + void *var_22 = tensorHalfRelu(var_21); + void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorHalfAdd(var_24, conv2d_7_b); + void *var_26 = tensorHalfRelu(var_25); + void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorHalfAdd(var_28, conv2d_8_b); + void *var_30 = tensorHalfRelu(var_29); + void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorHalfAdd(var_32, conv2d_9_b); + void *var_34 = tensorHalfRelu(var_33); + void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorHalfAdd(var_36, conv2d_10_b); + void *var_38 = tensorHalfRelu(var_37); + void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorHalfAdd(var_40, conv2d_11_b); + void *var_42 = tensorHalfRelu(var_41); + void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorHalfAdd(var_44, conv2d_12_b); + void *var_46 = tensorHalfRelu(var_45); + void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorHalfAdd(var_48, conv2d_13_b); + void *var_50 = tensorHalfRelu(var_49); + void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2); + void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w); + void *var_55 = 
tensorHalfAdd(var_54, dense_1_b); + void *var_56 = tensorHalfRelu(var_55); + void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w); + void *var_59 = tensorHalfAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); + final_accuracy += accuracy; + freeBatchMemory(); + } + + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + + llvm_hpvm_cleanupTensorRt(); + + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc index f92bac10e27162fe0bc59c07aa4f9ede542ccd6e..22d2a3614cb668a668f60c7a3941e06d92ebf4de 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc @@ -1,82 +1,103 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../../tensor_runtime/include/tensor_runtime.h" -#include "../../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix 
+ std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string 
conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = 
readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 
256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string 
conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -85,83 +106,82 @@ int main(){ int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; - // Start power and performance profiling + // Start power and performance profiling startProfiling(); - for(int i = 0; i < batch_count; i++){ + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = 
tensorHalfAdd(var_0, conv2d_1_b); - void* var_2 = tensorHalfRelu(var_1); - void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); - void* var_6 = tensorHalfRelu(var_5); - void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); - void* var_10 = tensorHalfRelu(var_9); - void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); - void* var_14 = tensorHalfRelu(var_13); - void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); - void* var_18 = tensorHalfRelu(var_17); - void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); - void* var_22 = tensorHalfRelu(var_21); - void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); - void* var_26 = tensorHalfRelu(var_25); - void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); - void* var_30 = tensorHalfRelu(var_29); - void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); - void* var_34 = tensorHalfRelu(var_33); - void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); - void* var_38 = tensorHalfRelu(var_37); - void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); - void* var_42 = 
tensorHalfRelu(var_41); - void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); - void* var_46 = tensorHalfRelu(var_45); - void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); - void* var_50 = tensorHalfRelu(var_49); - void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); - void* var_55 = tensorHalfAdd(var_54, dense_1_b); - void* var_56 = tensorHalfRelu(var_55); - void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); - void* var_59 = tensorHalfAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_60); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorHalfAdd(var_0, conv2d_1_b); + void *var_2 = tensorHalfRelu(var_1); + void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorHalfAdd(var_4, conv2d_2_b); + void *var_6 = tensorHalfRelu(var_5); + void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorHalfAdd(var_8, conv2d_3_b); + void *var_10 = tensorHalfRelu(var_9); + void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorHalfAdd(var_12, conv2d_4_b); + void *var_14 = tensorHalfRelu(var_13); + void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorHalfAdd(var_16, conv2d_5_b); + void *var_18 = tensorHalfRelu(var_17); + void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 
= tensorHalfAdd(var_20, conv2d_6_b); + void *var_22 = tensorHalfRelu(var_21); + void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorHalfAdd(var_24, conv2d_7_b); + void *var_26 = tensorHalfRelu(var_25); + void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorHalfAdd(var_28, conv2d_8_b); + void *var_30 = tensorHalfRelu(var_29); + void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorHalfAdd(var_32, conv2d_9_b); + void *var_34 = tensorHalfRelu(var_33); + void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorHalfAdd(var_36, conv2d_10_b); + void *var_38 = tensorHalfRelu(var_37); + void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorHalfAdd(var_40, conv2d_11_b); + void *var_42 = tensorHalfRelu(var_41); + void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorHalfAdd(var_44, conv2d_12_b); + void *var_46 = tensorHalfRelu(var_45); + void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorHalfAdd(var_48, conv2d_13_b); + void *var_50 = tensorHalfRelu(var_49); + void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2); + void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w); + void *var_55 = tensorHalfAdd(var_54, dense_1_b); + void *var_56 = tensorHalfRelu(var_55); + void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w); + void *var_59 = tensorHalfAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60); final_accuracy += accuracy; - + freeBatchMemory(); } - // Start power and performance 
profiling + // Start power and performance profiling stopProfiling(); final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc index 50d9747f990d486c4543607d16d4a4ccb88b0517..7e2c4be6335e3de82b0719923554e17b74732b93 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc @@ -1,62 +1,64 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> - #include "../../tensor_runtime/include/tensor_runtime.h" #include "../include/utils.h" - - /* NOTE: Reference Architecture to use for profiling */ -void testCifarNet(){ +void testCifarNet() { printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); - - - std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); - std::string 
conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - - int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. 
FIXIT: use enum + std::string dir_prefix = + model_params_path + std::string("/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void 
*conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = + 0; // NOTE: using Float as compute precision. FIXIT: use enum startMemTracking(); @@ -67,62 +69,61 @@ void testCifarNet(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv1out, conv2d_1_b); - void* conv1_tanh = tensorTanh(conv1out); - + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, conv_mode, + conv_precision); + tensorAdd(conv1out, conv2d_1_b); + void *conv1_tanh = tensorTanh(conv1out); + // 2nd Layer - void* conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv2out, conv2d_2_b); - void* conv2_tanh = tensorTanh(conv2out); - void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv2out, conv2d_2_b); + void *conv2_tanh = tensorTanh(conv2out); + void *pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + // 3rd Layer - void* 
conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv3out, conv2d_3_b); - void* conv3_tanh = tensorTanh(conv3out); + void *conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv3out, conv2d_3_b); + void *conv3_tanh = tensorTanh(conv3out); // 4th Layer - void* conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv4out, conv2d_4_b); - void* conv4_tanh = tensorTanh(conv4out); - void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv4out, conv2d_4_b); + void *conv4_tanh = tensorTanh(conv4out); + void *pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + // 5th Layer - void* conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv5out, conv2d_5_b); - void* conv5_tanh = tensorTanh(conv5out); + void *conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv5out, conv2d_5_b); + void *conv5_tanh = tensorTanh(conv5out); // 6th Layer - void* conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv6out, conv2d_6_b); - void* conv6_tanh = tensorTanh(conv6out); - void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv6out, conv2d_6_b); + void *conv6_tanh = tensorTanh(conv6out); + void *pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + // final FC Layer - void* gemm1out = tensorGemmGPU(pool6out, dense_1_w); - void* gemm1biasout = tensorAdd(gemm1out, dense_1_b); - void* result = tensorSoftmax(gemm1biasout); + void *gemm1out = tensorGemmGPU(pool6out, dense_1_w); + void *gemm1biasout = 
tensorAdd(gemm1out, dense_1_b); + void *result = tensorSoftmax(gemm1biasout); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); - float accuracy = computeAccuracy2(labels, batch_size, result); + float accuracy = computeAccuracy2(labels, batch_size, result); final_accuracy += accuracy; - freeBatchMemory(); } @@ -130,11 +131,9 @@ void testCifarNet(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - } - -int main(int argc, char* argv[]){ +int main(int argc, char *argv[]) { llvm_hpvm_initTensorRt(0); @@ -144,4 +143,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc index 1a76f1ae8ba6059124117b82cd72e8ccd6cdeba6..1cee9b4fa5dd96bf74c4662d0d8edef34f8f2282 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc @@ -1,50 +1,53 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); - - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - 
std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - - + +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + std::string 
conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); + startMemTracking(); int test_input_size = 5000; @@ -54,40 +57,40 @@ int main(){ 
// NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorTanh(var_1); - void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); - void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); - void* var_6 = tensorAdd(var_5, conv2d_2_b); - void* var_7 = tensorTanh(var_6); - void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); - void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorAdd(var_10, conv2d_3_b); - void* var_12 = tensorTanh(var_11); - void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_14 = tensorAdd(var_13, conv2d_4_b); - void* var_15 = tensorTanh(var_14); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorTanh(var_17); - void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); - void* var_22 = tensorGemmGPU(var_19, dense_1_w); - void* var_23 = tensorAdd(var_22, dense_1_b); - void* var_24 = tensorSoftmax(var_23); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_24); + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); + void *var_1 = tensorAdd(var_0, conv2d_1_b); + void *var_2 = tensorTanh(var_1); + void *var_3 = tensorPooling(var_2, 0, 2, 2, 0, 0, 2, 2); + void *var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); + void *var_6 = tensorAdd(var_5, conv2d_2_b); + void *var_7 = tensorTanh(var_6); + void *var_8 = tensorPooling(var_7, 0, 2, 2, 0, 0, 2, 2); + void 
*var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorAdd(var_10, conv2d_3_b); + void *var_12 = tensorTanh(var_11); + void *var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_14 = tensorAdd(var_13, conv2d_4_b); + void *var_15 = tensorTanh(var_14); + void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorAdd(var_16, conv2d_5_b); + void *var_18 = tensorTanh(var_17); + void *var_19 = tensorPooling(var_18, 0, 2, 2, 0, 0, 2, 2); + void *var_22 = tensorGemmGPU(var_19, dense_1_w); + void *var_23 = tensorAdd(var_22, dense_1_b); + void *var_24 = tensorSoftmax(var_23); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_24); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -96,9 +99,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); + llvm_hpvm_cleanupTensorRt(); - llvm_hpvm_cleanupTensorRt(); - - return 0; - + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc index aa518d77a1993ce5f0f47b4a29276aae6e6de0e5..0f8df1a4207502b345aa02835a4d77368a35aa92 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc @@ -1,116 +1,119 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "tensor_runtime.h" -#include "utils.h" - - -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = 
dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,9216,4096); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); - 
std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); - std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); - void* dense_3_w = readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); - std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); - void* dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); - - - - startMemTracking(); - - int test_input_size = 1000; - int batch_size = 100; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); - - void* var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1); - void* var_3 = tensorAdd(var_2, conv2d_1_b); - void* var_4 = tensorRelu(var_3); - void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); - void* var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1); - void* var_8 = tensorAdd(var_7, conv2d_2_b); - void* var_9 = tensorRelu(var_8); - void* var_10 = tensorPooling(var_9,0,3,3,0,0,2,2); - void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); - void* var_12 = tensorAdd(var_11, conv2d_3_b); - void* var_13 = tensorRelu(var_12); - void* var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1); - void* var_15 = tensorAdd(var_14, conv2d_4_b); - void* var_16 = tensorRelu(var_15); - void* var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1); - void* var_18 = tensorAdd(var_17, conv2d_5_b); - void* var_19 = tensorRelu(var_18); - void* var_20 = tensorPooling(var_19,0,3,3,0,0,2,2); - void* var_23 = tensorGemmGPU(var_20, dense_1_w); - void* var_24 = tensorAdd(var_23, dense_1_b); - void* var_25 = tensorRelu(var_24); - void* var_27 = tensorGemmGPU(var_25, dense_2_w); - void* var_28 = tensorAdd(var_27, dense_2_b); - void* var_29 = 
tensorRelu(var_28); - void* var_30 = tensorGemmGPU(var_29, dense_3_w); - void* var_31 = tensorAdd(var_30, dense_3_b); - void* var_32 = tensorSoftmax(var_31); - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy3(labels, var_32); - final_accuracy += accuracy; - freeBatchMemory(); - +#include "tensor_runtime.h" +#include "utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = + std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + 
std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 9216, 4096); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = + readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = + readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); + void *dense_3_w = + readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000); + std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); + void *dense_3_b = + readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); + + startMemTracking(); + + int test_input_size = 1000; + int batch_size = 100; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + + void *var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1); + void *var_3 = tensorAdd(var_2, conv2d_1_b); + void *var_4 = tensorRelu(var_3); + void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2); + void *var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1); + void *var_8 = tensorAdd(var_7, conv2d_2_b); + void *var_9 = tensorRelu(var_8); + void *var_10 = tensorPooling(var_9, 0, 
3, 3, 0, 0, 2, 2); + void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); + void *var_12 = tensorAdd(var_11, conv2d_3_b); + void *var_13 = tensorRelu(var_12); + void *var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1); + void *var_15 = tensorAdd(var_14, conv2d_4_b); + void *var_16 = tensorRelu(var_15); + void *var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1); + void *var_18 = tensorAdd(var_17, conv2d_5_b); + void *var_19 = tensorRelu(var_18); + void *var_20 = tensorPooling(var_19, 0, 3, 3, 0, 0, 2, 2); + void *var_23 = tensorGemmGPU(var_20, dense_1_w); + void *var_24 = tensorAdd(var_23, dense_1_b); + void *var_25 = tensorRelu(var_24); + void *var_27 = tensorGemmGPU(var_25, dense_2_w); + void *var_28 = tensorAdd(var_27, dense_2_b); + void *var_29 = tensorRelu(var_28); + void *var_30 = tensorGemmGPU(var_29, dense_3_w); + void *var_31 = tensorAdd(var_30, dense_3_b); + void *var_32 = tensorSoftmax(var_31); + + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy3(labels, var_32); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + llvm_hpvm_cleanupTensorRt(); - llvm_hpvm_cleanupTensorRt(); - - - return 0; - + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc index 7508f3119eeb469a164fad9741000308e3e8c031..cb6593f7d5cac872159c909c99fbde478729df29 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc @@ -1,124 +1,108 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> - - #include "tensor_runtime.h" 
#include "utils.h" int total_runs = 1; - /* NOTE: Reference Architecture to use for profiling */ -void testLenetTanh(){ +void testLenetTanh() { printf("********* Lenet-2 Architecture ********** \n"); // FIXIT: Extend this to batch of images - currently 5 images int test_batch_size = 5000; - std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - // Loading Input Batch - void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); - - - void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin", - float_type, 32, 1, 5, 5); - void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin", - float_type, 1, 32, 1, 1); - void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin", - float_type, 64, 32, 5, 5); - void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin", - float_type, 1, 64, 1, 1); - void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", - float_type, 1, 1, 7*7*64, 1024); - void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin", - float_type, 1, 1024, 1, 1); - void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", - float_type, 1, 1, 1024, 10); - void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin", - float_type, 1, 10, 1, 1); - - - + void *input = + 
readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size); + + void *conv1_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5); + void *conv1_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1); + void *conv2_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5); + void *conv2_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1); + void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", + float_type, 1, 1, 7 * 7 * 64, 1024); + void *fc1_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1); + void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", + float_type, 1, 1, 1024, 10); + void *fc2_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1); + clearTensorMap(); - - for(int i = 0; i < total_runs; i++){ + + for (int i = 0; i < total_runs; i++) { readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters - // Start power and performnce profiling + // Start power and performnce profiling startProfiling(); - + int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + int conv_precision = + 0; // NOTE: using Float as compute precision. 
FIXIT: use enum // NOTE: 'SAME' convolution - void* conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + void *conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1, + conv_mode, conv_precision); - // NOTE: For tensorAdd, the only dimension that MUST match is channels + // NOTE: For tensorAdd, the only dimension that MUST match is channels tensorAdd(conv1out, conv1_bias); // NOTE: In place operation - void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); + void *pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); - void* conv1_tanh = tensorTanh(pool1out); + void *conv1_tanh = tensorTanh(pool1out); - // NOTE: input channels have to match between tensor op inputs and outputs - void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + // NOTE: input channels have to match between tensor op inputs and outputs + void *conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, + conv_mode, conv_precision); tensorAdd(conv2out, conv2_bias); // NOTE: In place operation - void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + void *pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + + void *conv2_tanh = tensorTanh(pool2out); - void* conv2_tanh = tensorTanh(pool2out); + void *gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights); - void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights); + void *gemm1biasout = tensorAdd(gemm1out, fc1_bias); - void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void *tanh1out = tensorTanh(gemm1biasout); - void* tanh1out = tensorTanh(gemm1biasout); - - void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights); - - void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias); + void *gemm2out = tensorGemmGPU(tanh1out, fc2_weights); - void* tanh2out = tensorTanh(gemm2_biasout); - - void* result = tensorSoftmax(tanh2out); + void *gemm2_biasout = tensorAdd(gemm2out, fc2_bias); + + void *tanh2out = tensorTanh(gemm2_biasout); + + 
void *result = tensorSoftmax(tanh2out); // End profiling and dump output to profile.txt stopProfiling(); - + float accuracy = computeAccuracy2(labels, test_batch_size, result); - dumpFinalAccuracy(accuracy); + dumpFinalAccuracy(accuracy); - - //FIXME: remove the comment below to use piped autotuner - //dumpAccuracyNorms(); - freeOutputTensors(); + // FIXME: remove the comment below to use piped autotuner + // dumpAccuracyNorms(); + freeOutputTensors(); } dumpExecutionAccuracies(); - - } +int main(int argc, char *argv[]) { - -int main(int argc, char* argv[]){ - - if (argc > 1){ + if (argc > 1) { total_runs = atoi(argv[1]); } @@ -130,4 +114,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc index 7c311a568647caa107112bed4982fb57254dc7b3..44336b02e0297f0ecbc37d3dccea8b97e766a357 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc @@ -1,414 +1,725 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> #include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" +#include "../include/utils.h" -int main(){ +int main() { - llvm_hpvm_initTensorRt(0); + llvm_hpvm_initTensorRt(0); + std::string dir_prefix = model_params_path + std::string("/mobilenet/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string batch_normalization_1_gamma_path = + dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void *batch_normalization_1_gamma = readTrainedWeights( + 
batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_beta_path = + dir_prefix + std::string("batch_normalization_1_beta.bin"); + void *batch_normalization_1_beta = readTrainedWeights( + batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_mean_path = + dir_prefix + std::string("batch_normalization_1_mean.bin"); + void *batch_normalization_1_mean = readTrainedWeights( + batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_variance_path = + dir_prefix + std::string("batch_normalization_1_variance.bin"); + void *batch_normalization_1_variance = readTrainedWeights( + batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string depthwise_conv2d_1_w_path = + dir_prefix + std::string("depthwise_conv2d_1_w.bin"); + void *depthwise_conv2d_1_w = + readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3); + std::string batch_normalization_2_gamma_path = + dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void *batch_normalization_2_gamma = readTrainedWeights( + batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_beta_path = + dir_prefix + std::string("batch_normalization_2_beta.bin"); + void *batch_normalization_2_beta = readTrainedWeights( + batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_mean_path = + dir_prefix + std::string("batch_normalization_2_mean.bin"); + void *batch_normalization_2_mean = readTrainedWeights( + batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_variance_path = + dir_prefix + std::string("batch_normalization_2_variance.bin"); + void *batch_normalization_2_variance = readTrainedWeights( + batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + 
readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1); + std::string batch_normalization_3_gamma_path = + dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void *batch_normalization_3_gamma = readTrainedWeights( + batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_beta_path = + dir_prefix + std::string("batch_normalization_3_beta.bin"); + void *batch_normalization_3_beta = readTrainedWeights( + batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_mean_path = + dir_prefix + std::string("batch_normalization_3_mean.bin"); + void *batch_normalization_3_mean = readTrainedWeights( + batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_variance_path = + dir_prefix + std::string("batch_normalization_3_variance.bin"); + void *batch_normalization_3_variance = readTrainedWeights( + batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string depthwise_conv2d_2_w_path = + dir_prefix + std::string("depthwise_conv2d_2_w.bin"); + void *depthwise_conv2d_2_w = + readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3); + std::string batch_normalization_4_gamma_path = + dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void *batch_normalization_4_gamma = readTrainedWeights( + batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_beta_path = + dir_prefix + std::string("batch_normalization_4_beta.bin"); + void *batch_normalization_4_beta = readTrainedWeights( + batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_mean_path = + dir_prefix + std::string("batch_normalization_4_mean.bin"); + void *batch_normalization_4_mean = readTrainedWeights( + batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_variance_path = + dir_prefix + std::string("batch_normalization_4_variance.bin"); + 
void *batch_normalization_4_variance = readTrainedWeights( + batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1); + std::string batch_normalization_5_gamma_path = + dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void *batch_normalization_5_gamma = readTrainedWeights( + batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_beta_path = + dir_prefix + std::string("batch_normalization_5_beta.bin"); + void *batch_normalization_5_beta = readTrainedWeights( + batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_mean_path = + dir_prefix + std::string("batch_normalization_5_mean.bin"); + void *batch_normalization_5_mean = readTrainedWeights( + batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_variance_path = + dir_prefix + std::string("batch_normalization_5_variance.bin"); + void *batch_normalization_5_variance = readTrainedWeights( + batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_3_w_path = + dir_prefix + std::string("depthwise_conv2d_3_w.bin"); + void *depthwise_conv2d_3_w = + readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_6_gamma_path = + dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void *batch_normalization_6_gamma = readTrainedWeights( + batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_beta_path = + dir_prefix + std::string("batch_normalization_6_beta.bin"); + void *batch_normalization_6_beta = readTrainedWeights( + batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_mean_path = + dir_prefix + std::string("batch_normalization_6_mean.bin"); + void 
*batch_normalization_6_mean = readTrainedWeights( + batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_variance_path = + dir_prefix + std::string("batch_normalization_6_variance.bin"); + void *batch_normalization_6_variance = readTrainedWeights( + batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1); + std::string batch_normalization_7_gamma_path = + dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void *batch_normalization_7_gamma = readTrainedWeights( + batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_beta_path = + dir_prefix + std::string("batch_normalization_7_beta.bin"); + void *batch_normalization_7_beta = readTrainedWeights( + batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_mean_path = + dir_prefix + std::string("batch_normalization_7_mean.bin"); + void *batch_normalization_7_mean = readTrainedWeights( + batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_variance_path = + dir_prefix + std::string("batch_normalization_7_variance.bin"); + void *batch_normalization_7_variance = readTrainedWeights( + batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_4_w_path = + dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void *depthwise_conv2d_4_w = + readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_8_gamma_path = + dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void *batch_normalization_8_gamma = readTrainedWeights( + batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_beta_path = + dir_prefix + std::string("batch_normalization_8_beta.bin"); + 
void *batch_normalization_8_beta = readTrainedWeights( + batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_mean_path = + dir_prefix + std::string("batch_normalization_8_mean.bin"); + void *batch_normalization_8_mean = readTrainedWeights( + batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_variance_path = + dir_prefix + std::string("batch_normalization_8_variance.bin"); + void *batch_normalization_8_variance = readTrainedWeights( + batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1); + std::string batch_normalization_9_gamma_path = + dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void *batch_normalization_9_gamma = readTrainedWeights( + batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_beta_path = + dir_prefix + std::string("batch_normalization_9_beta.bin"); + void *batch_normalization_9_beta = readTrainedWeights( + batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_mean_path = + dir_prefix + std::string("batch_normalization_9_mean.bin"); + void *batch_normalization_9_mean = readTrainedWeights( + batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_variance_path = + dir_prefix + std::string("batch_normalization_9_variance.bin"); + void *batch_normalization_9_variance = readTrainedWeights( + batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_5_w_path = + dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void *depthwise_conv2d_5_w = + readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_10_gamma_path = + dir_prefix + std::string("batch_normalization_10_gamma.bin"); 
+ void *batch_normalization_10_gamma = readTrainedWeights( + batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_beta_path = + dir_prefix + std::string("batch_normalization_10_beta.bin"); + void *batch_normalization_10_beta = readTrainedWeights( + batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_mean_path = + dir_prefix + std::string("batch_normalization_10_mean.bin"); + void *batch_normalization_10_mean = readTrainedWeights( + batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_variance_path = + dir_prefix + std::string("batch_normalization_10_variance.bin"); + void *batch_normalization_10_variance = readTrainedWeights( + batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1); + std::string batch_normalization_11_gamma_path = + dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void *batch_normalization_11_gamma = readTrainedWeights( + batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_beta_path = + dir_prefix + std::string("batch_normalization_11_beta.bin"); + void *batch_normalization_11_beta = readTrainedWeights( + batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_mean_path = + dir_prefix + std::string("batch_normalization_11_mean.bin"); + void *batch_normalization_11_mean = readTrainedWeights( + batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_variance_path = + dir_prefix + std::string("batch_normalization_11_variance.bin"); + void *batch_normalization_11_variance = readTrainedWeights( + batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_6_w_path = + dir_prefix 
+ std::string("depthwise_conv2d_6_w.bin"); + void *depthwise_conv2d_6_w = + readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_12_gamma_path = + dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void *batch_normalization_12_gamma = readTrainedWeights( + batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_beta_path = + dir_prefix + std::string("batch_normalization_12_beta.bin"); + void *batch_normalization_12_beta = readTrainedWeights( + batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_mean_path = + dir_prefix + std::string("batch_normalization_12_mean.bin"); + void *batch_normalization_12_mean = readTrainedWeights( + batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_variance_path = + dir_prefix + std::string("batch_normalization_12_variance.bin"); + void *batch_normalization_12_variance = readTrainedWeights( + batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1); + std::string batch_normalization_13_gamma_path = + dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void *batch_normalization_13_gamma = readTrainedWeights( + batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_beta_path = + dir_prefix + std::string("batch_normalization_13_beta.bin"); + void *batch_normalization_13_beta = readTrainedWeights( + batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_mean_path = + dir_prefix + std::string("batch_normalization_13_mean.bin"); + void *batch_normalization_13_mean = readTrainedWeights( + batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string 
batch_normalization_13_variance_path = + dir_prefix + std::string("batch_normalization_13_variance.bin"); + void *batch_normalization_13_variance = readTrainedWeights( + batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_7_w_path = + dir_prefix + std::string("depthwise_conv2d_7_w.bin"); + void *depthwise_conv2d_7_w = + readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_14_gamma_path = + dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void *batch_normalization_14_gamma = readTrainedWeights( + batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_beta_path = + dir_prefix + std::string("batch_normalization_14_beta.bin"); + void *batch_normalization_14_beta = readTrainedWeights( + batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_mean_path = + dir_prefix + std::string("batch_normalization_14_mean.bin"); + void *batch_normalization_14_mean = readTrainedWeights( + batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_variance_path = + dir_prefix + std::string("batch_normalization_14_variance.bin"); + void *batch_normalization_14_variance = readTrainedWeights( + batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_15_gamma_path = + dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void *batch_normalization_15_gamma = readTrainedWeights( + batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_beta_path = + dir_prefix + std::string("batch_normalization_15_beta.bin"); + void *batch_normalization_15_beta = readTrainedWeights( + batch_normalization_15_beta_path.c_str(), 
0, 1, 512, 1, 1); + std::string batch_normalization_15_mean_path = + dir_prefix + std::string("batch_normalization_15_mean.bin"); + void *batch_normalization_15_mean = readTrainedWeights( + batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_variance_path = + dir_prefix + std::string("batch_normalization_15_variance.bin"); + void *batch_normalization_15_variance = readTrainedWeights( + batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_8_w_path = + dir_prefix + std::string("depthwise_conv2d_8_w.bin"); + void *depthwise_conv2d_8_w = + readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_16_gamma_path = + dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void *batch_normalization_16_gamma = readTrainedWeights( + batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_beta_path = + dir_prefix + std::string("batch_normalization_16_beta.bin"); + void *batch_normalization_16_beta = readTrainedWeights( + batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_mean_path = + dir_prefix + std::string("batch_normalization_16_mean.bin"); + void *batch_normalization_16_mean = readTrainedWeights( + batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_variance_path = + dir_prefix + std::string("batch_normalization_16_variance.bin"); + void *batch_normalization_16_variance = readTrainedWeights( + batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_17_gamma_path = + dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void *batch_normalization_17_gamma = readTrainedWeights( + 
batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_beta_path = + dir_prefix + std::string("batch_normalization_17_beta.bin"); + void *batch_normalization_17_beta = readTrainedWeights( + batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_mean_path = + dir_prefix + std::string("batch_normalization_17_mean.bin"); + void *batch_normalization_17_mean = readTrainedWeights( + batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_variance_path = + dir_prefix + std::string("batch_normalization_17_variance.bin"); + void *batch_normalization_17_variance = readTrainedWeights( + batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_9_w_path = + dir_prefix + std::string("depthwise_conv2d_9_w.bin"); + void *depthwise_conv2d_9_w = + readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_18_gamma_path = + dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void *batch_normalization_18_gamma = readTrainedWeights( + batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_beta_path = + dir_prefix + std::string("batch_normalization_18_beta.bin"); + void *batch_normalization_18_beta = readTrainedWeights( + batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_mean_path = + dir_prefix + std::string("batch_normalization_18_mean.bin"); + void *batch_normalization_18_mean = readTrainedWeights( + batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_variance_path = + dir_prefix + std::string("batch_normalization_18_variance.bin"); + void *batch_normalization_18_variance = readTrainedWeights( + batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + 
std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_19_gamma_path = + dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void *batch_normalization_19_gamma = readTrainedWeights( + batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_beta_path = + dir_prefix + std::string("batch_normalization_19_beta.bin"); + void *batch_normalization_19_beta = readTrainedWeights( + batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_mean_path = + dir_prefix + std::string("batch_normalization_19_mean.bin"); + void *batch_normalization_19_mean = readTrainedWeights( + batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_variance_path = + dir_prefix + std::string("batch_normalization_19_variance.bin"); + void *batch_normalization_19_variance = readTrainedWeights( + batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_10_w_path = + dir_prefix + std::string("depthwise_conv2d_10_w.bin"); + void *depthwise_conv2d_10_w = + readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_20_gamma_path = + dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void *batch_normalization_20_gamma = readTrainedWeights( + batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_beta_path = + dir_prefix + std::string("batch_normalization_20_beta.bin"); + void *batch_normalization_20_beta = readTrainedWeights( + batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_mean_path = + dir_prefix + std::string("batch_normalization_20_mean.bin"); + void *batch_normalization_20_mean = readTrainedWeights( + batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string 
batch_normalization_20_variance_path = + dir_prefix + std::string("batch_normalization_20_variance.bin"); + void *batch_normalization_20_variance = readTrainedWeights( + batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_21_gamma_path = + dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void *batch_normalization_21_gamma = readTrainedWeights( + batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_beta_path = + dir_prefix + std::string("batch_normalization_21_beta.bin"); + void *batch_normalization_21_beta = readTrainedWeights( + batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_mean_path = + dir_prefix + std::string("batch_normalization_21_mean.bin"); + void *batch_normalization_21_mean = readTrainedWeights( + batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_variance_path = + dir_prefix + std::string("batch_normalization_21_variance.bin"); + void *batch_normalization_21_variance = readTrainedWeights( + batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_11_w_path = + dir_prefix + std::string("depthwise_conv2d_11_w.bin"); + void *depthwise_conv2d_11_w = + readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_22_gamma_path = + dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void *batch_normalization_22_gamma = readTrainedWeights( + batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_beta_path = + dir_prefix + std::string("batch_normalization_22_beta.bin"); + void *batch_normalization_22_beta = readTrainedWeights( + 
batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_mean_path = + dir_prefix + std::string("batch_normalization_22_mean.bin"); + void *batch_normalization_22_mean = readTrainedWeights( + batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_variance_path = + dir_prefix + std::string("batch_normalization_22_variance.bin"); + void *batch_normalization_22_variance = readTrainedWeights( + batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_23_gamma_path = + dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void *batch_normalization_23_gamma = readTrainedWeights( + batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_beta_path = + dir_prefix + std::string("batch_normalization_23_beta.bin"); + void *batch_normalization_23_beta = readTrainedWeights( + batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_mean_path = + dir_prefix + std::string("batch_normalization_23_mean.bin"); + void *batch_normalization_23_mean = readTrainedWeights( + batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_variance_path = + dir_prefix + std::string("batch_normalization_23_variance.bin"); + void *batch_normalization_23_variance = readTrainedWeights( + batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_12_w_path = + dir_prefix + std::string("depthwise_conv2d_12_w.bin"); + void *depthwise_conv2d_12_w = + readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_24_gamma_path = + dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void 
*batch_normalization_24_gamma = readTrainedWeights( + batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_beta_path = + dir_prefix + std::string("batch_normalization_24_beta.bin"); + void *batch_normalization_24_beta = readTrainedWeights( + batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_mean_path = + dir_prefix + std::string("batch_normalization_24_mean.bin"); + void *batch_normalization_24_mean = readTrainedWeights( + batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_variance_path = + dir_prefix + std::string("batch_normalization_24_variance.bin"); + void *batch_normalization_24_variance = readTrainedWeights( + batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1); + std::string batch_normalization_25_gamma_path = + dir_prefix + std::string("batch_normalization_25_gamma.bin"); + void *batch_normalization_25_gamma = readTrainedWeights( + batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_beta_path = + dir_prefix + std::string("batch_normalization_25_beta.bin"); + void *batch_normalization_25_beta = readTrainedWeights( + batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_mean_path = + dir_prefix + std::string("batch_normalization_25_mean.bin"); + void *batch_normalization_25_mean = readTrainedWeights( + batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_variance_path = + dir_prefix + std::string("batch_normalization_25_variance.bin"); + void *batch_normalization_25_variance = readTrainedWeights( + batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string depthwise_conv2d_13_w_path = + 
dir_prefix + std::string("depthwise_conv2d_13_w.bin"); + void *depthwise_conv2d_13_w = + readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3); + std::string batch_normalization_26_gamma_path = + dir_prefix + std::string("batch_normalization_26_gamma.bin"); + void *batch_normalization_26_gamma = readTrainedWeights( + batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_beta_path = + dir_prefix + std::string("batch_normalization_26_beta.bin"); + void *batch_normalization_26_beta = readTrainedWeights( + batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_mean_path = + dir_prefix + std::string("batch_normalization_26_mean.bin"); + void *batch_normalization_26_mean = readTrainedWeights( + batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_variance_path = + dir_prefix + std::string("batch_normalization_26_variance.bin"); + void *batch_normalization_26_variance = readTrainedWeights( + batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1); + std::string batch_normalization_27_gamma_path = + dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void *batch_normalization_27_gamma = readTrainedWeights( + batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_beta_path = + dir_prefix + std::string("batch_normalization_27_beta.bin"); + void *batch_normalization_27_beta = readTrainedWeights( + batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_mean_path = + dir_prefix + std::string("batch_normalization_27_mean.bin"); + void *batch_normalization_27_mean = readTrainedWeights( + batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1); + 
std::string batch_normalization_27_variance_path = + dir_prefix + std::string("batch_normalization_27_variance.bin"); + void *batch_normalization_27_variance = readTrainedWeights( + batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); - std::string dir_prefix = model_params_path + std::string("/mobilenet/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); - void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); - void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); - void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); - void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); - std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin"); - void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 
0,32,1,3,3); - std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); - void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); - void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); - void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); - void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); - std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); - void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); - void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); - void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); - void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); - std::string depthwise_conv2d_2_w_path = 
dir_prefix + std::string("depthwise_conv2d_2_w.bin"); - void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); - std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); - void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); - void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); - void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); - void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); - std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin"); - void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); - void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); - void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); - void* 
batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin"); - void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); - void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); - void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); - void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); - void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); - std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); - void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); - void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); - void* batch_normalization_7_mean = 
readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); - void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); - void* depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); - void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); - void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); - void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); - void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); - std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); - void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); - void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); 
- std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); - void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); - void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); - void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); - void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); - void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); - void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); - void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); - std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); - void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_beta_path = 
dir_prefix + std::string("batch_normalization_11_beta.bin"); - void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); - void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); - void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_6_w_path = dir_prefix + std::string("depthwise_conv2d_6_w.bin"); - void* depthwise_conv2d_6_w = readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); - void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); - void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); - void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); - void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); - std::string batch_normalization_13_gamma_path = dir_prefix + 
std::string("batch_normalization_13_gamma.bin"); - void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); - void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); - void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin"); - void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_7_w_path = dir_prefix + std::string("depthwise_conv2d_7_w.bin"); - void* depthwise_conv2d_7_w = readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); - void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); - void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); - void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_variance_path = dir_prefix + std::string("batch_normalization_14_variance.bin"); - void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_8_w_path = dir_prefix 
+ std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_15_gamma_path = dir_prefix + std::string("batch_normalization_15_gamma.bin"); - void* batch_normalization_15_gamma = readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); - void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); - void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); - void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_8_w_path = dir_prefix + std::string("depthwise_conv2d_8_w.bin"); - void* depthwise_conv2d_8_w = readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); - void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); - void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); - void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_variance_path = dir_prefix + 
std::string("batch_normalization_16_variance.bin"); - void* batch_normalization_16_variance = readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); - void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); - void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); - void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); - void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_9_w_path = dir_prefix + std::string("depthwise_conv2d_9_w.bin"); - void* depthwise_conv2d_9_w = readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); - void* batch_normalization_18_gamma = readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); - void* batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); - 
void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); - void* batch_normalization_18_variance = readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); - void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); - void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); - void* batch_normalization_19_mean = readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); - void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_10_w_path = dir_prefix + std::string("depthwise_conv2d_10_w.bin"); - void* depthwise_conv2d_10_w = readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); - void* batch_normalization_20_gamma = readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); - void* batch_normalization_20_beta = 
readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); - void* batch_normalization_20_mean = readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); - void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); - void* batch_normalization_21_gamma = readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); - void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); - void* batch_normalization_21_mean = readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); - void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_11_w_path = dir_prefix + std::string("depthwise_conv2d_11_w.bin"); - void* depthwise_conv2d_11_w = readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); - void* batch_normalization_22_gamma = 
readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); - void* batch_normalization_22_beta = readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); - void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); - void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); - void* batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); - void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); - void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); - void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_12_w_path = dir_prefix + std::string("depthwise_conv2d_12_w.bin"); - void* depthwise_conv2d_12_w = 
readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); - void* batch_normalization_24_gamma = readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); - void* batch_normalization_24_beta = readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); - void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); - void* batch_normalization_24_variance = readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); - std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); - void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_beta_path = dir_prefix + std::string("batch_normalization_25_beta.bin"); - void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); - void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); - void* batch_normalization_25_variance = 
readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); - std::string depthwise_conv2d_13_w_path = dir_prefix + std::string("depthwise_conv2d_13_w.bin"); - void* depthwise_conv2d_13_w = readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); - std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); - void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); - void* batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); - void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); - void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); - std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); - void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); - void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); - void* batch_normalization_27_mean = 
readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_variance_path = dir_prefix + std::string("batch_normalization_27_variance.bin"); - void* batch_normalization_27_variance = readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + startMemTracking(); + int test_input_size = 2000; + int batch_size = 2000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + for (int i = 0; i < batch_count; i++) { - startMemTracking(); + int start = i * batch_size; + int end = (i + 1) * batch_size; - int test_input_size = 2000; - int batch_size = 2000; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); - for(int i = 0; i < batch_count; i++){ + void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void *var_1 = tensorBatchNorm( + var_0, batch_normalization_1_gamma, batch_normalization_1_beta, + batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void *var_2 = tensorRelu(var_1); + void *var_4 = + tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); + void *var_5 = tensorBatchNorm( + var_4, batch_normalization_2_gamma, batch_normalization_2_beta, + batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void *var_8 = tensorBatchNorm( + var_7, batch_normalization_3_gamma, batch_normalization_3_beta, + batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void *var_9 = tensorRelu(var_8); + 
void *var_11 = + tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); + void *var_12 = tensorBatchNorm( + var_11, batch_normalization_4_gamma, batch_normalization_4_beta, + batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void *var_13 = tensorRelu(var_12); + void *var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); + void *var_15 = tensorBatchNorm( + var_14, batch_normalization_5_gamma, batch_normalization_5_beta, + batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void *var_16 = tensorRelu(var_15); + void *var_18 = + tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); + void *var_19 = tensorBatchNorm( + var_18, batch_normalization_6_gamma, batch_normalization_6_beta, + batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void *var_20 = tensorRelu(var_19); + void *var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void *var_22 = tensorBatchNorm( + var_21, batch_normalization_7_gamma, batch_normalization_7_beta, + batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void *var_23 = tensorRelu(var_22); + void *var_26 = + tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); + void *var_27 = tensorBatchNorm( + var_26, batch_normalization_8_gamma, batch_normalization_8_beta, + batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void *var_28 = tensorRelu(var_27); + void *var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void *var_30 = tensorBatchNorm( + var_29, batch_normalization_9_gamma, batch_normalization_9_beta, + batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void *var_31 = tensorRelu(var_30); + void *var_33 = + tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); + void *var_34 = tensorBatchNorm( + var_33, batch_normalization_10_gamma, batch_normalization_10_beta, + batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void 
*var_35 = tensorRelu(var_34); + void *var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void *var_37 = tensorBatchNorm( + var_36, batch_normalization_11_gamma, batch_normalization_11_beta, + batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void *var_38 = tensorRelu(var_37); + void *var_41 = + tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); + void *var_42 = tensorBatchNorm( + var_41, batch_normalization_12_gamma, batch_normalization_12_beta, + batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void *var_43 = tensorRelu(var_42); + void *var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); + void *var_45 = tensorBatchNorm( + var_44, batch_normalization_13_gamma, batch_normalization_13_beta, + batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void *var_46 = tensorRelu(var_45); + void *var_48 = + tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); + void *var_49 = tensorBatchNorm( + var_48, batch_normalization_14_gamma, batch_normalization_14_beta, + batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void *var_50 = tensorRelu(var_49); + void *var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void *var_52 = tensorBatchNorm( + var_51, batch_normalization_15_gamma, batch_normalization_15_beta, + batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void *var_53 = tensorRelu(var_52); + void *var_55 = + tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); + void *var_56 = tensorBatchNorm( + var_55, batch_normalization_16_gamma, batch_normalization_16_beta, + batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void *var_57 = tensorRelu(var_56); + void *var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void *var_59 = tensorBatchNorm( + var_58, batch_normalization_17_gamma, batch_normalization_17_beta, + batch_normalization_17_mean, 
batch_normalization_17_variance, 0.001); + void *var_60 = tensorRelu(var_59); + void *var_63 = + tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); + void *var_64 = tensorBatchNorm( + var_63, batch_normalization_18_gamma, batch_normalization_18_beta, + batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void *var_65 = tensorRelu(var_64); + void *var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); + void *var_67 = tensorBatchNorm( + var_66, batch_normalization_19_gamma, batch_normalization_19_beta, + batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void *var_68 = tensorRelu(var_67); + void *var_70 = + tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); + void *var_71 = tensorBatchNorm( + var_70, batch_normalization_20_gamma, batch_normalization_20_beta, + batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void *var_72 = tensorRelu(var_71); + void *var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void *var_74 = tensorBatchNorm( + var_73, batch_normalization_21_gamma, batch_normalization_21_beta, + batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void *var_75 = tensorRelu(var_74); + void *var_77 = + tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); + void *var_78 = tensorBatchNorm( + var_77, batch_normalization_22_gamma, batch_normalization_22_beta, + batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void *var_79 = tensorRelu(var_78); + void *var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); + void *var_81 = tensorBatchNorm( + var_80, batch_normalization_23_gamma, batch_normalization_23_beta, + batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void *var_82 = tensorRelu(var_81); + void *var_85 = + tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); + void *var_86 = tensorBatchNorm( + var_85, 
batch_normalization_24_gamma, batch_normalization_24_beta, + batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void *var_87 = tensorRelu(var_86); + void *var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); + void *var_89 = tensorBatchNorm( + var_88, batch_normalization_25_gamma, batch_normalization_25_beta, + batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void *var_90 = tensorRelu(var_89); + void *var_92 = + tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); + void *var_93 = tensorBatchNorm( + var_92, batch_normalization_26_gamma, batch_normalization_26_beta, + batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void *var_94 = tensorRelu(var_93); + void *var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void *var_96 = tensorBatchNorm( + var_95, batch_normalization_27_gamma, batch_normalization_27_beta, + batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void *var_97 = tensorRelu(var_96); + void *var_99 = tensorPooling(var_97, 1, 2, 2, 0, 0, 2, 2); + void *var_101 = tensorGemmGPU(var_99, dense_1_w); + void *var_102 = tensorAdd(var_101, dense_1_b); + void *var_103 = tensorSoftmax(var_102); - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); - void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); - void* var_2 = tensorRelu(var_1); - void* var_4 = tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); - void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 
1, 1, 1, 1); - void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); - void* var_9 = tensorRelu(var_8); - void* var_11 = tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); - void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); - void* var_13 = tensorRelu(var_12); - void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); - void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); - void* var_16 = tensorRelu(var_15); - void* var_18 = tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); - void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); - void* var_20 = tensorRelu(var_19); - void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); - void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); - void* var_23 = tensorRelu(var_22); - void* var_26 = tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); - void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); - void* var_28 = tensorRelu(var_27); - void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); - void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); - void* var_31 = tensorRelu(var_30); - void* var_33 = tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); - 
void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); - void* var_35 = tensorRelu(var_34); - void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); - void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); - void* var_38 = tensorRelu(var_37); - void* var_41 = tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); - void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); - void* var_43 = tensorRelu(var_42); - void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); - void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); - void* var_46 = tensorRelu(var_45); - void* var_48 = tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); - void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); - void* var_50 = tensorRelu(var_49); - void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); - void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); - void* var_53 = tensorRelu(var_52); - void* var_55 = tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); - void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); - void* var_57 = tensorRelu(var_56); - void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 
1); - void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); - void* var_60 = tensorRelu(var_59); - void* var_63 = tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); - void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); - void* var_65 = tensorRelu(var_64); - void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); - void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); - void* var_68 = tensorRelu(var_67); - void* var_70 = tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); - void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); - void* var_72 = tensorRelu(var_71); - void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); - void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); - void* var_75 = tensorRelu(var_74); - void* var_77 = tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); - void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); - void* var_79 = tensorRelu(var_78); - void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); - void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); - void* var_82 = tensorRelu(var_81); - void* var_85 = tensorConvCutlass(var_82, 
depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); - void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_87 = tensorRelu(var_86); - void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); - void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_90 = tensorRelu(var_89); - void* var_92 = tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); - void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); - void* var_94 = tensorRelu(var_93); - void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); - void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_97 = tensorRelu(var_96); - void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); - void* var_101 = tensorGemmGPU(var_99, dense_1_w); - void* var_102 = tensorAdd(var_101, dense_1_b); - void* var_103 = tensorSoftmax(var_102); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_103); - final_accuracy += accuracy; - freeBatchMemory(); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + float accuracy = computeAccuracy2(labels, batch_size, var_103); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - - llvm_hpvm_cleanupTensorRt(); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git 
a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc index 87b8cd4156ed8d7f882ff7642420c995cd7c3a0f..a7355fb063b37a90ab04d077d1c1b32f26613857 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc @@ -1,112 +1,149 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(1); - - std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); - std::string labels_path = dir_prefix + std::string("labels.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); - 
std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = 
readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); - void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); - void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); - void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); - void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); - std::string conv2d_17_b_path = dir_prefix 
+ std::string("conv2d_17_b.bin"); - void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); - void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); - void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); - void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); - void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); - void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); - void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); - void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); - void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); - void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); - void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); 
+#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(1); + + std::string dir_prefix = + model_params_path + std::string("/resnet18_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + // void* input = readTrainedWeights(input_path.c_str(), 0, + // batch_size,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + // uint8_t* labels = readLabels(labels_path.c_str(), batch_size); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_5_b_path = dir_prefix + 
std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + 
readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void *conv2d_14_b = + readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void *conv2d_15_w = + readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void *conv2d_15_b = + readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void *conv2d_17_w = + readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void *conv2d_17_b = + readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void *conv2d_16_w = + readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void *conv2d_16_b = + 
readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void *conv2d_18_w = + readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void *conv2d_18_b = + readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void *conv2d_19_w = + readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void *conv2d_19_b = + readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void *conv2d_20_w = + readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void *conv2d_20_b = + readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void *conv2d_21_w = + readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void *conv2d_21_b = + readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -117,94 +154,94 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 
0,start,end,3,32,32); - - void* var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_3 = tensorAdd(var_2, conv2d_1_b); - void* var_4 = tensorRelu(var_3); - void* var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_7 = tensorAdd(var_6, conv2d_2_b); - void* var_8 = tensorRelu(var_7); - void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorAdd(var_10, conv2d_3_b); - void* var_12 = tensorAdd(var_4, var_11); - void* var_13 = tensorRelu(var_12); - void* var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_16 = tensorAdd(var_15, conv2d_4_b); - void* var_17 = tensorRelu(var_16); - void* var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_20 = tensorAdd(var_19, conv2d_5_b); - void* var_21 = tensorAdd(var_13, var_20); - void* var_22 = tensorRelu(var_21); - void* var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorAdd(var_24, conv2d_6_b); - void* var_26 = tensorRelu(var_25); - void* var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorAdd(var_28, conv2d_7_b); - void* var_30 = tensorAdd(var_22, var_29); - void* var_31 = tensorRelu(var_30); - void* var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); - void* var_34 = tensorAdd(var_33, conv2d_8_b); - void* var_35 = tensorRelu(var_34); - void* var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_38 = tensorAdd(var_37, conv2d_9_b); - void* var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); - void* var_41 = tensorAdd(var_40, conv2d_10_b); - void* var_42 = tensorAdd(var_41, var_38); - void* var_43 = tensorRelu(var_42); - void* var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_46 = tensorAdd(var_45, conv2d_11_b); - void* var_47 = tensorRelu(var_46); - void* var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_50 = 
tensorAdd(var_49, conv2d_12_b); - void* var_51 = tensorAdd(var_43, var_50); - void* var_52 = tensorRelu(var_51); - void* var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_55 = tensorAdd(var_54, conv2d_13_b); - void* var_56 = tensorRelu(var_55); - void* var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); - void* var_59 = tensorAdd(var_58, conv2d_14_b); - void* var_60 = tensorAdd(var_52, var_59); - void* var_61 = tensorRelu(var_60); - void* var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); - void* var_64 = tensorAdd(var_63, conv2d_15_b); - void* var_65 = tensorRelu(var_64); - void* var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); - void* var_68 = tensorAdd(var_67, conv2d_16_b); - void* var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); - void* var_71 = tensorAdd(var_70, conv2d_17_b); - void* var_72 = tensorAdd(var_71, var_68); - void* var_73 = tensorRelu(var_72); - void* var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); - void* var_76 = tensorAdd(var_75, conv2d_18_b); - void* var_77 = tensorRelu(var_76); - void* var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); - void* var_80 = tensorAdd(var_79, conv2d_19_b); - void* var_81 = tensorAdd(var_73, var_80); - void* var_82 = tensorRelu(var_81); - void* var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); - void* var_85 = tensorAdd(var_84, conv2d_20_b); - void* var_86 = tensorRelu(var_85); - void* var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); - void* var_89 = tensorAdd(var_88, conv2d_21_b); - void* var_90 = tensorAdd(var_82, var_89); - void* var_91 = tensorRelu(var_90); - void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); - void* var_94 = tensorGemmGPU(var_92, dense_1_w); - void* var_95 = tensorAdd(var_94, dense_1_b); - void* var_96 = tensorSoftmax(var_95); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = 
computeAccuracy2(labels,batch_size,var_96); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_3 = tensorAdd(var_2, conv2d_1_b); + void *var_4 = tensorRelu(var_3); + void *var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_7 = tensorAdd(var_6, conv2d_2_b); + void *var_8 = tensorRelu(var_7); + void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorAdd(var_10, conv2d_3_b); + void *var_12 = tensorAdd(var_4, var_11); + void *var_13 = tensorRelu(var_12); + void *var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_16 = tensorAdd(var_15, conv2d_4_b); + void *var_17 = tensorRelu(var_16); + void *var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_20 = tensorAdd(var_19, conv2d_5_b); + void *var_21 = tensorAdd(var_13, var_20); + void *var_22 = tensorRelu(var_21); + void *var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorAdd(var_24, conv2d_6_b); + void *var_26 = tensorRelu(var_25); + void *var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorAdd(var_28, conv2d_7_b); + void *var_30 = tensorAdd(var_22, var_29); + void *var_31 = tensorRelu(var_30); + void *var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); + void *var_34 = tensorAdd(var_33, conv2d_8_b); + void *var_35 = tensorRelu(var_34); + void *var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_38 = tensorAdd(var_37, conv2d_9_b); + void *var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); + void *var_41 = tensorAdd(var_40, conv2d_10_b); + void *var_42 = tensorAdd(var_41, var_38); + void *var_43 = tensorRelu(var_42); + void *var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_46 = tensorAdd(var_45, conv2d_11_b); + void *var_47 = 
tensorRelu(var_46); + void *var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_50 = tensorAdd(var_49, conv2d_12_b); + void *var_51 = tensorAdd(var_43, var_50); + void *var_52 = tensorRelu(var_51); + void *var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_55 = tensorAdd(var_54, conv2d_13_b); + void *var_56 = tensorRelu(var_55); + void *var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); + void *var_59 = tensorAdd(var_58, conv2d_14_b); + void *var_60 = tensorAdd(var_52, var_59); + void *var_61 = tensorRelu(var_60); + void *var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); + void *var_64 = tensorAdd(var_63, conv2d_15_b); + void *var_65 = tensorRelu(var_64); + void *var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); + void *var_68 = tensorAdd(var_67, conv2d_16_b); + void *var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); + void *var_71 = tensorAdd(var_70, conv2d_17_b); + void *var_72 = tensorAdd(var_71, var_68); + void *var_73 = tensorRelu(var_72); + void *var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); + void *var_76 = tensorAdd(var_75, conv2d_18_b); + void *var_77 = tensorRelu(var_76); + void *var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); + void *var_80 = tensorAdd(var_79, conv2d_19_b); + void *var_81 = tensorAdd(var_73, var_80); + void *var_82 = tensorRelu(var_81); + void *var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); + void *var_85 = tensorAdd(var_84, conv2d_20_b); + void *var_86 = tensorRelu(var_85); + void *var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); + void *var_89 = tensorAdd(var_88, conv2d_21_b); + void *var_90 = tensorAdd(var_82, var_89); + void *var_91 = tensorRelu(var_90); + void *var_92 = tensorPooling(var_91, 1, 8, 8, 0, 0, 8, 8); + void *var_94 = tensorGemmGPU(var_92, dense_1_w); + void *var_95 = tensorAdd(var_94, dense_1_b); + void *var_96 = 
tensorSoftmax(var_95); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_96); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -213,9 +250,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc index 0914b3f70c353ee7e56c39ccf52f21914618301e..afa3f0bcc1b08fc4a89c694e8e07e813b352ccbf 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc @@ -1,924 +1,1551 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" -int main(){ +int main() { - llvm_hpvm_initTensorRt(0); + llvm_hpvm_initTensorRt(0); + std::string dir_prefix = + model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_gamma_path = + dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void *batch_normalization_1_gamma = readTrainedWeights( + 
batch_normalization_1_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_beta_path = + dir_prefix + std::string("batch_normalization_1_beta.bin"); + void *batch_normalization_1_beta = readTrainedWeights( + batch_normalization_1_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_mean_path = + dir_prefix + std::string("batch_normalization_1_mean.bin"); + void *batch_normalization_1_mean = readTrainedWeights( + batch_normalization_1_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_variance_path = + dir_prefix + std::string("batch_normalization_1_variance.bin"); + void *batch_normalization_1_variance = readTrainedWeights( + batch_normalization_1_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 1, 1); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_gamma_path = + dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void *batch_normalization_2_gamma = readTrainedWeights( + batch_normalization_2_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_beta_path = + dir_prefix + std::string("batch_normalization_2_beta.bin"); + void *batch_normalization_2_beta = readTrainedWeights( + batch_normalization_2_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_mean_path = + dir_prefix + std::string("batch_normalization_2_mean.bin"); + void *batch_normalization_2_mean = readTrainedWeights( + batch_normalization_2_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_variance_path = + dir_prefix + std::string("batch_normalization_2_variance.bin"); + void *batch_normalization_2_variance = readTrainedWeights( + batch_normalization_2_variance_path.c_str(), 0, 1, 64, 
1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_gamma_path = + dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void *batch_normalization_3_gamma = readTrainedWeights( + batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_beta_path = + dir_prefix + std::string("batch_normalization_3_beta.bin"); + void *batch_normalization_3_beta = readTrainedWeights( + batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_mean_path = + dir_prefix + std::string("batch_normalization_3_mean.bin"); + void *batch_normalization_3_mean = readTrainedWeights( + batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_variance_path = + dir_prefix + std::string("batch_normalization_3_variance.bin"); + void *batch_normalization_3_variance = readTrainedWeights( + batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_gamma_path = + dir_prefix + 
std::string("batch_normalization_4_gamma.bin"); + void *batch_normalization_4_gamma = readTrainedWeights( + batch_normalization_4_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_beta_path = + dir_prefix + std::string("batch_normalization_4_beta.bin"); + void *batch_normalization_4_beta = readTrainedWeights( + batch_normalization_4_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_mean_path = + dir_prefix + std::string("batch_normalization_4_mean.bin"); + void *batch_normalization_4_mean = readTrainedWeights( + batch_normalization_4_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_variance_path = + dir_prefix + std::string("batch_normalization_4_variance.bin"); + void *batch_normalization_4_variance = readTrainedWeights( + batch_normalization_4_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_gamma_path = + dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void *batch_normalization_5_gamma = readTrainedWeights( + batch_normalization_5_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_beta_path = + dir_prefix + std::string("batch_normalization_5_beta.bin"); + void *batch_normalization_5_beta = readTrainedWeights( + batch_normalization_5_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_mean_path = + dir_prefix + std::string("batch_normalization_5_mean.bin"); + void *batch_normalization_5_mean = readTrainedWeights( + batch_normalization_5_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_variance_path = + dir_prefix + std::string("batch_normalization_5_variance.bin"); + void *batch_normalization_5_variance = readTrainedWeights( + batch_normalization_5_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 64, 256, 1, 1); + std::string conv2d_6_b_path = 
dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_gamma_path = + dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void *batch_normalization_6_gamma = readTrainedWeights( + batch_normalization_6_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_beta_path = + dir_prefix + std::string("batch_normalization_6_beta.bin"); + void *batch_normalization_6_beta = readTrainedWeights( + batch_normalization_6_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_mean_path = + dir_prefix + std::string("batch_normalization_6_mean.bin"); + void *batch_normalization_6_mean = readTrainedWeights( + batch_normalization_6_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_variance_path = + dir_prefix + std::string("batch_normalization_6_variance.bin"); + void *batch_normalization_6_variance = readTrainedWeights( + batch_normalization_6_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_gamma_path = + dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void *batch_normalization_7_gamma = readTrainedWeights( + batch_normalization_7_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_beta_path = + dir_prefix + std::string("batch_normalization_7_beta.bin"); + void *batch_normalization_7_beta = readTrainedWeights( + batch_normalization_7_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_mean_path = + dir_prefix + std::string("batch_normalization_7_mean.bin"); + void *batch_normalization_7_mean = 
readTrainedWeights( + batch_normalization_7_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_variance_path = + dir_prefix + std::string("batch_normalization_7_variance.bin"); + void *batch_normalization_7_variance = readTrainedWeights( + batch_normalization_7_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_gamma_path = + dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void *batch_normalization_8_gamma = readTrainedWeights( + batch_normalization_8_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_beta_path = + dir_prefix + std::string("batch_normalization_8_beta.bin"); + void *batch_normalization_8_beta = readTrainedWeights( + batch_normalization_8_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_mean_path = + dir_prefix + std::string("batch_normalization_8_mean.bin"); + void *batch_normalization_8_mean = readTrainedWeights( + batch_normalization_8_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_variance_path = + dir_prefix + std::string("batch_normalization_8_variance.bin"); + void *batch_normalization_8_variance = readTrainedWeights( + batch_normalization_8_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 64, 256, 1, 1); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_gamma_path = + dir_prefix + 
std::string("batch_normalization_9_gamma.bin"); + void *batch_normalization_9_gamma = readTrainedWeights( + batch_normalization_9_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_beta_path = + dir_prefix + std::string("batch_normalization_9_beta.bin"); + void *batch_normalization_9_beta = readTrainedWeights( + batch_normalization_9_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_mean_path = + dir_prefix + std::string("batch_normalization_9_mean.bin"); + void *batch_normalization_9_mean = readTrainedWeights( + batch_normalization_9_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_variance_path = + dir_prefix + std::string("batch_normalization_9_variance.bin"); + void *batch_normalization_9_variance = readTrainedWeights( + batch_normalization_9_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_gamma_path = + dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void *batch_normalization_10_gamma = readTrainedWeights( + batch_normalization_10_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_beta_path = + dir_prefix + std::string("batch_normalization_10_beta.bin"); + void *batch_normalization_10_beta = readTrainedWeights( + batch_normalization_10_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_mean_path = + dir_prefix + std::string("batch_normalization_10_mean.bin"); + void *batch_normalization_10_mean = readTrainedWeights( + batch_normalization_10_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_variance_path = + dir_prefix + 
std::string("batch_normalization_10_variance.bin"); + void *batch_normalization_10_variance = readTrainedWeights( + batch_normalization_10_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_gamma_path = + dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void *batch_normalization_11_gamma = readTrainedWeights( + batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_beta_path = + dir_prefix + std::string("batch_normalization_11_beta.bin"); + void *batch_normalization_11_beta = readTrainedWeights( + batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_mean_path = + dir_prefix + std::string("batch_normalization_11_mean.bin"); + void *batch_normalization_11_mean = readTrainedWeights( + batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_variance_path = + dir_prefix + std::string("batch_normalization_11_variance.bin"); + void *batch_normalization_11_variance = readTrainedWeights( + batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 128, 256, 1, 1); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_gamma_path = + dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void *batch_normalization_12_gamma = readTrainedWeights( + 
batch_normalization_12_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_beta_path = + dir_prefix + std::string("batch_normalization_12_beta.bin"); + void *batch_normalization_12_beta = readTrainedWeights( + batch_normalization_12_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_mean_path = + dir_prefix + std::string("batch_normalization_12_mean.bin"); + void *batch_normalization_12_mean = readTrainedWeights( + batch_normalization_12_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_variance_path = + dir_prefix + std::string("batch_normalization_12_variance.bin"); + void *batch_normalization_12_variance = readTrainedWeights( + batch_normalization_12_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_gamma_path = + dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void *batch_normalization_13_gamma = readTrainedWeights( + batch_normalization_13_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_beta_path = + dir_prefix + std::string("batch_normalization_13_beta.bin"); + void *batch_normalization_13_beta = readTrainedWeights( + batch_normalization_13_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_mean_path = + dir_prefix + std::string("batch_normalization_13_mean.bin"); + void *batch_normalization_13_mean = readTrainedWeights( + batch_normalization_13_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_variance_path = + dir_prefix + std::string("batch_normalization_13_variance.bin"); + void *batch_normalization_13_variance = readTrainedWeights( + 
batch_normalization_13_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void *conv2d_14_b = + readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void *conv2d_15_w = + readTrainedWeights(conv2d_15_w_path.c_str(), 0, 512, 256, 1, 1); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void *conv2d_15_b = + readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_gamma_path = + dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void *batch_normalization_14_gamma = readTrainedWeights( + batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_beta_path = + dir_prefix + std::string("batch_normalization_14_beta.bin"); + void *batch_normalization_14_beta = readTrainedWeights( + batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_mean_path = + dir_prefix + std::string("batch_normalization_14_mean.bin"); + void *batch_normalization_14_mean = readTrainedWeights( + batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_variance_path = + dir_prefix + std::string("batch_normalization_14_variance.bin"); + void *batch_normalization_14_variance = readTrainedWeights( + batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_gamma_path = + dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void *batch_normalization_15_gamma = readTrainedWeights( + batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_beta_path = + dir_prefix + 
std::string("batch_normalization_15_beta.bin"); + void *batch_normalization_15_beta = readTrainedWeights( + batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_mean_path = + dir_prefix + std::string("batch_normalization_15_mean.bin"); + void *batch_normalization_15_mean = readTrainedWeights( + batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_variance_path = + dir_prefix + std::string("batch_normalization_15_variance.bin"); + void *batch_normalization_15_variance = readTrainedWeights( + batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void *conv2d_16_w = + readTrainedWeights(conv2d_16_w_path.c_str(), 0, 128, 512, 1, 1); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void *conv2d_16_b = + readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_gamma_path = + dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void *batch_normalization_16_gamma = readTrainedWeights( + batch_normalization_16_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_beta_path = + dir_prefix + std::string("batch_normalization_16_beta.bin"); + void *batch_normalization_16_beta = readTrainedWeights( + batch_normalization_16_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_mean_path = + dir_prefix + std::string("batch_normalization_16_mean.bin"); + void *batch_normalization_16_mean = readTrainedWeights( + batch_normalization_16_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_variance_path = + dir_prefix + std::string("batch_normalization_16_variance.bin"); + void *batch_normalization_16_variance = readTrainedWeights( + batch_normalization_16_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_17_w_path = dir_prefix + 
std::string("conv2d_17_w.bin"); + void *conv2d_17_w = + readTrainedWeights(conv2d_17_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void *conv2d_17_b = + readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_gamma_path = + dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void *batch_normalization_17_gamma = readTrainedWeights( + batch_normalization_17_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_beta_path = + dir_prefix + std::string("batch_normalization_17_beta.bin"); + void *batch_normalization_17_beta = readTrainedWeights( + batch_normalization_17_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_mean_path = + dir_prefix + std::string("batch_normalization_17_mean.bin"); + void *batch_normalization_17_mean = readTrainedWeights( + batch_normalization_17_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_variance_path = + dir_prefix + std::string("batch_normalization_17_variance.bin"); + void *batch_normalization_17_variance = readTrainedWeights( + batch_normalization_17_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void *conv2d_18_w = + readTrainedWeights(conv2d_18_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void *conv2d_18_b = + readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_gamma_path = + dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void *batch_normalization_18_gamma = readTrainedWeights( + batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_beta_path = + dir_prefix + std::string("batch_normalization_18_beta.bin"); + void *batch_normalization_18_beta = readTrainedWeights( + 
batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_mean_path = + dir_prefix + std::string("batch_normalization_18_mean.bin"); + void *batch_normalization_18_mean = readTrainedWeights( + batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_variance_path = + dir_prefix + std::string("batch_normalization_18_variance.bin"); + void *batch_normalization_18_variance = readTrainedWeights( + batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void *conv2d_19_w = + readTrainedWeights(conv2d_19_w_path.c_str(), 0, 128, 512, 1, 1); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void *conv2d_19_b = + readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_gamma_path = + dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void *batch_normalization_19_gamma = readTrainedWeights( + batch_normalization_19_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_beta_path = + dir_prefix + std::string("batch_normalization_19_beta.bin"); + void *batch_normalization_19_beta = readTrainedWeights( + batch_normalization_19_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_mean_path = + dir_prefix + std::string("batch_normalization_19_mean.bin"); + void *batch_normalization_19_mean = readTrainedWeights( + batch_normalization_19_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_variance_path = + dir_prefix + std::string("batch_normalization_19_variance.bin"); + void *batch_normalization_19_variance = readTrainedWeights( + batch_normalization_19_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void *conv2d_20_w = + readTrainedWeights(conv2d_20_w_path.c_str(), 0, 128, 128, 3, 3); + 
std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void *conv2d_20_b = + readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_gamma_path = + dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void *batch_normalization_20_gamma = readTrainedWeights( + batch_normalization_20_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_beta_path = + dir_prefix + std::string("batch_normalization_20_beta.bin"); + void *batch_normalization_20_beta = readTrainedWeights( + batch_normalization_20_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_mean_path = + dir_prefix + std::string("batch_normalization_20_mean.bin"); + void *batch_normalization_20_mean = readTrainedWeights( + batch_normalization_20_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_variance_path = + dir_prefix + std::string("batch_normalization_20_variance.bin"); + void *batch_normalization_20_variance = readTrainedWeights( + batch_normalization_20_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void *conv2d_21_w = + readTrainedWeights(conv2d_21_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void *conv2d_21_b = + readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_gamma_path = + dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void *batch_normalization_21_gamma = readTrainedWeights( + batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_beta_path = + dir_prefix + std::string("batch_normalization_21_beta.bin"); + void *batch_normalization_21_beta = readTrainedWeights( + batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_mean_path = + dir_prefix + 
std::string("batch_normalization_21_mean.bin"); + void *batch_normalization_21_mean = readTrainedWeights( + batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_variance_path = + dir_prefix + std::string("batch_normalization_21_variance.bin"); + void *batch_normalization_21_variance = readTrainedWeights( + batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_22_w_path = dir_prefix + std::string("conv2d_22_w.bin"); + void *conv2d_22_w = + readTrainedWeights(conv2d_22_w_path.c_str(), 0, 128, 512, 1, 1); + std::string conv2d_22_b_path = dir_prefix + std::string("conv2d_22_b.bin"); + void *conv2d_22_b = + readTrainedWeights(conv2d_22_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_gamma_path = + dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void *batch_normalization_22_gamma = readTrainedWeights( + batch_normalization_22_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_beta_path = + dir_prefix + std::string("batch_normalization_22_beta.bin"); + void *batch_normalization_22_beta = readTrainedWeights( + batch_normalization_22_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_mean_path = + dir_prefix + std::string("batch_normalization_22_mean.bin"); + void *batch_normalization_22_mean = readTrainedWeights( + batch_normalization_22_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_variance_path = + dir_prefix + std::string("batch_normalization_22_variance.bin"); + void *batch_normalization_22_variance = readTrainedWeights( + batch_normalization_22_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_23_w_path = dir_prefix + std::string("conv2d_23_w.bin"); + void *conv2d_23_w = + readTrainedWeights(conv2d_23_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_23_b_path = dir_prefix + std::string("conv2d_23_b.bin"); + void *conv2d_23_b = + 
readTrainedWeights(conv2d_23_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_gamma_path = + dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void *batch_normalization_23_gamma = readTrainedWeights( + batch_normalization_23_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_beta_path = + dir_prefix + std::string("batch_normalization_23_beta.bin"); + void *batch_normalization_23_beta = readTrainedWeights( + batch_normalization_23_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_mean_path = + dir_prefix + std::string("batch_normalization_23_mean.bin"); + void *batch_normalization_23_mean = readTrainedWeights( + batch_normalization_23_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_variance_path = + dir_prefix + std::string("batch_normalization_23_variance.bin"); + void *batch_normalization_23_variance = readTrainedWeights( + batch_normalization_23_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_24_w_path = dir_prefix + std::string("conv2d_24_w.bin"); + void *conv2d_24_w = + readTrainedWeights(conv2d_24_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_24_b_path = dir_prefix + std::string("conv2d_24_b.bin"); + void *conv2d_24_b = + readTrainedWeights(conv2d_24_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_gamma_path = + dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void *batch_normalization_24_gamma = readTrainedWeights( + batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_beta_path = + dir_prefix + std::string("batch_normalization_24_beta.bin"); + void *batch_normalization_24_beta = readTrainedWeights( + batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_mean_path = + dir_prefix + std::string("batch_normalization_24_mean.bin"); + void *batch_normalization_24_mean = readTrainedWeights( + 
batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_variance_path = + dir_prefix + std::string("batch_normalization_24_variance.bin"); + void *batch_normalization_24_variance = readTrainedWeights( + batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_25_w_path = dir_prefix + std::string("conv2d_25_w.bin"); + void *conv2d_25_w = + readTrainedWeights(conv2d_25_w_path.c_str(), 0, 256, 512, 1, 1); + std::string conv2d_25_b_path = dir_prefix + std::string("conv2d_25_b.bin"); + void *conv2d_25_b = + readTrainedWeights(conv2d_25_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_gamma_path = + dir_prefix + std::string("batch_normalization_25_gamma.bin"); + void *batch_normalization_25_gamma = readTrainedWeights( + batch_normalization_25_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_beta_path = + dir_prefix + std::string("batch_normalization_25_beta.bin"); + void *batch_normalization_25_beta = readTrainedWeights( + batch_normalization_25_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_mean_path = + dir_prefix + std::string("batch_normalization_25_mean.bin"); + void *batch_normalization_25_mean = readTrainedWeights( + batch_normalization_25_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_variance_path = + dir_prefix + std::string("batch_normalization_25_variance.bin"); + void *batch_normalization_25_variance = readTrainedWeights( + batch_normalization_25_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_26_w_path = dir_prefix + std::string("conv2d_26_w.bin"); + void *conv2d_26_w = + readTrainedWeights(conv2d_26_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_26_b_path = dir_prefix + std::string("conv2d_26_b.bin"); + void *conv2d_26_b = + readTrainedWeights(conv2d_26_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_gamma_path = + dir_prefix + 
std::string("batch_normalization_26_gamma.bin"); + void *batch_normalization_26_gamma = readTrainedWeights( + batch_normalization_26_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_beta_path = + dir_prefix + std::string("batch_normalization_26_beta.bin"); + void *batch_normalization_26_beta = readTrainedWeights( + batch_normalization_26_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_mean_path = + dir_prefix + std::string("batch_normalization_26_mean.bin"); + void *batch_normalization_26_mean = readTrainedWeights( + batch_normalization_26_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_variance_path = + dir_prefix + std::string("batch_normalization_26_variance.bin"); + void *batch_normalization_26_variance = readTrainedWeights( + batch_normalization_26_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_27_w_path = dir_prefix + std::string("conv2d_27_w.bin"); + void *conv2d_27_w = + readTrainedWeights(conv2d_27_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_27_b_path = dir_prefix + std::string("conv2d_27_b.bin"); + void *conv2d_27_b = + readTrainedWeights(conv2d_27_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_28_w_path = dir_prefix + std::string("conv2d_28_w.bin"); + void *conv2d_28_w = + readTrainedWeights(conv2d_28_w_path.c_str(), 0, 1024, 512, 1, 1); + std::string conv2d_28_b_path = dir_prefix + std::string("conv2d_28_b.bin"); + void *conv2d_28_b = + readTrainedWeights(conv2d_28_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_gamma_path = + dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void *batch_normalization_27_gamma = readTrainedWeights( + batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_beta_path = + dir_prefix + std::string("batch_normalization_27_beta.bin"); + void *batch_normalization_27_beta = readTrainedWeights( + 
batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_mean_path = + dir_prefix + std::string("batch_normalization_27_mean.bin"); + void *batch_normalization_27_mean = readTrainedWeights( + batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_variance_path = + dir_prefix + std::string("batch_normalization_27_variance.bin"); + void *batch_normalization_27_variance = readTrainedWeights( + batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_gamma_path = + dir_prefix + std::string("batch_normalization_28_gamma.bin"); + void *batch_normalization_28_gamma = readTrainedWeights( + batch_normalization_28_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_beta_path = + dir_prefix + std::string("batch_normalization_28_beta.bin"); + void *batch_normalization_28_beta = readTrainedWeights( + batch_normalization_28_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_mean_path = + dir_prefix + std::string("batch_normalization_28_mean.bin"); + void *batch_normalization_28_mean = readTrainedWeights( + batch_normalization_28_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_variance_path = + dir_prefix + std::string("batch_normalization_28_variance.bin"); + void *batch_normalization_28_variance = readTrainedWeights( + batch_normalization_28_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_29_w_path = dir_prefix + std::string("conv2d_29_w.bin"); + void *conv2d_29_w = + readTrainedWeights(conv2d_29_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_29_b_path = dir_prefix + std::string("conv2d_29_b.bin"); + void *conv2d_29_b = + readTrainedWeights(conv2d_29_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_gamma_path = + dir_prefix + std::string("batch_normalization_29_gamma.bin"); + void *batch_normalization_29_gamma = 
readTrainedWeights( + batch_normalization_29_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_beta_path = + dir_prefix + std::string("batch_normalization_29_beta.bin"); + void *batch_normalization_29_beta = readTrainedWeights( + batch_normalization_29_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_mean_path = + dir_prefix + std::string("batch_normalization_29_mean.bin"); + void *batch_normalization_29_mean = readTrainedWeights( + batch_normalization_29_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_variance_path = + dir_prefix + std::string("batch_normalization_29_variance.bin"); + void *batch_normalization_29_variance = readTrainedWeights( + batch_normalization_29_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_30_w_path = dir_prefix + std::string("conv2d_30_w.bin"); + void *conv2d_30_w = + readTrainedWeights(conv2d_30_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_30_b_path = dir_prefix + std::string("conv2d_30_b.bin"); + void *conv2d_30_b = + readTrainedWeights(conv2d_30_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_gamma_path = + dir_prefix + std::string("batch_normalization_30_gamma.bin"); + void *batch_normalization_30_gamma = readTrainedWeights( + batch_normalization_30_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_beta_path = + dir_prefix + std::string("batch_normalization_30_beta.bin"); + void *batch_normalization_30_beta = readTrainedWeights( + batch_normalization_30_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_mean_path = + dir_prefix + std::string("batch_normalization_30_mean.bin"); + void *batch_normalization_30_mean = readTrainedWeights( + batch_normalization_30_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_variance_path = + dir_prefix + std::string("batch_normalization_30_variance.bin"); + void *batch_normalization_30_variance = 
readTrainedWeights( + batch_normalization_30_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_31_w_path = dir_prefix + std::string("conv2d_31_w.bin"); + void *conv2d_31_w = + readTrainedWeights(conv2d_31_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_31_b_path = dir_prefix + std::string("conv2d_31_b.bin"); + void *conv2d_31_b = + readTrainedWeights(conv2d_31_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_gamma_path = + dir_prefix + std::string("batch_normalization_31_gamma.bin"); + void *batch_normalization_31_gamma = readTrainedWeights( + batch_normalization_31_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_beta_path = + dir_prefix + std::string("batch_normalization_31_beta.bin"); + void *batch_normalization_31_beta = readTrainedWeights( + batch_normalization_31_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_mean_path = + dir_prefix + std::string("batch_normalization_31_mean.bin"); + void *batch_normalization_31_mean = readTrainedWeights( + batch_normalization_31_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_variance_path = + dir_prefix + std::string("batch_normalization_31_variance.bin"); + void *batch_normalization_31_variance = readTrainedWeights( + batch_normalization_31_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_32_w_path = dir_prefix + std::string("conv2d_32_w.bin"); + void *conv2d_32_w = + readTrainedWeights(conv2d_32_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_32_b_path = dir_prefix + std::string("conv2d_32_b.bin"); + void *conv2d_32_b = + readTrainedWeights(conv2d_32_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_gamma_path = + dir_prefix + std::string("batch_normalization_32_gamma.bin"); + void *batch_normalization_32_gamma = readTrainedWeights( + batch_normalization_32_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_beta_path = + 
dir_prefix + std::string("batch_normalization_32_beta.bin"); + void *batch_normalization_32_beta = readTrainedWeights( + batch_normalization_32_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_mean_path = + dir_prefix + std::string("batch_normalization_32_mean.bin"); + void *batch_normalization_32_mean = readTrainedWeights( + batch_normalization_32_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_variance_path = + dir_prefix + std::string("batch_normalization_32_variance.bin"); + void *batch_normalization_32_variance = readTrainedWeights( + batch_normalization_32_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_33_w_path = dir_prefix + std::string("conv2d_33_w.bin"); + void *conv2d_33_w = + readTrainedWeights(conv2d_33_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_33_b_path = dir_prefix + std::string("conv2d_33_b.bin"); + void *conv2d_33_b = + readTrainedWeights(conv2d_33_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_gamma_path = + dir_prefix + std::string("batch_normalization_33_gamma.bin"); + void *batch_normalization_33_gamma = readTrainedWeights( + batch_normalization_33_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_beta_path = + dir_prefix + std::string("batch_normalization_33_beta.bin"); + void *batch_normalization_33_beta = readTrainedWeights( + batch_normalization_33_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_mean_path = + dir_prefix + std::string("batch_normalization_33_mean.bin"); + void *batch_normalization_33_mean = readTrainedWeights( + batch_normalization_33_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_variance_path = + dir_prefix + std::string("batch_normalization_33_variance.bin"); + void *batch_normalization_33_variance = readTrainedWeights( + batch_normalization_33_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_34_w_path = dir_prefix + 
std::string("conv2d_34_w.bin"); + void *conv2d_34_w = + readTrainedWeights(conv2d_34_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_34_b_path = dir_prefix + std::string("conv2d_34_b.bin"); + void *conv2d_34_b = + readTrainedWeights(conv2d_34_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_gamma_path = + dir_prefix + std::string("batch_normalization_34_gamma.bin"); + void *batch_normalization_34_gamma = readTrainedWeights( + batch_normalization_34_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_beta_path = + dir_prefix + std::string("batch_normalization_34_beta.bin"); + void *batch_normalization_34_beta = readTrainedWeights( + batch_normalization_34_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_mean_path = + dir_prefix + std::string("batch_normalization_34_mean.bin"); + void *batch_normalization_34_mean = readTrainedWeights( + batch_normalization_34_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_variance_path = + dir_prefix + std::string("batch_normalization_34_variance.bin"); + void *batch_normalization_34_variance = readTrainedWeights( + batch_normalization_34_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_35_w_path = dir_prefix + std::string("conv2d_35_w.bin"); + void *conv2d_35_w = + readTrainedWeights(conv2d_35_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_35_b_path = dir_prefix + std::string("conv2d_35_b.bin"); + void *conv2d_35_b = + readTrainedWeights(conv2d_35_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_gamma_path = + dir_prefix + std::string("batch_normalization_35_gamma.bin"); + void *batch_normalization_35_gamma = readTrainedWeights( + batch_normalization_35_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_beta_path = + dir_prefix + std::string("batch_normalization_35_beta.bin"); + void *batch_normalization_35_beta = readTrainedWeights( + 
batch_normalization_35_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_mean_path = + dir_prefix + std::string("batch_normalization_35_mean.bin"); + void *batch_normalization_35_mean = readTrainedWeights( + batch_normalization_35_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_variance_path = + dir_prefix + std::string("batch_normalization_35_variance.bin"); + void *batch_normalization_35_variance = readTrainedWeights( + batch_normalization_35_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_36_w_path = dir_prefix + std::string("conv2d_36_w.bin"); + void *conv2d_36_w = + readTrainedWeights(conv2d_36_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_36_b_path = dir_prefix + std::string("conv2d_36_b.bin"); + void *conv2d_36_b = + readTrainedWeights(conv2d_36_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_gamma_path = + dir_prefix + std::string("batch_normalization_36_gamma.bin"); + void *batch_normalization_36_gamma = readTrainedWeights( + batch_normalization_36_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_beta_path = + dir_prefix + std::string("batch_normalization_36_beta.bin"); + void *batch_normalization_36_beta = readTrainedWeights( + batch_normalization_36_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_mean_path = + dir_prefix + std::string("batch_normalization_36_mean.bin"); + void *batch_normalization_36_mean = readTrainedWeights( + batch_normalization_36_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_variance_path = + dir_prefix + std::string("batch_normalization_36_variance.bin"); + void *batch_normalization_36_variance = readTrainedWeights( + batch_normalization_36_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_37_w_path = dir_prefix + std::string("conv2d_37_w.bin"); + void *conv2d_37_w = + readTrainedWeights(conv2d_37_w_path.c_str(), 0, 1024, 256, 1, 1); + 
std::string conv2d_37_b_path = dir_prefix + std::string("conv2d_37_b.bin"); + void *conv2d_37_b = + readTrainedWeights(conv2d_37_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_gamma_path = + dir_prefix + std::string("batch_normalization_37_gamma.bin"); + void *batch_normalization_37_gamma = readTrainedWeights( + batch_normalization_37_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_beta_path = + dir_prefix + std::string("batch_normalization_37_beta.bin"); + void *batch_normalization_37_beta = readTrainedWeights( + batch_normalization_37_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_mean_path = + dir_prefix + std::string("batch_normalization_37_mean.bin"); + void *batch_normalization_37_mean = readTrainedWeights( + batch_normalization_37_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_variance_path = + dir_prefix + std::string("batch_normalization_37_variance.bin"); + void *batch_normalization_37_variance = readTrainedWeights( + batch_normalization_37_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_38_w_path = dir_prefix + std::string("conv2d_38_w.bin"); + void *conv2d_38_w = + readTrainedWeights(conv2d_38_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_38_b_path = dir_prefix + std::string("conv2d_38_b.bin"); + void *conv2d_38_b = + readTrainedWeights(conv2d_38_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_gamma_path = + dir_prefix + std::string("batch_normalization_38_gamma.bin"); + void *batch_normalization_38_gamma = readTrainedWeights( + batch_normalization_38_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_beta_path = + dir_prefix + std::string("batch_normalization_38_beta.bin"); + void *batch_normalization_38_beta = readTrainedWeights( + batch_normalization_38_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_mean_path = + dir_prefix + 
std::string("batch_normalization_38_mean.bin"); + void *batch_normalization_38_mean = readTrainedWeights( + batch_normalization_38_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_variance_path = + dir_prefix + std::string("batch_normalization_38_variance.bin"); + void *batch_normalization_38_variance = readTrainedWeights( + batch_normalization_38_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_39_w_path = dir_prefix + std::string("conv2d_39_w.bin"); + void *conv2d_39_w = + readTrainedWeights(conv2d_39_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_39_b_path = dir_prefix + std::string("conv2d_39_b.bin"); + void *conv2d_39_b = + readTrainedWeights(conv2d_39_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_gamma_path = + dir_prefix + std::string("batch_normalization_39_gamma.bin"); + void *batch_normalization_39_gamma = readTrainedWeights( + batch_normalization_39_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_beta_path = + dir_prefix + std::string("batch_normalization_39_beta.bin"); + void *batch_normalization_39_beta = readTrainedWeights( + batch_normalization_39_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_mean_path = + dir_prefix + std::string("batch_normalization_39_mean.bin"); + void *batch_normalization_39_mean = readTrainedWeights( + batch_normalization_39_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_variance_path = + dir_prefix + std::string("batch_normalization_39_variance.bin"); + void *batch_normalization_39_variance = readTrainedWeights( + batch_normalization_39_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_40_w_path = dir_prefix + std::string("conv2d_40_w.bin"); + void *conv2d_40_w = + readTrainedWeights(conv2d_40_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_40_b_path = dir_prefix + std::string("conv2d_40_b.bin"); + void *conv2d_40_b = + 
readTrainedWeights(conv2d_40_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_gamma_path = + dir_prefix + std::string("batch_normalization_40_gamma.bin"); + void *batch_normalization_40_gamma = readTrainedWeights( + batch_normalization_40_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_beta_path = + dir_prefix + std::string("batch_normalization_40_beta.bin"); + void *batch_normalization_40_beta = readTrainedWeights( + batch_normalization_40_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_mean_path = + dir_prefix + std::string("batch_normalization_40_mean.bin"); + void *batch_normalization_40_mean = readTrainedWeights( + batch_normalization_40_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_variance_path = + dir_prefix + std::string("batch_normalization_40_variance.bin"); + void *batch_normalization_40_variance = readTrainedWeights( + batch_normalization_40_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_41_w_path = dir_prefix + std::string("conv2d_41_w.bin"); + void *conv2d_41_w = + readTrainedWeights(conv2d_41_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_41_b_path = dir_prefix + std::string("conv2d_41_b.bin"); + void *conv2d_41_b = + readTrainedWeights(conv2d_41_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_gamma_path = + dir_prefix + std::string("batch_normalization_41_gamma.bin"); + void *batch_normalization_41_gamma = readTrainedWeights( + batch_normalization_41_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_beta_path = + dir_prefix + std::string("batch_normalization_41_beta.bin"); + void *batch_normalization_41_beta = readTrainedWeights( + batch_normalization_41_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_mean_path = + dir_prefix + std::string("batch_normalization_41_mean.bin"); + void *batch_normalization_41_mean = readTrainedWeights( + 
batch_normalization_41_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_variance_path = + dir_prefix + std::string("batch_normalization_41_variance.bin"); + void *batch_normalization_41_variance = readTrainedWeights( + batch_normalization_41_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_42_w_path = dir_prefix + std::string("conv2d_42_w.bin"); + void *conv2d_42_w = + readTrainedWeights(conv2d_42_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_42_b_path = dir_prefix + std::string("conv2d_42_b.bin"); + void *conv2d_42_b = + readTrainedWeights(conv2d_42_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_gamma_path = + dir_prefix + std::string("batch_normalization_42_gamma.bin"); + void *batch_normalization_42_gamma = readTrainedWeights( + batch_normalization_42_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_beta_path = + dir_prefix + std::string("batch_normalization_42_beta.bin"); + void *batch_normalization_42_beta = readTrainedWeights( + batch_normalization_42_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_mean_path = + dir_prefix + std::string("batch_normalization_42_mean.bin"); + void *batch_normalization_42_mean = readTrainedWeights( + batch_normalization_42_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_variance_path = + dir_prefix + std::string("batch_normalization_42_variance.bin"); + void *batch_normalization_42_variance = readTrainedWeights( + batch_normalization_42_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_43_w_path = dir_prefix + std::string("conv2d_43_w.bin"); + void *conv2d_43_w = + readTrainedWeights(conv2d_43_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_43_b_path = dir_prefix + std::string("conv2d_43_b.bin"); + void *conv2d_43_b = + readTrainedWeights(conv2d_43_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_gamma_path = + dir_prefix + 
std::string("batch_normalization_43_gamma.bin"); + void *batch_normalization_43_gamma = readTrainedWeights( + batch_normalization_43_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_beta_path = + dir_prefix + std::string("batch_normalization_43_beta.bin"); + void *batch_normalization_43_beta = readTrainedWeights( + batch_normalization_43_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_mean_path = + dir_prefix + std::string("batch_normalization_43_mean.bin"); + void *batch_normalization_43_mean = readTrainedWeights( + batch_normalization_43_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_variance_path = + dir_prefix + std::string("batch_normalization_43_variance.bin"); + void *batch_normalization_43_variance = readTrainedWeights( + batch_normalization_43_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_44_w_path = dir_prefix + std::string("conv2d_44_w.bin"); + void *conv2d_44_w = + readTrainedWeights(conv2d_44_w_path.c_str(), 0, 512, 1024, 1, 1); + std::string conv2d_44_b_path = dir_prefix + std::string("conv2d_44_b.bin"); + void *conv2d_44_b = + readTrainedWeights(conv2d_44_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_gamma_path = + dir_prefix + std::string("batch_normalization_44_gamma.bin"); + void *batch_normalization_44_gamma = readTrainedWeights( + batch_normalization_44_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_beta_path = + dir_prefix + std::string("batch_normalization_44_beta.bin"); + void *batch_normalization_44_beta = readTrainedWeights( + batch_normalization_44_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_mean_path = + dir_prefix + std::string("batch_normalization_44_mean.bin"); + void *batch_normalization_44_mean = readTrainedWeights( + batch_normalization_44_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_variance_path = + dir_prefix + 
std::string("batch_normalization_44_variance.bin"); + void *batch_normalization_44_variance = readTrainedWeights( + batch_normalization_44_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_45_w_path = dir_prefix + std::string("conv2d_45_w.bin"); + void *conv2d_45_w = + readTrainedWeights(conv2d_45_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_45_b_path = dir_prefix + std::string("conv2d_45_b.bin"); + void *conv2d_45_b = + readTrainedWeights(conv2d_45_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_gamma_path = + dir_prefix + std::string("batch_normalization_45_gamma.bin"); + void *batch_normalization_45_gamma = readTrainedWeights( + batch_normalization_45_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_beta_path = + dir_prefix + std::string("batch_normalization_45_beta.bin"); + void *batch_normalization_45_beta = readTrainedWeights( + batch_normalization_45_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_mean_path = + dir_prefix + std::string("batch_normalization_45_mean.bin"); + void *batch_normalization_45_mean = readTrainedWeights( + batch_normalization_45_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_variance_path = + dir_prefix + std::string("batch_normalization_45_variance.bin"); + void *batch_normalization_45_variance = readTrainedWeights( + batch_normalization_45_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_46_w_path = dir_prefix + std::string("conv2d_46_w.bin"); + void *conv2d_46_w = + readTrainedWeights(conv2d_46_w_path.c_str(), 0, 2048, 512, 1, 1); + std::string conv2d_46_b_path = dir_prefix + std::string("conv2d_46_b.bin"); + void *conv2d_46_b = + readTrainedWeights(conv2d_46_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string conv2d_47_w_path = dir_prefix + std::string("conv2d_47_w.bin"); + void *conv2d_47_w = + readTrainedWeights(conv2d_47_w_path.c_str(), 0, 2048, 1024, 1, 1); + std::string conv2d_47_b_path = 
dir_prefix + std::string("conv2d_47_b.bin"); + void *conv2d_47_b = + readTrainedWeights(conv2d_47_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_gamma_path = + dir_prefix + std::string("batch_normalization_46_gamma.bin"); + void *batch_normalization_46_gamma = readTrainedWeights( + batch_normalization_46_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_beta_path = + dir_prefix + std::string("batch_normalization_46_beta.bin"); + void *batch_normalization_46_beta = readTrainedWeights( + batch_normalization_46_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_mean_path = + dir_prefix + std::string("batch_normalization_46_mean.bin"); + void *batch_normalization_46_mean = readTrainedWeights( + batch_normalization_46_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_variance_path = + dir_prefix + std::string("batch_normalization_46_variance.bin"); + void *batch_normalization_46_variance = readTrainedWeights( + batch_normalization_46_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_gamma_path = + dir_prefix + std::string("batch_normalization_47_gamma.bin"); + void *batch_normalization_47_gamma = readTrainedWeights( + batch_normalization_47_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_beta_path = + dir_prefix + std::string("batch_normalization_47_beta.bin"); + void *batch_normalization_47_beta = readTrainedWeights( + batch_normalization_47_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_mean_path = + dir_prefix + std::string("batch_normalization_47_mean.bin"); + void *batch_normalization_47_mean = readTrainedWeights( + batch_normalization_47_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_variance_path = + dir_prefix + std::string("batch_normalization_47_variance.bin"); + void *batch_normalization_47_variance = readTrainedWeights( + 
batch_normalization_47_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string conv2d_48_w_path = dir_prefix + std::string("conv2d_48_w.bin"); + void *conv2d_48_w = + readTrainedWeights(conv2d_48_w_path.c_str(), 0, 512, 2048, 1, 1); + std::string conv2d_48_b_path = dir_prefix + std::string("conv2d_48_b.bin"); + void *conv2d_48_b = + readTrainedWeights(conv2d_48_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_gamma_path = + dir_prefix + std::string("batch_normalization_48_gamma.bin"); + void *batch_normalization_48_gamma = readTrainedWeights( + batch_normalization_48_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_beta_path = + dir_prefix + std::string("batch_normalization_48_beta.bin"); + void *batch_normalization_48_beta = readTrainedWeights( + batch_normalization_48_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_mean_path = + dir_prefix + std::string("batch_normalization_48_mean.bin"); + void *batch_normalization_48_mean = readTrainedWeights( + batch_normalization_48_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_variance_path = + dir_prefix + std::string("batch_normalization_48_variance.bin"); + void *batch_normalization_48_variance = readTrainedWeights( + batch_normalization_48_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_49_w_path = dir_prefix + std::string("conv2d_49_w.bin"); + void *conv2d_49_w = + readTrainedWeights(conv2d_49_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_49_b_path = dir_prefix + std::string("conv2d_49_b.bin"); + void *conv2d_49_b = + readTrainedWeights(conv2d_49_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_gamma_path = + dir_prefix + std::string("batch_normalization_49_gamma.bin"); + void *batch_normalization_49_gamma = readTrainedWeights( + batch_normalization_49_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_beta_path = + dir_prefix + 
std::string("batch_normalization_49_beta.bin"); + void *batch_normalization_49_beta = readTrainedWeights( + batch_normalization_49_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_mean_path = + dir_prefix + std::string("batch_normalization_49_mean.bin"); + void *batch_normalization_49_mean = readTrainedWeights( + batch_normalization_49_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_variance_path = + dir_prefix + std::string("batch_normalization_49_variance.bin"); + void *batch_normalization_49_variance = readTrainedWeights( + batch_normalization_49_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_50_w_path = dir_prefix + std::string("conv2d_50_w.bin"); + void *conv2d_50_w = + readTrainedWeights(conv2d_50_w_path.c_str(), 0, 2048, 512, 1, 1); + std::string conv2d_50_b_path = dir_prefix + std::string("conv2d_50_b.bin"); + void *conv2d_50_b = + readTrainedWeights(conv2d_50_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_gamma_path = + dir_prefix + std::string("batch_normalization_50_gamma.bin"); + void *batch_normalization_50_gamma = readTrainedWeights( + batch_normalization_50_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_beta_path = + dir_prefix + std::string("batch_normalization_50_beta.bin"); + void *batch_normalization_50_beta = readTrainedWeights( + batch_normalization_50_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_mean_path = + dir_prefix + std::string("batch_normalization_50_mean.bin"); + void *batch_normalization_50_mean = readTrainedWeights( + batch_normalization_50_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_variance_path = + dir_prefix + std::string("batch_normalization_50_variance.bin"); + void *batch_normalization_50_variance = readTrainedWeights( + batch_normalization_50_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string conv2d_51_w_path = dir_prefix + 
std::string("conv2d_51_w.bin"); + void *conv2d_51_w = + readTrainedWeights(conv2d_51_w_path.c_str(), 0, 512, 2048, 1, 1); + std::string conv2d_51_b_path = dir_prefix + std::string("conv2d_51_b.bin"); + void *conv2d_51_b = + readTrainedWeights(conv2d_51_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_gamma_path = + dir_prefix + std::string("batch_normalization_51_gamma.bin"); + void *batch_normalization_51_gamma = readTrainedWeights( + batch_normalization_51_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_beta_path = + dir_prefix + std::string("batch_normalization_51_beta.bin"); + void *batch_normalization_51_beta = readTrainedWeights( + batch_normalization_51_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_mean_path = + dir_prefix + std::string("batch_normalization_51_mean.bin"); + void *batch_normalization_51_mean = readTrainedWeights( + batch_normalization_51_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_variance_path = + dir_prefix + std::string("batch_normalization_51_variance.bin"); + void *batch_normalization_51_variance = readTrainedWeights( + batch_normalization_51_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_52_w_path = dir_prefix + std::string("conv2d_52_w.bin"); + void *conv2d_52_w = + readTrainedWeights(conv2d_52_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_52_b_path = dir_prefix + std::string("conv2d_52_b.bin"); + void *conv2d_52_b = + readTrainedWeights(conv2d_52_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_gamma_path = + dir_prefix + std::string("batch_normalization_52_gamma.bin"); + void *batch_normalization_52_gamma = readTrainedWeights( + batch_normalization_52_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_beta_path = + dir_prefix + std::string("batch_normalization_52_beta.bin"); + void *batch_normalization_52_beta = readTrainedWeights( + 
batch_normalization_52_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_mean_path = + dir_prefix + std::string("batch_normalization_52_mean.bin"); + void *batch_normalization_52_mean = readTrainedWeights( + batch_normalization_52_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_variance_path = + dir_prefix + std::string("batch_normalization_52_variance.bin"); + void *batch_normalization_52_variance = readTrainedWeights( + batch_normalization_52_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_53_w_path = dir_prefix + std::string("conv2d_53_w.bin"); + void *conv2d_53_w = + readTrainedWeights(conv2d_53_w_path.c_str(), 0, 2048, 512, 1, 1); + std::string conv2d_53_b_path = dir_prefix + std::string("conv2d_53_b.bin"); + void *conv2d_53_b = + readTrainedWeights(conv2d_53_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_gamma_path = + dir_prefix + std::string("batch_normalization_53_gamma.bin"); + void *batch_normalization_53_gamma = readTrainedWeights( + batch_normalization_53_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_beta_path = + dir_prefix + std::string("batch_normalization_53_beta.bin"); + void *batch_normalization_53_beta = readTrainedWeights( + batch_normalization_53_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_mean_path = + dir_prefix + std::string("batch_normalization_53_mean.bin"); + void *batch_normalization_53_mean = readTrainedWeights( + batch_normalization_53_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_variance_path = + dir_prefix + std::string("batch_normalization_53_variance.bin"); + void *batch_normalization_53_variance = readTrainedWeights( + batch_normalization_53_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 1000); + 
std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = + readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1); - std::string dir_prefix = model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,7,7); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); - void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); - void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); - void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); - void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,1,1); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_gamma_path = dir_prefix + 
std::string("batch_normalization_2_gamma.bin"); - void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); - void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); - void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); - void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); - void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); - void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); - void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); - void* batch_normalization_3_variance = 
readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); - void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); - void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); - void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); - void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin"); - void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); - void* batch_normalization_5_beta = 
readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); - void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); - void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,64,256,1,1); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); - void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); - void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); - void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); - void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = 
readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); - void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); - void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); - void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); - void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); - void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); - void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); - void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_variance_path = dir_prefix + 
std::string("batch_normalization_8_variance.bin"); - void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,64,256,1,1); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); - void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); - void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); - void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); - void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); - void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_beta_path = dir_prefix + 
std::string("batch_normalization_10_beta.bin"); - void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); - void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); - void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); - void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); - void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); - void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); - void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 
0,128,256,1,1); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); - void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); - void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); - void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); - void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin"); - void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); - void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); - void* 
batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin"); - void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); - void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); - void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,512,256,1,1); - std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); - void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); - void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); - void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); - void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_variance_path = dir_prefix + std::string("batch_normalization_14_variance.bin"); - void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_gamma_path = dir_prefix + 
std::string("batch_normalization_15_gamma.bin"); - void* batch_normalization_15_gamma = readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); - void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); - void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); - void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); - void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,128,512,1,1); - std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); - void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); - void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); - void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); - void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_variance_path = dir_prefix + std::string("batch_normalization_16_variance.bin"); - void* batch_normalization_16_variance 
= readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); - void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); - void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); - void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); - void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); - void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); - void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); - void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); - void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); - void* batch_normalization_18_gamma = readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); - void* 
batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); - void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); - void* batch_normalization_18_variance = readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); - void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,128,512,1,1); - std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); - void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); - void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); - void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); - void* batch_normalization_19_mean = readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); - void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); - void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_20_b_path = 
dir_prefix + std::string("conv2d_20_b.bin"); - void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); - void* batch_normalization_20_gamma = readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); - void* batch_normalization_20_beta = readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); - void* batch_normalization_20_mean = readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); - void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); - void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); - void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); - void* batch_normalization_21_gamma = readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); - void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); - void* batch_normalization_21_mean = 
readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); - void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_22_w_path = dir_prefix + std::string("conv2d_22_w.bin"); - void* conv2d_22_w = readTrainedWeights(conv2d_22_w_path.c_str(), 0,128,512,1,1); - std::string conv2d_22_b_path = dir_prefix + std::string("conv2d_22_b.bin"); - void* conv2d_22_b = readTrainedWeights(conv2d_22_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); - void* batch_normalization_22_gamma = readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); - void* batch_normalization_22_beta = readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); - void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); - void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_23_w_path = dir_prefix + std::string("conv2d_23_w.bin"); - void* conv2d_23_w = readTrainedWeights(conv2d_23_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_23_b_path = dir_prefix + std::string("conv2d_23_b.bin"); - void* conv2d_23_b = readTrainedWeights(conv2d_23_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); - void* 
batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); - void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); - void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); - void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_24_w_path = dir_prefix + std::string("conv2d_24_w.bin"); - void* conv2d_24_w = readTrainedWeights(conv2d_24_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_24_b_path = dir_prefix + std::string("conv2d_24_b.bin"); - void* conv2d_24_b = readTrainedWeights(conv2d_24_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); - void* batch_normalization_24_gamma = readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); - void* batch_normalization_24_beta = readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); - void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); - void* batch_normalization_24_variance = 
readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_25_w_path = dir_prefix + std::string("conv2d_25_w.bin"); - void* conv2d_25_w = readTrainedWeights(conv2d_25_w_path.c_str(), 0,256,512,1,1); - std::string conv2d_25_b_path = dir_prefix + std::string("conv2d_25_b.bin"); - void* conv2d_25_b = readTrainedWeights(conv2d_25_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); - void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_beta_path = dir_prefix + std::string("batch_normalization_25_beta.bin"); - void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); - void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); - void* batch_normalization_25_variance = readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_26_w_path = dir_prefix + std::string("conv2d_26_w.bin"); - void* conv2d_26_w = readTrainedWeights(conv2d_26_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_26_b_path = dir_prefix + std::string("conv2d_26_b.bin"); - void* conv2d_26_b = readTrainedWeights(conv2d_26_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); - void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); - void* 
batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); - void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); - void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_27_w_path = dir_prefix + std::string("conv2d_27_w.bin"); - void* conv2d_27_w = readTrainedWeights(conv2d_27_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_27_b_path = dir_prefix + std::string("conv2d_27_b.bin"); - void* conv2d_27_b = readTrainedWeights(conv2d_27_b_path.c_str(), 0,1,1024,1,1); - std::string conv2d_28_w_path = dir_prefix + std::string("conv2d_28_w.bin"); - void* conv2d_28_w = readTrainedWeights(conv2d_28_w_path.c_str(), 0,1024,512,1,1); - std::string conv2d_28_b_path = dir_prefix + std::string("conv2d_28_b.bin"); - void* conv2d_28_b = readTrainedWeights(conv2d_28_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); - void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); - void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); - void* batch_normalization_27_mean = readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_variance_path = dir_prefix + 
std::string("batch_normalization_27_variance.bin"); - void* batch_normalization_27_variance = readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_gamma_path = dir_prefix + std::string("batch_normalization_28_gamma.bin"); - void* batch_normalization_28_gamma = readTrainedWeights(batch_normalization_28_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_beta_path = dir_prefix + std::string("batch_normalization_28_beta.bin"); - void* batch_normalization_28_beta = readTrainedWeights(batch_normalization_28_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_mean_path = dir_prefix + std::string("batch_normalization_28_mean.bin"); - void* batch_normalization_28_mean = readTrainedWeights(batch_normalization_28_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_variance_path = dir_prefix + std::string("batch_normalization_28_variance.bin"); - void* batch_normalization_28_variance = readTrainedWeights(batch_normalization_28_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_29_w_path = dir_prefix + std::string("conv2d_29_w.bin"); - void* conv2d_29_w = readTrainedWeights(conv2d_29_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_29_b_path = dir_prefix + std::string("conv2d_29_b.bin"); - void* conv2d_29_b = readTrainedWeights(conv2d_29_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_gamma_path = dir_prefix + std::string("batch_normalization_29_gamma.bin"); - void* batch_normalization_29_gamma = readTrainedWeights(batch_normalization_29_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_beta_path = dir_prefix + std::string("batch_normalization_29_beta.bin"); - void* batch_normalization_29_beta = readTrainedWeights(batch_normalization_29_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_mean_path = dir_prefix + std::string("batch_normalization_29_mean.bin"); - void* 
batch_normalization_29_mean = readTrainedWeights(batch_normalization_29_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_variance_path = dir_prefix + std::string("batch_normalization_29_variance.bin"); - void* batch_normalization_29_variance = readTrainedWeights(batch_normalization_29_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_30_w_path = dir_prefix + std::string("conv2d_30_w.bin"); - void* conv2d_30_w = readTrainedWeights(conv2d_30_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_30_b_path = dir_prefix + std::string("conv2d_30_b.bin"); - void* conv2d_30_b = readTrainedWeights(conv2d_30_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_gamma_path = dir_prefix + std::string("batch_normalization_30_gamma.bin"); - void* batch_normalization_30_gamma = readTrainedWeights(batch_normalization_30_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_beta_path = dir_prefix + std::string("batch_normalization_30_beta.bin"); - void* batch_normalization_30_beta = readTrainedWeights(batch_normalization_30_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_mean_path = dir_prefix + std::string("batch_normalization_30_mean.bin"); - void* batch_normalization_30_mean = readTrainedWeights(batch_normalization_30_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_variance_path = dir_prefix + std::string("batch_normalization_30_variance.bin"); - void* batch_normalization_30_variance = readTrainedWeights(batch_normalization_30_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_31_w_path = dir_prefix + std::string("conv2d_31_w.bin"); - void* conv2d_31_w = readTrainedWeights(conv2d_31_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_31_b_path = dir_prefix + std::string("conv2d_31_b.bin"); - void* conv2d_31_b = readTrainedWeights(conv2d_31_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_gamma_path = dir_prefix + 
std::string("batch_normalization_31_gamma.bin"); - void* batch_normalization_31_gamma = readTrainedWeights(batch_normalization_31_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_beta_path = dir_prefix + std::string("batch_normalization_31_beta.bin"); - void* batch_normalization_31_beta = readTrainedWeights(batch_normalization_31_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_mean_path = dir_prefix + std::string("batch_normalization_31_mean.bin"); - void* batch_normalization_31_mean = readTrainedWeights(batch_normalization_31_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_variance_path = dir_prefix + std::string("batch_normalization_31_variance.bin"); - void* batch_normalization_31_variance = readTrainedWeights(batch_normalization_31_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_32_w_path = dir_prefix + std::string("conv2d_32_w.bin"); - void* conv2d_32_w = readTrainedWeights(conv2d_32_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_32_b_path = dir_prefix + std::string("conv2d_32_b.bin"); - void* conv2d_32_b = readTrainedWeights(conv2d_32_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_gamma_path = dir_prefix + std::string("batch_normalization_32_gamma.bin"); - void* batch_normalization_32_gamma = readTrainedWeights(batch_normalization_32_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_beta_path = dir_prefix + std::string("batch_normalization_32_beta.bin"); - void* batch_normalization_32_beta = readTrainedWeights(batch_normalization_32_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_mean_path = dir_prefix + std::string("batch_normalization_32_mean.bin"); - void* batch_normalization_32_mean = readTrainedWeights(batch_normalization_32_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_variance_path = dir_prefix + std::string("batch_normalization_32_variance.bin"); - void* 
batch_normalization_32_variance = readTrainedWeights(batch_normalization_32_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_33_w_path = dir_prefix + std::string("conv2d_33_w.bin"); - void* conv2d_33_w = readTrainedWeights(conv2d_33_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_33_b_path = dir_prefix + std::string("conv2d_33_b.bin"); - void* conv2d_33_b = readTrainedWeights(conv2d_33_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_gamma_path = dir_prefix + std::string("batch_normalization_33_gamma.bin"); - void* batch_normalization_33_gamma = readTrainedWeights(batch_normalization_33_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_beta_path = dir_prefix + std::string("batch_normalization_33_beta.bin"); - void* batch_normalization_33_beta = readTrainedWeights(batch_normalization_33_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_mean_path = dir_prefix + std::string("batch_normalization_33_mean.bin"); - void* batch_normalization_33_mean = readTrainedWeights(batch_normalization_33_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_variance_path = dir_prefix + std::string("batch_normalization_33_variance.bin"); - void* batch_normalization_33_variance = readTrainedWeights(batch_normalization_33_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_34_w_path = dir_prefix + std::string("conv2d_34_w.bin"); - void* conv2d_34_w = readTrainedWeights(conv2d_34_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_34_b_path = dir_prefix + std::string("conv2d_34_b.bin"); - void* conv2d_34_b = readTrainedWeights(conv2d_34_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_gamma_path = dir_prefix + std::string("batch_normalization_34_gamma.bin"); - void* batch_normalization_34_gamma = readTrainedWeights(batch_normalization_34_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_beta_path = dir_prefix + 
std::string("batch_normalization_34_beta.bin"); - void* batch_normalization_34_beta = readTrainedWeights(batch_normalization_34_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_mean_path = dir_prefix + std::string("batch_normalization_34_mean.bin"); - void* batch_normalization_34_mean = readTrainedWeights(batch_normalization_34_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_variance_path = dir_prefix + std::string("batch_normalization_34_variance.bin"); - void* batch_normalization_34_variance = readTrainedWeights(batch_normalization_34_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_35_w_path = dir_prefix + std::string("conv2d_35_w.bin"); - void* conv2d_35_w = readTrainedWeights(conv2d_35_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_35_b_path = dir_prefix + std::string("conv2d_35_b.bin"); - void* conv2d_35_b = readTrainedWeights(conv2d_35_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_gamma_path = dir_prefix + std::string("batch_normalization_35_gamma.bin"); - void* batch_normalization_35_gamma = readTrainedWeights(batch_normalization_35_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_beta_path = dir_prefix + std::string("batch_normalization_35_beta.bin"); - void* batch_normalization_35_beta = readTrainedWeights(batch_normalization_35_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_mean_path = dir_prefix + std::string("batch_normalization_35_mean.bin"); - void* batch_normalization_35_mean = readTrainedWeights(batch_normalization_35_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_variance_path = dir_prefix + std::string("batch_normalization_35_variance.bin"); - void* batch_normalization_35_variance = readTrainedWeights(batch_normalization_35_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_36_w_path = dir_prefix + std::string("conv2d_36_w.bin"); - void* conv2d_36_w = 
readTrainedWeights(conv2d_36_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_36_b_path = dir_prefix + std::string("conv2d_36_b.bin"); - void* conv2d_36_b = readTrainedWeights(conv2d_36_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_gamma_path = dir_prefix + std::string("batch_normalization_36_gamma.bin"); - void* batch_normalization_36_gamma = readTrainedWeights(batch_normalization_36_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_beta_path = dir_prefix + std::string("batch_normalization_36_beta.bin"); - void* batch_normalization_36_beta = readTrainedWeights(batch_normalization_36_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_mean_path = dir_prefix + std::string("batch_normalization_36_mean.bin"); - void* batch_normalization_36_mean = readTrainedWeights(batch_normalization_36_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_variance_path = dir_prefix + std::string("batch_normalization_36_variance.bin"); - void* batch_normalization_36_variance = readTrainedWeights(batch_normalization_36_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_37_w_path = dir_prefix + std::string("conv2d_37_w.bin"); - void* conv2d_37_w = readTrainedWeights(conv2d_37_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_37_b_path = dir_prefix + std::string("conv2d_37_b.bin"); - void* conv2d_37_b = readTrainedWeights(conv2d_37_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_gamma_path = dir_prefix + std::string("batch_normalization_37_gamma.bin"); - void* batch_normalization_37_gamma = readTrainedWeights(batch_normalization_37_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_beta_path = dir_prefix + std::string("batch_normalization_37_beta.bin"); - void* batch_normalization_37_beta = readTrainedWeights(batch_normalization_37_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_mean_path = dir_prefix + 
std::string("batch_normalization_37_mean.bin"); - void* batch_normalization_37_mean = readTrainedWeights(batch_normalization_37_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_variance_path = dir_prefix + std::string("batch_normalization_37_variance.bin"); - void* batch_normalization_37_variance = readTrainedWeights(batch_normalization_37_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_38_w_path = dir_prefix + std::string("conv2d_38_w.bin"); - void* conv2d_38_w = readTrainedWeights(conv2d_38_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_38_b_path = dir_prefix + std::string("conv2d_38_b.bin"); - void* conv2d_38_b = readTrainedWeights(conv2d_38_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_gamma_path = dir_prefix + std::string("batch_normalization_38_gamma.bin"); - void* batch_normalization_38_gamma = readTrainedWeights(batch_normalization_38_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_beta_path = dir_prefix + std::string("batch_normalization_38_beta.bin"); - void* batch_normalization_38_beta = readTrainedWeights(batch_normalization_38_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_mean_path = dir_prefix + std::string("batch_normalization_38_mean.bin"); - void* batch_normalization_38_mean = readTrainedWeights(batch_normalization_38_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_variance_path = dir_prefix + std::string("batch_normalization_38_variance.bin"); - void* batch_normalization_38_variance = readTrainedWeights(batch_normalization_38_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_39_w_path = dir_prefix + std::string("conv2d_39_w.bin"); - void* conv2d_39_w = readTrainedWeights(conv2d_39_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_39_b_path = dir_prefix + std::string("conv2d_39_b.bin"); - void* conv2d_39_b = readTrainedWeights(conv2d_39_b_path.c_str(), 0,1,256,1,1); - std::string 
batch_normalization_39_gamma_path = dir_prefix + std::string("batch_normalization_39_gamma.bin"); - void* batch_normalization_39_gamma = readTrainedWeights(batch_normalization_39_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_39_beta_path = dir_prefix + std::string("batch_normalization_39_beta.bin"); - void* batch_normalization_39_beta = readTrainedWeights(batch_normalization_39_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_39_mean_path = dir_prefix + std::string("batch_normalization_39_mean.bin"); - void* batch_normalization_39_mean = readTrainedWeights(batch_normalization_39_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_39_variance_path = dir_prefix + std::string("batch_normalization_39_variance.bin"); - void* batch_normalization_39_variance = readTrainedWeights(batch_normalization_39_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_40_w_path = dir_prefix + std::string("conv2d_40_w.bin"); - void* conv2d_40_w = readTrainedWeights(conv2d_40_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_40_b_path = dir_prefix + std::string("conv2d_40_b.bin"); - void* conv2d_40_b = readTrainedWeights(conv2d_40_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_gamma_path = dir_prefix + std::string("batch_normalization_40_gamma.bin"); - void* batch_normalization_40_gamma = readTrainedWeights(batch_normalization_40_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_beta_path = dir_prefix + std::string("batch_normalization_40_beta.bin"); - void* batch_normalization_40_beta = readTrainedWeights(batch_normalization_40_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_mean_path = dir_prefix + std::string("batch_normalization_40_mean.bin"); - void* batch_normalization_40_mean = readTrainedWeights(batch_normalization_40_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_variance_path = dir_prefix + 
std::string("batch_normalization_40_variance.bin"); - void* batch_normalization_40_variance = readTrainedWeights(batch_normalization_40_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_41_w_path = dir_prefix + std::string("conv2d_41_w.bin"); - void* conv2d_41_w = readTrainedWeights(conv2d_41_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_41_b_path = dir_prefix + std::string("conv2d_41_b.bin"); - void* conv2d_41_b = readTrainedWeights(conv2d_41_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_gamma_path = dir_prefix + std::string("batch_normalization_41_gamma.bin"); - void* batch_normalization_41_gamma = readTrainedWeights(batch_normalization_41_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_beta_path = dir_prefix + std::string("batch_normalization_41_beta.bin"); - void* batch_normalization_41_beta = readTrainedWeights(batch_normalization_41_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_mean_path = dir_prefix + std::string("batch_normalization_41_mean.bin"); - void* batch_normalization_41_mean = readTrainedWeights(batch_normalization_41_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_variance_path = dir_prefix + std::string("batch_normalization_41_variance.bin"); - void* batch_normalization_41_variance = readTrainedWeights(batch_normalization_41_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_42_w_path = dir_prefix + std::string("conv2d_42_w.bin"); - void* conv2d_42_w = readTrainedWeights(conv2d_42_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_42_b_path = dir_prefix + std::string("conv2d_42_b.bin"); - void* conv2d_42_b = readTrainedWeights(conv2d_42_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_gamma_path = dir_prefix + std::string("batch_normalization_42_gamma.bin"); - void* batch_normalization_42_gamma = readTrainedWeights(batch_normalization_42_gamma_path.c_str(), 0,1,256,1,1); - std::string 
batch_normalization_42_beta_path = dir_prefix + std::string("batch_normalization_42_beta.bin"); - void* batch_normalization_42_beta = readTrainedWeights(batch_normalization_42_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_mean_path = dir_prefix + std::string("batch_normalization_42_mean.bin"); - void* batch_normalization_42_mean = readTrainedWeights(batch_normalization_42_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_variance_path = dir_prefix + std::string("batch_normalization_42_variance.bin"); - void* batch_normalization_42_variance = readTrainedWeights(batch_normalization_42_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_43_w_path = dir_prefix + std::string("conv2d_43_w.bin"); - void* conv2d_43_w = readTrainedWeights(conv2d_43_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_43_b_path = dir_prefix + std::string("conv2d_43_b.bin"); - void* conv2d_43_b = readTrainedWeights(conv2d_43_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_gamma_path = dir_prefix + std::string("batch_normalization_43_gamma.bin"); - void* batch_normalization_43_gamma = readTrainedWeights(batch_normalization_43_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_beta_path = dir_prefix + std::string("batch_normalization_43_beta.bin"); - void* batch_normalization_43_beta = readTrainedWeights(batch_normalization_43_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_mean_path = dir_prefix + std::string("batch_normalization_43_mean.bin"); - void* batch_normalization_43_mean = readTrainedWeights(batch_normalization_43_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_variance_path = dir_prefix + std::string("batch_normalization_43_variance.bin"); - void* batch_normalization_43_variance = readTrainedWeights(batch_normalization_43_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_44_w_path = dir_prefix + std::string("conv2d_44_w.bin"); - void* 
conv2d_44_w = readTrainedWeights(conv2d_44_w_path.c_str(), 0,512,1024,1,1); - std::string conv2d_44_b_path = dir_prefix + std::string("conv2d_44_b.bin"); - void* conv2d_44_b = readTrainedWeights(conv2d_44_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_gamma_path = dir_prefix + std::string("batch_normalization_44_gamma.bin"); - void* batch_normalization_44_gamma = readTrainedWeights(batch_normalization_44_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_beta_path = dir_prefix + std::string("batch_normalization_44_beta.bin"); - void* batch_normalization_44_beta = readTrainedWeights(batch_normalization_44_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_mean_path = dir_prefix + std::string("batch_normalization_44_mean.bin"); - void* batch_normalization_44_mean = readTrainedWeights(batch_normalization_44_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_variance_path = dir_prefix + std::string("batch_normalization_44_variance.bin"); - void* batch_normalization_44_variance = readTrainedWeights(batch_normalization_44_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_45_w_path = dir_prefix + std::string("conv2d_45_w.bin"); - void* conv2d_45_w = readTrainedWeights(conv2d_45_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_45_b_path = dir_prefix + std::string("conv2d_45_b.bin"); - void* conv2d_45_b = readTrainedWeights(conv2d_45_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_gamma_path = dir_prefix + std::string("batch_normalization_45_gamma.bin"); - void* batch_normalization_45_gamma = readTrainedWeights(batch_normalization_45_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_beta_path = dir_prefix + std::string("batch_normalization_45_beta.bin"); - void* batch_normalization_45_beta = readTrainedWeights(batch_normalization_45_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_mean_path = dir_prefix + 
std::string("batch_normalization_45_mean.bin"); - void* batch_normalization_45_mean = readTrainedWeights(batch_normalization_45_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_variance_path = dir_prefix + std::string("batch_normalization_45_variance.bin"); - void* batch_normalization_45_variance = readTrainedWeights(batch_normalization_45_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_46_w_path = dir_prefix + std::string("conv2d_46_w.bin"); - void* conv2d_46_w = readTrainedWeights(conv2d_46_w_path.c_str(), 0,2048,512,1,1); - std::string conv2d_46_b_path = dir_prefix + std::string("conv2d_46_b.bin"); - void* conv2d_46_b = readTrainedWeights(conv2d_46_b_path.c_str(), 0,1,2048,1,1); - std::string conv2d_47_w_path = dir_prefix + std::string("conv2d_47_w.bin"); - void* conv2d_47_w = readTrainedWeights(conv2d_47_w_path.c_str(), 0,2048,1024,1,1); - std::string conv2d_47_b_path = dir_prefix + std::string("conv2d_47_b.bin"); - void* conv2d_47_b = readTrainedWeights(conv2d_47_b_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_gamma_path = dir_prefix + std::string("batch_normalization_46_gamma.bin"); - void* batch_normalization_46_gamma = readTrainedWeights(batch_normalization_46_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_beta_path = dir_prefix + std::string("batch_normalization_46_beta.bin"); - void* batch_normalization_46_beta = readTrainedWeights(batch_normalization_46_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_mean_path = dir_prefix + std::string("batch_normalization_46_mean.bin"); - void* batch_normalization_46_mean = readTrainedWeights(batch_normalization_46_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_variance_path = dir_prefix + std::string("batch_normalization_46_variance.bin"); - void* batch_normalization_46_variance = readTrainedWeights(batch_normalization_46_variance_path.c_str(), 0,1,2048,1,1); - std::string 
batch_normalization_47_gamma_path = dir_prefix + std::string("batch_normalization_47_gamma.bin"); - void* batch_normalization_47_gamma = readTrainedWeights(batch_normalization_47_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_beta_path = dir_prefix + std::string("batch_normalization_47_beta.bin"); - void* batch_normalization_47_beta = readTrainedWeights(batch_normalization_47_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_mean_path = dir_prefix + std::string("batch_normalization_47_mean.bin"); - void* batch_normalization_47_mean = readTrainedWeights(batch_normalization_47_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_variance_path = dir_prefix + std::string("batch_normalization_47_variance.bin"); - void* batch_normalization_47_variance = readTrainedWeights(batch_normalization_47_variance_path.c_str(), 0,1,2048,1,1); - std::string conv2d_48_w_path = dir_prefix + std::string("conv2d_48_w.bin"); - void* conv2d_48_w = readTrainedWeights(conv2d_48_w_path.c_str(), 0,512,2048,1,1); - std::string conv2d_48_b_path = dir_prefix + std::string("conv2d_48_b.bin"); - void* conv2d_48_b = readTrainedWeights(conv2d_48_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_gamma_path = dir_prefix + std::string("batch_normalization_48_gamma.bin"); - void* batch_normalization_48_gamma = readTrainedWeights(batch_normalization_48_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_beta_path = dir_prefix + std::string("batch_normalization_48_beta.bin"); - void* batch_normalization_48_beta = readTrainedWeights(batch_normalization_48_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_mean_path = dir_prefix + std::string("batch_normalization_48_mean.bin"); - void* batch_normalization_48_mean = readTrainedWeights(batch_normalization_48_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_variance_path = dir_prefix + 
std::string("batch_normalization_48_variance.bin"); - void* batch_normalization_48_variance = readTrainedWeights(batch_normalization_48_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_49_w_path = dir_prefix + std::string("conv2d_49_w.bin"); - void* conv2d_49_w = readTrainedWeights(conv2d_49_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_49_b_path = dir_prefix + std::string("conv2d_49_b.bin"); - void* conv2d_49_b = readTrainedWeights(conv2d_49_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_gamma_path = dir_prefix + std::string("batch_normalization_49_gamma.bin"); - void* batch_normalization_49_gamma = readTrainedWeights(batch_normalization_49_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_beta_path = dir_prefix + std::string("batch_normalization_49_beta.bin"); - void* batch_normalization_49_beta = readTrainedWeights(batch_normalization_49_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_mean_path = dir_prefix + std::string("batch_normalization_49_mean.bin"); - void* batch_normalization_49_mean = readTrainedWeights(batch_normalization_49_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_variance_path = dir_prefix + std::string("batch_normalization_49_variance.bin"); - void* batch_normalization_49_variance = readTrainedWeights(batch_normalization_49_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_50_w_path = dir_prefix + std::string("conv2d_50_w.bin"); - void* conv2d_50_w = readTrainedWeights(conv2d_50_w_path.c_str(), 0,2048,512,1,1); - std::string conv2d_50_b_path = dir_prefix + std::string("conv2d_50_b.bin"); - void* conv2d_50_b = readTrainedWeights(conv2d_50_b_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_gamma_path = dir_prefix + std::string("batch_normalization_50_gamma.bin"); - void* batch_normalization_50_gamma = readTrainedWeights(batch_normalization_50_gamma_path.c_str(), 0,1,2048,1,1); - std::string 
batch_normalization_50_beta_path = dir_prefix + std::string("batch_normalization_50_beta.bin"); - void* batch_normalization_50_beta = readTrainedWeights(batch_normalization_50_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_mean_path = dir_prefix + std::string("batch_normalization_50_mean.bin"); - void* batch_normalization_50_mean = readTrainedWeights(batch_normalization_50_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_variance_path = dir_prefix + std::string("batch_normalization_50_variance.bin"); - void* batch_normalization_50_variance = readTrainedWeights(batch_normalization_50_variance_path.c_str(), 0,1,2048,1,1); - std::string conv2d_51_w_path = dir_prefix + std::string("conv2d_51_w.bin"); - void* conv2d_51_w = readTrainedWeights(conv2d_51_w_path.c_str(), 0,512,2048,1,1); - std::string conv2d_51_b_path = dir_prefix + std::string("conv2d_51_b.bin"); - void* conv2d_51_b = readTrainedWeights(conv2d_51_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_gamma_path = dir_prefix + std::string("batch_normalization_51_gamma.bin"); - void* batch_normalization_51_gamma = readTrainedWeights(batch_normalization_51_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_beta_path = dir_prefix + std::string("batch_normalization_51_beta.bin"); - void* batch_normalization_51_beta = readTrainedWeights(batch_normalization_51_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_mean_path = dir_prefix + std::string("batch_normalization_51_mean.bin"); - void* batch_normalization_51_mean = readTrainedWeights(batch_normalization_51_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_variance_path = dir_prefix + std::string("batch_normalization_51_variance.bin"); - void* batch_normalization_51_variance = readTrainedWeights(batch_normalization_51_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_52_w_path = dir_prefix + std::string("conv2d_52_w.bin"); - void* 
conv2d_52_w = readTrainedWeights(conv2d_52_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_52_b_path = dir_prefix + std::string("conv2d_52_b.bin"); - void* conv2d_52_b = readTrainedWeights(conv2d_52_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_gamma_path = dir_prefix + std::string("batch_normalization_52_gamma.bin"); - void* batch_normalization_52_gamma = readTrainedWeights(batch_normalization_52_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_beta_path = dir_prefix + std::string("batch_normalization_52_beta.bin"); - void* batch_normalization_52_beta = readTrainedWeights(batch_normalization_52_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_mean_path = dir_prefix + std::string("batch_normalization_52_mean.bin"); - void* batch_normalization_52_mean = readTrainedWeights(batch_normalization_52_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_variance_path = dir_prefix + std::string("batch_normalization_52_variance.bin"); - void* batch_normalization_52_variance = readTrainedWeights(batch_normalization_52_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_53_w_path = dir_prefix + std::string("conv2d_53_w.bin"); - void* conv2d_53_w = readTrainedWeights(conv2d_53_w_path.c_str(), 0,2048,512,1,1); - std::string conv2d_53_b_path = dir_prefix + std::string("conv2d_53_b.bin"); - void* conv2d_53_b = readTrainedWeights(conv2d_53_b_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_gamma_path = dir_prefix + std::string("batch_normalization_53_gamma.bin"); - void* batch_normalization_53_gamma = readTrainedWeights(batch_normalization_53_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_beta_path = dir_prefix + std::string("batch_normalization_53_beta.bin"); - void* batch_normalization_53_beta = readTrainedWeights(batch_normalization_53_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_mean_path = dir_prefix + 
std::string("batch_normalization_53_mean.bin"); - void* batch_normalization_53_mean = readTrainedWeights(batch_normalization_53_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_variance_path = dir_prefix + std::string("batch_normalization_53_variance.bin"); - void* batch_normalization_53_variance = readTrainedWeights(batch_normalization_53_variance_path.c_str(), 0,1,2048,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,1000); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,1000,1,1); + startMemTracking(); + int test_input_size = 500; + int batch_size = 100; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + for (int i = 0; i < batch_count; i++) { - startMemTracking(); + int start = i * batch_size; + int end = (i + 1) * batch_size; - int test_input_size = 500; - int batch_size = 100; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); - for(int i = 0; i < batch_count; i++){ + void *var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1); + void *var_3 = tensorAdd(var_2, conv2d_1_b); + void *var_4 = tensorRelu(var_3); + void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2); + void *var_6 = tensorBatchNorm( + var_5, batch_normalization_1_gamma, batch_normalization_1_beta, + batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void *var_8 = tensorAdd(var_7, conv2d_2_b); + void *var_9 = tensorBatchNorm( + var_8, batch_normalization_2_gamma, batch_normalization_2_beta, + batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void *var_10 = tensorRelu(var_9); + void *var_11 = tensorConvolution(var_10, 
conv2d_3_w, 1, 1, 1, 1, 1, 1); + void *var_12 = tensorAdd(var_11, conv2d_3_b); + void *var_13 = tensorBatchNorm( + var_12, batch_normalization_3_gamma, batch_normalization_3_beta, + batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void *var_14 = tensorRelu(var_13); + void *var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void *var_16 = tensorAdd(var_15, conv2d_4_b); + void *var_17 = tensorBatchNorm( + var_16, batch_normalization_4_gamma, batch_normalization_4_beta, + batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void *var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void *var_19 = tensorAdd(var_18, conv2d_5_b); + void *var_20 = tensorBatchNorm( + var_19, batch_normalization_5_gamma, batch_normalization_5_beta, + batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void *var_21 = tensorAdd(var_17, var_20); + void *var_22 = tensorRelu(var_21); + void *var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void *var_24 = tensorAdd(var_23, conv2d_6_b); + void *var_25 = tensorBatchNorm( + var_24, batch_normalization_6_gamma, batch_normalization_6_beta, + batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void *var_26 = tensorRelu(var_25); + void *var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1); + void *var_28 = tensorAdd(var_27, conv2d_7_b); + void *var_29 = tensorBatchNorm( + var_28, batch_normalization_7_gamma, batch_normalization_7_beta, + batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void *var_30 = tensorRelu(var_29); + void *var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void *var_32 = tensorAdd(var_31, conv2d_8_b); + void *var_33 = tensorBatchNorm( + var_32, batch_normalization_8_gamma, batch_normalization_8_beta, + batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void *var_34 = tensorAdd(var_33, var_22); + void *var_35 = tensorRelu(var_34); + void 
*var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void *var_37 = tensorAdd(var_36, conv2d_9_b); + void *var_38 = tensorBatchNorm( + var_37, batch_normalization_9_gamma, batch_normalization_9_beta, + batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void *var_39 = tensorRelu(var_38); + void *var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1); + void *var_41 = tensorAdd(var_40, conv2d_10_b); + void *var_42 = tensorBatchNorm( + var_41, batch_normalization_10_gamma, batch_normalization_10_beta, + batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void *var_43 = tensorRelu(var_42); + void *var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void *var_45 = tensorAdd(var_44, conv2d_11_b); + void *var_46 = tensorBatchNorm( + var_45, batch_normalization_11_gamma, batch_normalization_11_beta, + batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void *var_47 = tensorAdd(var_46, var_35); + void *var_48 = tensorRelu(var_47); + void *var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1); + void *var_50 = tensorAdd(var_49, conv2d_12_b); + void *var_51 = tensorBatchNorm( + var_50, batch_normalization_12_gamma, batch_normalization_12_beta, + batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void *var_52 = tensorRelu(var_51); + void *var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1); + void *var_54 = tensorAdd(var_53, conv2d_13_b); + void *var_55 = tensorBatchNorm( + var_54, batch_normalization_13_gamma, batch_normalization_13_beta, + batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void *var_56 = tensorRelu(var_55); + void *var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void *var_58 = tensorAdd(var_57, conv2d_14_b); + void *var_59 = tensorBatchNorm( + var_58, batch_normalization_14_gamma, batch_normalization_14_beta, + batch_normalization_14_mean, 
batch_normalization_14_variance, 0.001); + void *var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1); + void *var_61 = tensorAdd(var_60, conv2d_15_b); + void *var_62 = tensorBatchNorm( + var_61, batch_normalization_15_gamma, batch_normalization_15_beta, + batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void *var_63 = tensorAdd(var_59, var_62); + void *var_64 = tensorRelu(var_63); + void *var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1); + void *var_66 = tensorAdd(var_65, conv2d_16_b); + void *var_67 = tensorBatchNorm( + var_66, batch_normalization_16_gamma, batch_normalization_16_beta, + batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void *var_68 = tensorRelu(var_67); + void *var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1); + void *var_70 = tensorAdd(var_69, conv2d_17_b); + void *var_71 = tensorBatchNorm( + var_70, batch_normalization_17_gamma, batch_normalization_17_beta, + batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void *var_72 = tensorRelu(var_71); + void *var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1); + void *var_74 = tensorAdd(var_73, conv2d_18_b); + void *var_75 = tensorBatchNorm( + var_74, batch_normalization_18_gamma, batch_normalization_18_beta, + batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void *var_76 = tensorAdd(var_75, var_64); + void *var_77 = tensorRelu(var_76); + void *var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1); + void *var_79 = tensorAdd(var_78, conv2d_19_b); + void *var_80 = tensorBatchNorm( + var_79, batch_normalization_19_gamma, batch_normalization_19_beta, + batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void *var_81 = tensorRelu(var_80); + void *var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1); + void *var_83 = tensorAdd(var_82, conv2d_20_b); + void *var_84 = tensorBatchNorm( + var_83, 
batch_normalization_20_gamma, batch_normalization_20_beta, + batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void *var_85 = tensorRelu(var_84); + void *var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1); + void *var_87 = tensorAdd(var_86, conv2d_21_b); + void *var_88 = tensorBatchNorm( + var_87, batch_normalization_21_gamma, batch_normalization_21_beta, + batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void *var_89 = tensorAdd(var_88, var_77); + void *var_90 = tensorRelu(var_89); + void *var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1); + void *var_92 = tensorAdd(var_91, conv2d_22_b); + void *var_93 = tensorBatchNorm( + var_92, batch_normalization_22_gamma, batch_normalization_22_beta, + batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void *var_94 = tensorRelu(var_93); + void *var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1); + void *var_96 = tensorAdd(var_95, conv2d_23_b); + void *var_97 = tensorBatchNorm( + var_96, batch_normalization_23_gamma, batch_normalization_23_beta, + batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void *var_98 = tensorRelu(var_97); + void *var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1); + void *var_100 = tensorAdd(var_99, conv2d_24_b); + void *var_101 = tensorBatchNorm( + var_100, batch_normalization_24_gamma, batch_normalization_24_beta, + batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void *var_102 = tensorAdd(var_101, var_90); + void *var_103 = tensorRelu(var_102); + void *var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1); + void *var_105 = tensorAdd(var_104, conv2d_25_b); + void *var_106 = tensorBatchNorm( + var_105, batch_normalization_25_gamma, batch_normalization_25_beta, + batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void *var_107 = tensorRelu(var_106); + void *var_108 = tensorConvolution(var_107, 
conv2d_26_w, 1, 1, 1, 1, 1, 1); + void *var_109 = tensorAdd(var_108, conv2d_26_b); + void *var_110 = tensorBatchNorm( + var_109, batch_normalization_26_gamma, batch_normalization_26_beta, + batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void *var_111 = tensorRelu(var_110); + void *var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1); + void *var_113 = tensorAdd(var_112, conv2d_27_b); + void *var_114 = tensorBatchNorm( + var_113, batch_normalization_27_gamma, batch_normalization_27_beta, + batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void *var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1); + void *var_116 = tensorAdd(var_115, conv2d_28_b); + void *var_117 = tensorBatchNorm( + var_116, batch_normalization_28_gamma, batch_normalization_28_beta, + batch_normalization_28_mean, batch_normalization_28_variance, 0.001); + void *var_118 = tensorAdd(var_114, var_117); + void *var_119 = tensorRelu(var_118); + void *var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1); + void *var_121 = tensorAdd(var_120, conv2d_29_b); + void *var_122 = tensorBatchNorm( + var_121, batch_normalization_29_gamma, batch_normalization_29_beta, + batch_normalization_29_mean, batch_normalization_29_variance, 0.001); + void *var_123 = tensorRelu(var_122); + void *var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1); + void *var_125 = tensorAdd(var_124, conv2d_30_b); + void *var_126 = tensorBatchNorm( + var_125, batch_normalization_30_gamma, batch_normalization_30_beta, + batch_normalization_30_mean, batch_normalization_30_variance, 0.001); + void *var_127 = tensorRelu(var_126); + void *var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1); + void *var_129 = tensorAdd(var_128, conv2d_31_b); + void *var_130 = tensorBatchNorm( + var_129, batch_normalization_31_gamma, batch_normalization_31_beta, + batch_normalization_31_mean, batch_normalization_31_variance, 0.001); + void 
*var_131 = tensorAdd(var_130, var_119); + void *var_132 = tensorRelu(var_131); + void *var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1); + void *var_134 = tensorAdd(var_133, conv2d_32_b); + void *var_135 = tensorBatchNorm( + var_134, batch_normalization_32_gamma, batch_normalization_32_beta, + batch_normalization_32_mean, batch_normalization_32_variance, 0.001); + void *var_136 = tensorRelu(var_135); + void *var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1); + void *var_138 = tensorAdd(var_137, conv2d_33_b); + void *var_139 = tensorBatchNorm( + var_138, batch_normalization_33_gamma, batch_normalization_33_beta, + batch_normalization_33_mean, batch_normalization_33_variance, 0.001); + void *var_140 = tensorRelu(var_139); + void *var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1); + void *var_142 = tensorAdd(var_141, conv2d_34_b); + void *var_143 = tensorBatchNorm( + var_142, batch_normalization_34_gamma, batch_normalization_34_beta, + batch_normalization_34_mean, batch_normalization_34_variance, 0.001); + void *var_144 = tensorAdd(var_143, var_132); + void *var_145 = tensorRelu(var_144); + void *var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1); + void *var_147 = tensorAdd(var_146, conv2d_35_b); + void *var_148 = tensorBatchNorm( + var_147, batch_normalization_35_gamma, batch_normalization_35_beta, + batch_normalization_35_mean, batch_normalization_35_variance, 0.001); + void *var_149 = tensorRelu(var_148); + void *var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1); + void *var_151 = tensorAdd(var_150, conv2d_36_b); + void *var_152 = tensorBatchNorm( + var_151, batch_normalization_36_gamma, batch_normalization_36_beta, + batch_normalization_36_mean, batch_normalization_36_variance, 0.001); + void *var_153 = tensorRelu(var_152); + void *var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1); + void *var_155 = tensorAdd(var_154, conv2d_37_b); + void *var_156 = 
tensorBatchNorm( + var_155, batch_normalization_37_gamma, batch_normalization_37_beta, + batch_normalization_37_mean, batch_normalization_37_variance, 0.001); + void *var_157 = tensorAdd(var_156, var_145); + void *var_158 = tensorRelu(var_157); + void *var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1); + void *var_160 = tensorAdd(var_159, conv2d_38_b); + void *var_161 = tensorBatchNorm( + var_160, batch_normalization_38_gamma, batch_normalization_38_beta, + batch_normalization_38_mean, batch_normalization_38_variance, 0.001); + void *var_162 = tensorRelu(var_161); + void *var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1); + void *var_164 = tensorAdd(var_163, conv2d_39_b); + void *var_165 = tensorBatchNorm( + var_164, batch_normalization_39_gamma, batch_normalization_39_beta, + batch_normalization_39_mean, batch_normalization_39_variance, 0.001); + void *var_166 = tensorRelu(var_165); + void *var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1); + void *var_168 = tensorAdd(var_167, conv2d_40_b); + void *var_169 = tensorBatchNorm( + var_168, batch_normalization_40_gamma, batch_normalization_40_beta, + batch_normalization_40_mean, batch_normalization_40_variance, 0.001); + void *var_170 = tensorAdd(var_169, var_158); + void *var_171 = tensorRelu(var_170); + void *var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1); + void *var_173 = tensorAdd(var_172, conv2d_41_b); + void *var_174 = tensorBatchNorm( + var_173, batch_normalization_41_gamma, batch_normalization_41_beta, + batch_normalization_41_mean, batch_normalization_41_variance, 0.001); + void *var_175 = tensorRelu(var_174); + void *var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1); + void *var_177 = tensorAdd(var_176, conv2d_42_b); + void *var_178 = tensorBatchNorm( + var_177, batch_normalization_42_gamma, batch_normalization_42_beta, + batch_normalization_42_mean, batch_normalization_42_variance, 0.001); + void *var_179 = 
tensorRelu(var_178); + void *var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1); + void *var_181 = tensorAdd(var_180, conv2d_43_b); + void *var_182 = tensorBatchNorm( + var_181, batch_normalization_43_gamma, batch_normalization_43_beta, + batch_normalization_43_mean, batch_normalization_43_variance, 0.001); + void *var_183 = tensorAdd(var_182, var_171); + void *var_184 = tensorRelu(var_183); + void *var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1); + void *var_186 = tensorAdd(var_185, conv2d_44_b); + void *var_187 = tensorBatchNorm( + var_186, batch_normalization_44_gamma, batch_normalization_44_beta, + batch_normalization_44_mean, batch_normalization_44_variance, 0.001); + void *var_188 = tensorRelu(var_187); + void *var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1); + void *var_190 = tensorAdd(var_189, conv2d_45_b); + void *var_191 = tensorBatchNorm( + var_190, batch_normalization_45_gamma, batch_normalization_45_beta, + batch_normalization_45_mean, batch_normalization_45_variance, 0.001); + void *var_192 = tensorRelu(var_191); + void *var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1); + void *var_194 = tensorAdd(var_193, conv2d_46_b); + void *var_195 = tensorBatchNorm( + var_194, batch_normalization_46_gamma, batch_normalization_46_beta, + batch_normalization_46_mean, batch_normalization_46_variance, 0.001); + void *var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1); + void *var_197 = tensorAdd(var_196, conv2d_47_b); + void *var_198 = tensorBatchNorm( + var_197, batch_normalization_47_gamma, batch_normalization_47_beta, + batch_normalization_47_mean, batch_normalization_47_variance, 0.001); + void *var_199 = tensorAdd(var_195, var_198); + void *var_200 = tensorRelu(var_199); + void *var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1); + void *var_202 = tensorAdd(var_201, conv2d_48_b); + void *var_203 = tensorBatchNorm( + var_202, batch_normalization_48_gamma, 
batch_normalization_48_beta, + batch_normalization_48_mean, batch_normalization_48_variance, 0.001); + void *var_204 = tensorRelu(var_203); + void *var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1); + void *var_206 = tensorAdd(var_205, conv2d_49_b); + void *var_207 = tensorBatchNorm( + var_206, batch_normalization_49_gamma, batch_normalization_49_beta, + batch_normalization_49_mean, batch_normalization_49_variance, 0.001); + void *var_208 = tensorRelu(var_207); + void *var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1); + void *var_210 = tensorAdd(var_209, conv2d_50_b); + void *var_211 = tensorBatchNorm( + var_210, batch_normalization_50_gamma, batch_normalization_50_beta, + batch_normalization_50_mean, batch_normalization_50_variance, 0.001); + void *var_212 = tensorAdd(var_211, var_200); + void *var_213 = tensorRelu(var_212); + void *var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1); + void *var_215 = tensorAdd(var_214, conv2d_51_b); + void *var_216 = tensorBatchNorm( + var_215, batch_normalization_51_gamma, batch_normalization_51_beta, + batch_normalization_51_mean, batch_normalization_51_variance, 0.001); + void *var_217 = tensorRelu(var_216); + void *var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1); + void *var_219 = tensorAdd(var_218, conv2d_52_b); + void *var_220 = tensorBatchNorm( + var_219, batch_normalization_52_gamma, batch_normalization_52_beta, + batch_normalization_52_mean, batch_normalization_52_variance, 0.001); + void *var_221 = tensorRelu(var_220); + void *var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1); + void *var_223 = tensorAdd(var_222, conv2d_53_b); + void *var_224 = tensorBatchNorm( + var_223, batch_normalization_53_gamma, batch_normalization_53_beta, + batch_normalization_53_mean, batch_normalization_53_variance, 0.001); + void *var_225 = tensorAdd(var_224, var_213); + void *var_226 = tensorRelu(var_225); + void *var_227 = tensorPooling(var_226, 1, 
7, 7, 0, 0, 7, 7); + void *var_229 = tensorGemmGPU(var_227, dense_1_w); + void *var_230 = tensorAdd(var_229, dense_1_b); + void *var_231 = tensorSoftmax(var_230); - int start = i * batch_size; - int end = (i + 1) * batch_size; + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end); - void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); - - void* var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1); - void* var_3 = tensorAdd(var_2, conv2d_1_b); - void* var_4 = tensorRelu(var_3); - void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); - void* var_6 = tensorBatchNorm(var_5, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); - void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); - void* var_8 = tensorAdd(var_7, conv2d_2_b); - void* var_9 = tensorBatchNorm(var_8, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); - void* var_10 = tensorRelu(var_9); - void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); - void* var_12 = tensorAdd(var_11, conv2d_3_b); - void* var_13 = tensorBatchNorm(var_12, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); - void* var_14 = tensorRelu(var_13); - void* var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1); - void* var_16 = tensorAdd(var_15, conv2d_4_b); - void* var_17 = tensorBatchNorm(var_16, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); - void* var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1); - void* var_19 = tensorAdd(var_18, conv2d_5_b); - void* var_20 = tensorBatchNorm(var_19, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); - void* var_21 = tensorAdd(var_17, 
var_20); - void* var_22 = tensorRelu(var_21); - void* var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1); - void* var_24 = tensorAdd(var_23, conv2d_6_b); - void* var_25 = tensorBatchNorm(var_24, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); - void* var_26 = tensorRelu(var_25); - void* var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1); - void* var_28 = tensorAdd(var_27, conv2d_7_b); - void* var_29 = tensorBatchNorm(var_28, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); - void* var_30 = tensorRelu(var_29); - void* var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1); - void* var_32 = tensorAdd(var_31, conv2d_8_b); - void* var_33 = tensorBatchNorm(var_32, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); - void* var_34 = tensorAdd(var_33, var_22); - void* var_35 = tensorRelu(var_34); - void* var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1); - void* var_37 = tensorAdd(var_36, conv2d_9_b); - void* var_38 = tensorBatchNorm(var_37, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); - void* var_39 = tensorRelu(var_38); - void* var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1); - void* var_41 = tensorAdd(var_40, conv2d_10_b); - void* var_42 = tensorBatchNorm(var_41, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); - void* var_43 = tensorRelu(var_42); - void* var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1); - void* var_45 = tensorAdd(var_44, conv2d_11_b); - void* var_46 = tensorBatchNorm(var_45, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, 
batch_normalization_11_variance, 0.001); - void* var_47 = tensorAdd(var_46, var_35); - void* var_48 = tensorRelu(var_47); - void* var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1); - void* var_50 = tensorAdd(var_49, conv2d_12_b); - void* var_51 = tensorBatchNorm(var_50, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); - void* var_52 = tensorRelu(var_51); - void* var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1); - void* var_54 = tensorAdd(var_53, conv2d_13_b); - void* var_55 = tensorBatchNorm(var_54, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); - void* var_56 = tensorRelu(var_55); - void* var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1); - void* var_58 = tensorAdd(var_57, conv2d_14_b); - void* var_59 = tensorBatchNorm(var_58, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); - void* var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1); - void* var_61 = tensorAdd(var_60, conv2d_15_b); - void* var_62 = tensorBatchNorm(var_61, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); - void* var_63 = tensorAdd(var_59, var_62); - void* var_64 = tensorRelu(var_63); - void* var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1); - void* var_66 = tensorAdd(var_65, conv2d_16_b); - void* var_67 = tensorBatchNorm(var_66, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); - void* var_68 = tensorRelu(var_67); - void* var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1); - void* var_70 = tensorAdd(var_69, conv2d_17_b); - void* var_71 = tensorBatchNorm(var_70, batch_normalization_17_gamma, 
batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); - void* var_72 = tensorRelu(var_71); - void* var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1); - void* var_74 = tensorAdd(var_73, conv2d_18_b); - void* var_75 = tensorBatchNorm(var_74, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); - void* var_76 = tensorAdd(var_75, var_64); - void* var_77 = tensorRelu(var_76); - void* var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1); - void* var_79 = tensorAdd(var_78, conv2d_19_b); - void* var_80 = tensorBatchNorm(var_79, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); - void* var_81 = tensorRelu(var_80); - void* var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1); - void* var_83 = tensorAdd(var_82, conv2d_20_b); - void* var_84 = tensorBatchNorm(var_83, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); - void* var_85 = tensorRelu(var_84); - void* var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1); - void* var_87 = tensorAdd(var_86, conv2d_21_b); - void* var_88 = tensorBatchNorm(var_87, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); - void* var_89 = tensorAdd(var_88, var_77); - void* var_90 = tensorRelu(var_89); - void* var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1); - void* var_92 = tensorAdd(var_91, conv2d_22_b); - void* var_93 = tensorBatchNorm(var_92, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); - void* var_94 = tensorRelu(var_93); - void* var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1); - void* var_96 = tensorAdd(var_95, 
conv2d_23_b); - void* var_97 = tensorBatchNorm(var_96, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); - void* var_98 = tensorRelu(var_97); - void* var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1); - void* var_100 = tensorAdd(var_99, conv2d_24_b); - void* var_101 = tensorBatchNorm(var_100, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_102 = tensorAdd(var_101, var_90); - void* var_103 = tensorRelu(var_102); - void* var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1); - void* var_105 = tensorAdd(var_104, conv2d_25_b); - void* var_106 = tensorBatchNorm(var_105, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_107 = tensorRelu(var_106); - void* var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1); - void* var_109 = tensorAdd(var_108, conv2d_26_b); - void* var_110 = tensorBatchNorm(var_109, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); - void* var_111 = tensorRelu(var_110); - void* var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1); - void* var_113 = tensorAdd(var_112, conv2d_27_b); - void* var_114 = tensorBatchNorm(var_113, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1); - void* var_116 = tensorAdd(var_115, conv2d_28_b); - void* var_117 = tensorBatchNorm(var_116, batch_normalization_28_gamma, batch_normalization_28_beta, batch_normalization_28_mean, batch_normalization_28_variance, 0.001); - void* var_118 = tensorAdd(var_114, var_117); - void* var_119 = tensorRelu(var_118); - void* var_120 = 
tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1); - void* var_121 = tensorAdd(var_120, conv2d_29_b); - void* var_122 = tensorBatchNorm(var_121, batch_normalization_29_gamma, batch_normalization_29_beta, batch_normalization_29_mean, batch_normalization_29_variance, 0.001); - void* var_123 = tensorRelu(var_122); - void* var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1); - void* var_125 = tensorAdd(var_124, conv2d_30_b); - void* var_126 = tensorBatchNorm(var_125, batch_normalization_30_gamma, batch_normalization_30_beta, batch_normalization_30_mean, batch_normalization_30_variance, 0.001); - void* var_127 = tensorRelu(var_126); - void* var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1); - void* var_129 = tensorAdd(var_128, conv2d_31_b); - void* var_130 = tensorBatchNorm(var_129, batch_normalization_31_gamma, batch_normalization_31_beta, batch_normalization_31_mean, batch_normalization_31_variance, 0.001); - void* var_131 = tensorAdd(var_130, var_119); - void* var_132 = tensorRelu(var_131); - void* var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1); - void* var_134 = tensorAdd(var_133, conv2d_32_b); - void* var_135 = tensorBatchNorm(var_134, batch_normalization_32_gamma, batch_normalization_32_beta, batch_normalization_32_mean, batch_normalization_32_variance, 0.001); - void* var_136 = tensorRelu(var_135); - void* var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1); - void* var_138 = tensorAdd(var_137, conv2d_33_b); - void* var_139 = tensorBatchNorm(var_138, batch_normalization_33_gamma, batch_normalization_33_beta, batch_normalization_33_mean, batch_normalization_33_variance, 0.001); - void* var_140 = tensorRelu(var_139); - void* var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1); - void* var_142 = tensorAdd(var_141, conv2d_34_b); - void* var_143 = tensorBatchNorm(var_142, batch_normalization_34_gamma, batch_normalization_34_beta, batch_normalization_34_mean, 
batch_normalization_34_variance, 0.001); - void* var_144 = tensorAdd(var_143, var_132); - void* var_145 = tensorRelu(var_144); - void* var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1); - void* var_147 = tensorAdd(var_146, conv2d_35_b); - void* var_148 = tensorBatchNorm(var_147, batch_normalization_35_gamma, batch_normalization_35_beta, batch_normalization_35_mean, batch_normalization_35_variance, 0.001); - void* var_149 = tensorRelu(var_148); - void* var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1); - void* var_151 = tensorAdd(var_150, conv2d_36_b); - void* var_152 = tensorBatchNorm(var_151, batch_normalization_36_gamma, batch_normalization_36_beta, batch_normalization_36_mean, batch_normalization_36_variance, 0.001); - void* var_153 = tensorRelu(var_152); - void* var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1); - void* var_155 = tensorAdd(var_154, conv2d_37_b); - void* var_156 = tensorBatchNorm(var_155, batch_normalization_37_gamma, batch_normalization_37_beta, batch_normalization_37_mean, batch_normalization_37_variance, 0.001); - void* var_157 = tensorAdd(var_156, var_145); - void* var_158 = tensorRelu(var_157); - void* var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1); - void* var_160 = tensorAdd(var_159, conv2d_38_b); - void* var_161 = tensorBatchNorm(var_160, batch_normalization_38_gamma, batch_normalization_38_beta, batch_normalization_38_mean, batch_normalization_38_variance, 0.001); - void* var_162 = tensorRelu(var_161); - void* var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1); - void* var_164 = tensorAdd(var_163, conv2d_39_b); - void* var_165 = tensorBatchNorm(var_164, batch_normalization_39_gamma, batch_normalization_39_beta, batch_normalization_39_mean, batch_normalization_39_variance, 0.001); - void* var_166 = tensorRelu(var_165); - void* var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1); - void* var_168 = tensorAdd(var_167, conv2d_40_b); - 
void* var_169 = tensorBatchNorm(var_168, batch_normalization_40_gamma, batch_normalization_40_beta, batch_normalization_40_mean, batch_normalization_40_variance, 0.001); - void* var_170 = tensorAdd(var_169, var_158); - void* var_171 = tensorRelu(var_170); - void* var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1); - void* var_173 = tensorAdd(var_172, conv2d_41_b); - void* var_174 = tensorBatchNorm(var_173, batch_normalization_41_gamma, batch_normalization_41_beta, batch_normalization_41_mean, batch_normalization_41_variance, 0.001); - void* var_175 = tensorRelu(var_174); - void* var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1); - void* var_177 = tensorAdd(var_176, conv2d_42_b); - void* var_178 = tensorBatchNorm(var_177, batch_normalization_42_gamma, batch_normalization_42_beta, batch_normalization_42_mean, batch_normalization_42_variance, 0.001); - void* var_179 = tensorRelu(var_178); - void* var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1); - void* var_181 = tensorAdd(var_180, conv2d_43_b); - void* var_182 = tensorBatchNorm(var_181, batch_normalization_43_gamma, batch_normalization_43_beta, batch_normalization_43_mean, batch_normalization_43_variance, 0.001); - void* var_183 = tensorAdd(var_182, var_171); - void* var_184 = tensorRelu(var_183); - void* var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1); - void* var_186 = tensorAdd(var_185, conv2d_44_b); - void* var_187 = tensorBatchNorm(var_186, batch_normalization_44_gamma, batch_normalization_44_beta, batch_normalization_44_mean, batch_normalization_44_variance, 0.001); - void* var_188 = tensorRelu(var_187); - void* var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1); - void* var_190 = tensorAdd(var_189, conv2d_45_b); - void* var_191 = tensorBatchNorm(var_190, batch_normalization_45_gamma, batch_normalization_45_beta, batch_normalization_45_mean, batch_normalization_45_variance, 0.001); - void* var_192 = tensorRelu(var_191); - 
void* var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1); - void* var_194 = tensorAdd(var_193, conv2d_46_b); - void* var_195 = tensorBatchNorm(var_194, batch_normalization_46_gamma, batch_normalization_46_beta, batch_normalization_46_mean, batch_normalization_46_variance, 0.001); - void* var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1); - void* var_197 = tensorAdd(var_196, conv2d_47_b); - void* var_198 = tensorBatchNorm(var_197, batch_normalization_47_gamma, batch_normalization_47_beta, batch_normalization_47_mean, batch_normalization_47_variance, 0.001); - void* var_199 = tensorAdd(var_195, var_198); - void* var_200 = tensorRelu(var_199); - void* var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1); - void* var_202 = tensorAdd(var_201, conv2d_48_b); - void* var_203 = tensorBatchNorm(var_202, batch_normalization_48_gamma, batch_normalization_48_beta, batch_normalization_48_mean, batch_normalization_48_variance, 0.001); - void* var_204 = tensorRelu(var_203); - void* var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1); - void* var_206 = tensorAdd(var_205, conv2d_49_b); - void* var_207 = tensorBatchNorm(var_206, batch_normalization_49_gamma, batch_normalization_49_beta, batch_normalization_49_mean, batch_normalization_49_variance, 0.001); - void* var_208 = tensorRelu(var_207); - void* var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1); - void* var_210 = tensorAdd(var_209, conv2d_50_b); - void* var_211 = tensorBatchNorm(var_210, batch_normalization_50_gamma, batch_normalization_50_beta, batch_normalization_50_mean, batch_normalization_50_variance, 0.001); - void* var_212 = tensorAdd(var_211, var_200); - void* var_213 = tensorRelu(var_212); - void* var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1); - void* var_215 = tensorAdd(var_214, conv2d_51_b); - void* var_216 = tensorBatchNorm(var_215, batch_normalization_51_gamma, batch_normalization_51_beta, 
batch_normalization_51_mean, batch_normalization_51_variance, 0.001); - void* var_217 = tensorRelu(var_216); - void* var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1); - void* var_219 = tensorAdd(var_218, conv2d_52_b); - void* var_220 = tensorBatchNorm(var_219, batch_normalization_52_gamma, batch_normalization_52_beta, batch_normalization_52_mean, batch_normalization_52_variance, 0.001); - void* var_221 = tensorRelu(var_220); - void* var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1); - void* var_223 = tensorAdd(var_222, conv2d_53_b); - void* var_224 = tensorBatchNorm(var_223, batch_normalization_53_gamma, batch_normalization_53_beta, batch_normalization_53_mean, batch_normalization_53_variance, 0.001); - void* var_225 = tensorAdd(var_224, var_213); - void* var_226 = tensorRelu(var_225); - void* var_227 = tensorPooling(var_226,1,7,7,0,0,7,7); - void* var_229 = tensorGemmGPU(var_227, dense_1_w); - void* var_230 = tensorAdd(var_229, dense_1_b); - void* var_231 = tensorSoftmax(var_230); - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy3(labels, var_231); - final_accuracy += accuracy; - freeBatchMemory(); - + float accuracy = computeAccuracy3(labels, var_231); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - - llvm_hpvm_cleanupTensorRt(); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc index a6dc7cbc11cf77357a749bff117489fc4b292941..034ddb0cf8d6b286544c669375a46746ad23d4d2 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc @@ -1,82 +1,103 @@ 
-#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = 
readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + 
std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + 
readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 
3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string 
dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -85,77 +106,76 @@ int main(){ int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; - for(int i = 0; i < batch_count; i++){ + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorRelu(var_1); - void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorAdd(var_4, conv2d_2_b); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorAdd(var_8, conv2d_3_b); - void* var_10 = tensorRelu(var_9); - void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorAdd(var_12, conv2d_4_b); - void* var_14 = tensorRelu(var_13); - void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorRelu(var_17); - void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorAdd(var_20, conv2d_6_b); - void* var_22 = tensorRelu(var_21); - void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorAdd(var_24, conv2d_7_b); - void* var_26 = 
tensorRelu(var_25); - void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorAdd(var_28, conv2d_8_b); - void* var_30 = tensorRelu(var_29); - void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorAdd(var_32, conv2d_9_b); - void* var_34 = tensorRelu(var_33); - void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorAdd(var_36, conv2d_10_b); - void* var_38 = tensorRelu(var_37); - void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorAdd(var_40, conv2d_11_b); - void* var_42 = tensorRelu(var_41); - void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorAdd(var_44, conv2d_12_b); - void* var_46 = tensorRelu(var_45); - void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorAdd(var_48, conv2d_13_b); - void* var_50 = tensorRelu(var_49); - void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorGemmGPU(var_51, dense_1_w); - void* var_55 = tensorAdd(var_54, dense_1_b); - void* var_56 = tensorRelu(var_55); - void* var_58 = tensorGemmGPU(var_56, dense_2_w); - void* var_59 = tensorAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_60); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorAdd(var_0, conv2d_1_b); + void *var_2 = tensorRelu(var_1); + void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorAdd(var_4, conv2d_2_b); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 
2); + void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorAdd(var_8, conv2d_3_b); + void *var_10 = tensorRelu(var_9); + void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorAdd(var_12, conv2d_4_b); + void *var_14 = tensorRelu(var_13); + void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorAdd(var_16, conv2d_5_b); + void *var_18 = tensorRelu(var_17); + void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorAdd(var_20, conv2d_6_b); + void *var_22 = tensorRelu(var_21); + void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorAdd(var_24, conv2d_7_b); + void *var_26 = tensorRelu(var_25); + void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorAdd(var_28, conv2d_8_b); + void *var_30 = tensorRelu(var_29); + void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorAdd(var_32, conv2d_9_b); + void *var_34 = tensorRelu(var_33); + void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorAdd(var_36, conv2d_10_b); + void *var_38 = tensorRelu(var_37); + void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorAdd(var_40, conv2d_11_b); + void *var_42 = tensorRelu(var_41); + void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorAdd(var_44, conv2d_12_b); + void *var_46 = tensorRelu(var_45); + void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorAdd(var_48, conv2d_13_b); + void *var_50 = tensorRelu(var_49); + void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2); 
+ void *var_54 = tensorGemmGPU(var_51, dense_1_w); + void *var_55 = tensorAdd(var_54, dense_1_b); + void *var_56 = tensorRelu(var_55); + void *var_58 = tensorGemmGPU(var_56, dense_2_w); + void *var_59 = tensorAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60); final_accuracy += accuracy; - + freeBatchMemory(); } final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc index 2539f8d8722909724a9dc2890e82f4f98853f5cd..94ca77329bc2f31d251590df3916d3cb10673fda 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc @@ -1,161 +1,181 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = 
readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + 
std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 
0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); - - - startMemTracking(); - - int test_input_size = 5000; - int batch_size = 5000; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorRelu(var_1); - void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorAdd(var_4, conv2d_2_b); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorAdd(var_8, conv2d_3_b); - void* var_10 = tensorRelu(var_9); - void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorAdd(var_12, conv2d_4_b); - void* var_14 = tensorRelu(var_13); - void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorRelu(var_17); - void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorAdd(var_20, conv2d_6_b); - void* var_22 = tensorRelu(var_21); - void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorAdd(var_24, conv2d_7_b); - void* var_26 = tensorRelu(var_25); - void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - 
void* var_29 = tensorAdd(var_28, conv2d_8_b); - void* var_30 = tensorRelu(var_29); - void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorAdd(var_32, conv2d_9_b); - void* var_34 = tensorRelu(var_33); - void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorAdd(var_36, conv2d_10_b); - void* var_38 = tensorRelu(var_37); - void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorAdd(var_40, conv2d_11_b); - void* var_42 = tensorRelu(var_41); - void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorAdd(var_44, conv2d_12_b); - void* var_46 = tensorRelu(var_45); - void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorAdd(var_48, conv2d_13_b); - void* var_50 = tensorRelu(var_49); - void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorGemmGPU(var_51, dense_1_w); - void* var_55 = tensorAdd(var_54, dense_1_b); - void* var_56 = tensorRelu(var_55); - void* var_58 = tensorGemmGPU(var_56, dense_2_w); - void* var_59 = tensorAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); - final_accuracy += accuracy; - freeBatchMemory(); - + +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string 
conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + 
std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + 
std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 5000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorAdd(var_0, conv2d_1_b); + void *var_2 = tensorRelu(var_1); + void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorAdd(var_4, conv2d_2_b); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorAdd(var_8, conv2d_3_b); + void *var_10 = tensorRelu(var_9); + void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorAdd(var_12, conv2d_4_b); + void *var_14 = tensorRelu(var_13); + void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorAdd(var_16, conv2d_5_b); + void *var_18 = tensorRelu(var_17); + 
void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorAdd(var_20, conv2d_6_b); + void *var_22 = tensorRelu(var_21); + void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorAdd(var_24, conv2d_7_b); + void *var_26 = tensorRelu(var_25); + void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorAdd(var_28, conv2d_8_b); + void *var_30 = tensorRelu(var_29); + void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorAdd(var_32, conv2d_9_b); + void *var_34 = tensorRelu(var_33); + void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorAdd(var_36, conv2d_10_b); + void *var_38 = tensorRelu(var_37); + void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorAdd(var_40, conv2d_11_b); + void *var_42 = tensorRelu(var_41); + void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorAdd(var_44, conv2d_12_b); + void *var_46 = tensorRelu(var_45); + void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorAdd(var_48, conv2d_13_b); + void *var_50 = tensorRelu(var_49); + void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2); + void *var_54 = tensorGemmGPU(var_51, dense_1_w); + void *var_55 = tensorAdd(var_54, dense_1_b); + void *var_56 = tensorRelu(var_55); + void *var_58 = tensorGemmGPU(var_56, dense_2_w); + void *var_59 = tensorAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - 
dumpFinalAccuracy(final_accuracy); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - llvm_hpvm_cleanupTensorRt(); + llvm_hpvm_cleanupTensorRt(); - return 0; + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc index 1d78065c5725deae9c14fc97a699fc14f55ad8ef..c5da3faf7860df24e25293acaacc1c50bcdceb72 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc @@ -1,173 +1,193 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "tensor_runtime.h" -#include "utils.h" - - - -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = std::string("/home/nvidia/sd_card/vgg16_imagenet_new/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = 
readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + 
std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,25088,4096); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); - std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); - void* dense_3_w = readTrainedWeights(dense_3_w_path.c_str(), 
0,1,1,4096,1000); - std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); - void* dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); - - - - startMemTracking(); - - int test_input_size = 500; - int batch_size = 100; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); - - void* var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); - void* var_2 = tensorAdd(var_1, conv2d_1_b); - void* var_3 = tensorRelu(var_2); - void* var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1); - void* var_5 = tensorAdd(var_4, conv2d_2_b); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1); - void* var_9 = tensorAdd(var_8, conv2d_3_b); - void* var_10 = tensorRelu(var_9); - void* var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1); - void* var_12 = tensorAdd(var_11, conv2d_4_b); - void* var_13 = tensorRelu(var_12); - void* var_14 = tensorPooling(var_13,0,2,2,0,0,2,2); - void* var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1); - void* var_16 = tensorAdd(var_15, conv2d_5_b); - void* var_17 = tensorRelu(var_16); - void* var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1); - void* var_19 = tensorAdd(var_18, conv2d_6_b); - void* var_20 = tensorRelu(var_19); - void* var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1); - void* var_22 = tensorAdd(var_21, conv2d_7_b); - void* var_23 = tensorRelu(var_22); - void* var_24 = tensorPooling(var_23,0,2,2,0,0,2,2); - void* var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1); - void* var_26 = tensorAdd(var_25, conv2d_8_b); - void* var_27 = tensorRelu(var_26); - void* var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 
1, 1, 1); - void* var_29 = tensorAdd(var_28, conv2d_9_b); - void* var_30 = tensorRelu(var_29); - void* var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1); - void* var_32 = tensorAdd(var_31, conv2d_10_b); - void* var_33 = tensorRelu(var_32); - void* var_34 = tensorPooling(var_33,0,2,2,0,0,2,2); - void* var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1); - void* var_36 = tensorAdd(var_35, conv2d_11_b); - void* var_37 = tensorRelu(var_36); - void* var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1); - void* var_39 = tensorAdd(var_38, conv2d_12_b); - void* var_40 = tensorRelu(var_39); - void* var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1); - void* var_42 = tensorAdd(var_41, conv2d_13_b); - void* var_43 = tensorRelu(var_42); - void* var_44 = tensorPooling(var_43,0,2,2,0,0,2,2); - void* var_46 = tensorGemmGPU(var_44, dense_1_w); - void* var_47 = tensorAdd(var_46, dense_1_b); - void* var_48 = tensorRelu(var_47); - void* var_49 = tensorGemmGPU(var_48, dense_2_w); - void* var_50 = tensorAdd(var_49, dense_2_b); - void* var_51 = tensorRelu(var_50); - void* var_52 = tensorGemmGPU(var_51, dense_3_w); - void* var_53 = tensorAdd(var_52, dense_3_b); - void* var_54 = tensorSoftmax(var_53); - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy3(labels, var_54); - final_accuracy += accuracy; - freeBatchMemory(); - - } - - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); +#include "tensor_runtime.h" +#include "utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = + std::string("/home/nvidia/sd_card/vgg16_imagenet_new/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 
3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + 
std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + 
std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 25088, 4096); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = + readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = + readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); + void *dense_3_w = + readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000); + std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); + void *dense_3_b = + readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); + + startMemTracking(); + + int test_input_size = 500; + int batch_size = 100; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + + void *var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void *var_2 = tensorAdd(var_1, conv2d_1_b); + void *var_3 = tensorRelu(var_2); + void *var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1); + void *var_5 = tensorAdd(var_4, conv2d_2_b); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1); + void *var_9 = tensorAdd(var_8, conv2d_3_b); + void *var_10 = tensorRelu(var_9); + void *var_11 = tensorConvolution(var_10, 
conv2d_4_w, 1, 1, 1, 1, 1, 1); + void *var_12 = tensorAdd(var_11, conv2d_4_b); + void *var_13 = tensorRelu(var_12); + void *var_14 = tensorPooling(var_13, 0, 2, 2, 0, 0, 2, 2); + void *var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1); + void *var_16 = tensorAdd(var_15, conv2d_5_b); + void *var_17 = tensorRelu(var_16); + void *var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1); + void *var_19 = tensorAdd(var_18, conv2d_6_b); + void *var_20 = tensorRelu(var_19); + void *var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1); + void *var_22 = tensorAdd(var_21, conv2d_7_b); + void *var_23 = tensorRelu(var_22); + void *var_24 = tensorPooling(var_23, 0, 2, 2, 0, 0, 2, 2); + void *var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1); + void *var_26 = tensorAdd(var_25, conv2d_8_b); + void *var_27 = tensorRelu(var_26); + void *var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1); + void *var_29 = tensorAdd(var_28, conv2d_9_b); + void *var_30 = tensorRelu(var_29); + void *var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1); + void *var_32 = tensorAdd(var_31, conv2d_10_b); + void *var_33 = tensorRelu(var_32); + void *var_34 = tensorPooling(var_33, 0, 2, 2, 0, 0, 2, 2); + void *var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1); + void *var_36 = tensorAdd(var_35, conv2d_11_b); + void *var_37 = tensorRelu(var_36); + void *var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1); + void *var_39 = tensorAdd(var_38, conv2d_12_b); + void *var_40 = tensorRelu(var_39); + void *var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1); + void *var_42 = tensorAdd(var_41, conv2d_13_b); + void *var_43 = tensorRelu(var_42); + void *var_44 = tensorPooling(var_43, 0, 2, 2, 0, 0, 2, 2); + void *var_46 = tensorGemmGPU(var_44, dense_1_w); + void *var_47 = tensorAdd(var_46, dense_1_b); + void *var_48 = tensorRelu(var_47); + void *var_49 = tensorGemmGPU(var_48, dense_2_w); + void *var_50 
= tensorAdd(var_49, dense_2_b); + void *var_51 = tensorRelu(var_50); + void *var_52 = tensorGemmGPU(var_51, dense_3_w); + void *var_53 = tensorAdd(var_52, dense_3_b); + void *var_54 = tensorSoftmax(var_53); + + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy3(labels, var_54); + final_accuracy += accuracy; + freeBatchMemory(); + } - llvm_hpvm_cleanupTensorRt(); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc index 3b08755172973d63132bcd1c5b19d9e58ec38611..ea959342a4ac034deeba4191faa6620f2ec81037 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc @@ -10,10 +10,7 @@ using namespace std; - - - -class UnitTestResults{ +class UnitTestResults { private: unsigned int total_tests; @@ -22,48 +19,46 @@ private: std::vector<string> failed_test_ids; public: - - UnitTestResults(){ + UnitTestResults() { total_tests = 0; failed_tests = 0; passed_tests = 0; } - void evalTestResult(Tensor* res, const float* expected_result, size_t num_elems, - float epsilon, string test_name){ + void evalTestResult(Tensor *res, const float *expected_result, + size_t num_elems, float epsilon, string test_name) { - total_tests += 1; - if(res->num_elems != num_elems){ + total_tests += 1; + if (res->num_elems != num_elems) { failed_tests += 1; failed_test_ids.push_back(test_name); return; } - float* data_ptr = (float*) res->host_data; - for (unsigned int i = 0; i < res->num_elems; i++){ - //printf("**diff value = %f ", std::abs(data_ptr[i] - expected_result[i])); - if (std::abs(data_ptr[i] - expected_result[i]) > epsilon){ - failed_tests += 1; - failed_test_ids.push_back(test_name); + float *data_ptr = (float *)res->host_data; + for 
(unsigned int i = 0; i < res->num_elems; i++) { + // printf("**diff value = %f ", std::abs(data_ptr[i] - + // expected_result[i])); + if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) { + failed_tests += 1; + failed_test_ids.push_back(test_name); return; } } - - passed_tests += 1; + + passed_tests += 1; } - void compareTensors(Tensor* res, Tensor* gold_res, - float epsilon, string test_name){ + void compareTensors(Tensor *res, Tensor *gold_res, float epsilon, + string test_name) { - const float* expected_result = (float*) gold_res->host_data; + const float *expected_result = (float *)gold_res->host_data; unsigned int num_elems = res->num_elems; evalTestResult(res, expected_result, num_elems, epsilon, test_name); - } - - void printSummary(){ + void printSummary() { printf("\n\n\n ************* Printing Results Summary ********** \n\n"); printf("-- Total tests := %d \n", total_tests); @@ -71,147 +66,136 @@ public: printf("-- Tests Failed := %d \n", failed_tests); printf("\n\n Tests that failed : \n\n"); - for (int i = 0; i < failed_test_ids.size(); i++){ + for (int i = 0; i < failed_test_ids.size(); i++) { printf("*** Test = %s \n", failed_test_ids[i].c_str()); } } - }; - - - -void testTensorHgemm(UnitTestResults& unitTestResults){ +void testTensorHgemm(UnitTestResults &unitTestResults) { printf("***** TensorHgemm ***** \n\n"); - void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); - struct Tensor* lhs = (struct Tensor*) lhs_ptr; + void *lhs_ptr = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); + struct Tensor *lhs = (struct Tensor *)lhs_ptr; fillTensorWithOnes(lhs); - - float* data_arr = (float*) lhs->host_data; - for(int i = 0; i < lhs->num_elems; i++){ + + float *data_arr = (float *)lhs->host_data; + for (int i = 0; i < lhs->num_elems; i++) { data_arr[i] = (i / 4) + 1; } - - void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); + + void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, 
CUDNN_DATA_FLOAT, 1, 1, 4, 3); fillTensorWithOnes(rhs); - - void* output = tensorHalfGemm(lhs, rhs); - convertToFP32((struct Tensor*) output); + + void *output = tensorHalfGemm(lhs, rhs); + convertToFP32((struct Tensor *)output); printTensorValues(output); - const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20}; + const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, + 12, 16, 16, 16, 20, 20, 20}; - unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Hgemm"); + unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01, + "Hgemm"); } - - -void testTensorSgemm(UnitTestResults& unitTestResults){ +void testTensorSgemm(UnitTestResults &unitTestResults) { printf("***** TensorSgemm ***** \n\n"); - void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); - struct Tensor* lhs = (struct Tensor*) lhs_ptr; + void *lhs_ptr = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); + struct Tensor *lhs = (struct Tensor *)lhs_ptr; fillTensorWithOnes(lhs); - - float* data_arr = (float*) lhs->host_data; - for(int i = 0; i < lhs->num_elems; i++){ + + float *data_arr = (float *)lhs->host_data; + for (int i = 0; i < lhs->num_elems; i++) { data_arr[i] = (i / 4) + 1; } - void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); + void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); fillTensorWithOnes(rhs); - - void* output = tensorGemmGPU(lhs, rhs); - printTensorValues(output); - const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20}; + void *output = tensorGemmGPU(lhs, rhs); + printTensorValues(output); - unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Sgemm"); + const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, + 12, 16, 16, 16, 20, 20, 20}; + unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01, + "Sgemm"); } +void 
testTensorConcatAndSplit() { + int conv_mode = 1; // CROSS_CORRELATION mode + int compute_precision = 0; // floating point precision - - -void testTensorConcatAndSplit(){ - - int conv_mode = 1; // CROSS_CORRELATION mode - int compute_precision = 0; // floating point precision - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); fillWithOnesAndTwos(input); - void** splits = tensorSplit(input, 2, 1); + void **splits = tensorSplit(input, 2, 1); - void* conv2W = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); + void *conv2W = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); fillTensorWithOnes(conv2W); - - void** conv2fils = tensorSplit(conv2W, 2, 0); - void* conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, - 1, 1, conv_mode, compute_precision); + void **conv2fils = tensorSplit(conv2W, 2, 0); + + void *conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, 1, 1, + conv_mode, compute_precision); printTensorDims(conv2a_out); - void* conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, - 1, 1, conv_mode, compute_precision); + void *conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, 1, 1, + conv_mode, compute_precision); printTensorDims(conv2b_out); - - void* conv2_outs[2]; + + void *conv2_outs[2]; conv2_outs[0] = conv2a_out; conv2_outs[1] = conv2b_out; - void* conv2_concat_out = tensorConcat(conv2_outs, 2, 1); + void *conv2_concat_out = tensorConcat(conv2_outs, 2, 1); printTensorDims(conv2_concat_out); printTensorValues(conv2_concat_out); - } +void testLRN() { - - - - -void testLRN(){ - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20); + void *input = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20); fillTensorWithOnes(input); unsigned LRN_window = 5; double LRN_alpha = 2e-05; printf("LRN_alpha = %f \n", LRN_alpha); - + double LRN_beta 
= 0.75; double LRN_k = 1.0; // TEST-point - Compare TF vs CUDNN - void* lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k); + void *lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k); printTensorDims(lrn1out); dumpWeightsToFile("tensors_out/lrn1_test.out", lrn1out); - void* input2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7); + void *input2 = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7); fillTensorWithOnes(input2); LRN_window = 5; LRN_alpha = 0.5 * LRN_window; - + LRN_beta = 0.75; LRN_k = 1.0; - void* lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k); + void *lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k); printTensorDims(lrn2out); - dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); + dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); } - - - -void testTensorAdd(){ +void testTensorAdd() { // Tensor add with equal dimensions - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); - void* bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); + void *bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); fillTensorWithOnes(x); fillTensorWithOnes(bias); @@ -222,8 +206,8 @@ void testTensorAdd(){ printTensorValues(x); // Tensor addd with matching channel dimension - void* x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2); - void* bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1); + void *x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2); + void *bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1); fillTensorWithOnes(x2); fillTensorWithOnes(bias2); @@ -231,209 +215,181 @@ void testTensorAdd(){ printTensorValues(x2); } +void testTensorConv() { -void testTensorError(){ - - // Tensor add with equal dimensions - void* x = 
create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 128); - fillTensorWithOnes(x); - - Tensor* x_tensor = (Tensor*) x; - float* data_arr = (float*) x_tensor->host_data; - for(int i = 0; i < x_tensor->num_elems; i++){ - data_arr[i] = 0.2; - } - - tensorAddError(x, 3); - printTensorValues(x); -} - - -void testTensorConv(){ - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); fillTensorWithOnes(input); fillTensorWithOnes(filter); - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION + int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int compute_precision = 0; // floating point precision for conv - - void* conv_out = tensorConvolution(input, filter, 0, 0, - 1, 1, conv_mode, compute_precision); - printTensorValues(conv_out); + void *conv_out = tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, + compute_precision); + printTensorValues(conv_out); } +void testTensorHalfConv() { -void testTensorHalfConv(){ - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); fillTensorWithOnes(input); fillTensorWithOnes(filter); - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION + int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int compute_precision = 0; // floating point precision for conv - - void* conv_out = tensorHalfConvolution(input, filter, 0, 0, - 1, 1, conv_mode, compute_precision); - printTensorValues(conv_out); + void *conv_out = tensorHalfConvolution(input, filter, 0, 0, 1, 1, conv_mode, + compute_precision); + 
printTensorValues(conv_out); } +void testTensorGroupConv() { + // NOTE: The input channel count value (param2 to Tensor and Filter) must be + // the same + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); - -void testTensorGroupConv(){ - - // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); - - // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, val) + // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, + // val) fillTensorWithOnes(input); fillTensorWithOnes(filter); int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int conv_groups = 2; - - void* conv_out = tensorConvolution(input, filter, - 0, 0, - 1, 1, - conv_mode, conv_groups); + + void *conv_out = + tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups); printTensorValues(conv_out); - } +void testTensorHalfGroupConv() { -void testTensorHalfGroupConv(){ - - // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); + // NOTE: The input channel count value (param2 to Tensor and Filter) must be + // the same + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); fillTensorWithOnes(input); fillTensorWithOnes(filter); int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int conv_groups = 2; - - void* conv_out = tensorConvolution(input, filter, - 0, 0, - 1, 1, - conv_mode, conv_groups); - - convertToFP32((struct Tensor*) conv_out); + + void *conv_out = + 
tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups); + + convertToFP32((struct Tensor *)conv_out); printTensorValues(conv_out); } +void testTensorPooling() { -void testTensorPooling(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); fillTensorWithOnes(x); - float* data_arr = (float*) ((Tensor*) x)->host_data; - for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){ + float *data_arr = (float *)((Tensor *)x)->host_data; + for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) { data_arr[i] = i; } - void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); + void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); printTensorValues(output); } +void testTensorHalfPooling() { -void testTensorHalfPooling(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); fillTensorWithOnes(x); - float* data_arr = (float*) ((Tensor*) x)->host_data; - for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){ + float *data_arr = (float *)((Tensor *)x)->host_data; + for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) { data_arr[i] = i; } - void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); - convertToFP32((struct Tensor*) output); + void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); + convertToFP32((struct Tensor *)output); printTensorValues(output); } +void testTensorBatchNorm() { -void testTensorBatchNorm(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); fillTensorWithVal(x, 3); - void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(gamma, 1); - void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *beta 
= create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(beta, 0); - void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(mean, 1); - void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *variance = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(variance, 1); double epsilon = 1; // NOTE: result = X - mean / sqrt(epsilon + variance) - void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); + void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); - printTensorValues(output); + printTensorValues(output); } +void testTensorHalfBatchNorm() { -void testTensorHalfBatchNorm(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); fillTensorWithVal(x, 3); - void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(gamma, 1); - void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(beta, 0); - void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(mean, 1); - void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *variance = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(variance, 1); - double epsilon = 1; // NOTE: result = X - mean / sqrt(epsilon + variance) - void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); - convertToFP32((struct Tensor*) output); + void *output = tensorBatchNorm(x, 
gamma, beta, mean, variance, 1); + convertToFP32((struct Tensor *)output); - printTensorValues(output); + printTensorValues(output); } +void testTensorRelu() { -void testTensorRelu(){ - - // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match + // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match printf("***** TensorRelu ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); fillTensorWithNegOnes(input); - void* output = tensorRelu(input); + void *output = tensorRelu(input); printTensorValues(output); } - -void testTensorSoftmax(){ +void testTensorSoftmax() { printf("***** TensorSoftmax ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + float *host_ptr = (float *)((struct Tensor *)input)->host_data; host_ptr[0] = 0.1; host_ptr[1] = 0.2; host_ptr[2] = 0.3; @@ -443,39 +399,36 @@ void testTensorSoftmax(){ host_ptr[6] = 0.7; host_ptr[7] = 2.5; - void* output = tensorSoftmax(input); + void *output = tensorSoftmax(input); printTensorValues(output); } +void testSoftmaxOutput(void *output_ptr) { -void testSoftmaxOutput(void* output_ptr){ + struct Tensor *output = (struct Tensor *)output_ptr; - struct Tensor* output = (struct Tensor*) output_ptr; - size_t batch_dim = output->dims.dim_sizes[0]; size_t channels = output->dims.dim_sizes[1]; - float* data = (float*) output->host_data; - for(int i = 0; i < batch_dim; i++){ + float *data = (float *)output->host_data; + for (int i = 0; i < batch_dim; i++) { float sum = 0.0; - for(int j = 0; j < channels; j++){ + for (int j = 0; j < channels; j++) { sum += data[i * channels + j]; } printf("output_sum = %f \n", sum); } - } - - -void testPromiseError(){ +void testPromiseError() { printf("***** 
TensorQuantize ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); + float *host_ptr = (float *)((struct Tensor *)input)->host_data; - void* gold_tensor = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float* gold_ptr = (float*) ((struct Tensor*) gold_tensor)->host_data; + void *gold_tensor = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); + float *gold_ptr = (float *)((struct Tensor *)gold_tensor)->host_data; gold_ptr[0] = -1; gold_ptr[1] = -2; @@ -490,21 +443,20 @@ void testPromiseError(){ gold_ptr[10] = 1; gold_ptr[11] = 1; - int num_elems = 12; int num_runs = 1000; - float* result_ptr = (float*) malloc(sizeof(float) * num_elems); + float *result_ptr = (float *)malloc(sizeof(float) * num_elems); - for (int swing = 1; swing <= 7; swing++){ + for (int swing = 1; swing <= 7; swing++) { - for (int j = 0; j < num_elems; j++){ - result_ptr[j] = 0; + for (int j = 0; j < num_elems; j++) { + result_ptr[j] = 0; } float error_sum = 0.0; - - for (int i = 0; i < 1000; i++){ + + for (int i = 0; i < 1000; i++) { host_ptr[0] = -1; host_ptr[1] = -2; host_ptr[2] = -3; @@ -517,43 +469,39 @@ void testPromiseError(){ host_ptr[9] = 2; host_ptr[10] = 1; host_ptr[11] = 1; - - void* error_out = addPromiseError(input, swing); - //printTensorValues(error_out); + + void *error_out = addPromiseError(input, swing); + // printTensorValues(error_out); // Move result data back to the host hpvm_request_tensor(input, 0); - float* error_out_ptr = (float*) ((struct Tensor*) input)->host_data; + float *error_out_ptr = (float *)((struct Tensor *)input)->host_data; - for (int j = 0; j < num_elems; j++){ - result_ptr[j] += error_out_ptr[j]; - error_sum += (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); + for (int j = 0; j < num_elems; j++) { + result_ptr[j] 
+= error_out_ptr[j]; + error_sum += + (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); } } - printf ("\n\n - Swing %d results : \n", swing); - for (int j = 0; j < num_elems; j++){ + printf("\n\n - Swing %d results : \n", swing); + for (int j = 0; j < num_elems; j++) { result_ptr[j] = result_ptr[j] / num_runs; printf(" %f ", result_ptr[j]); } printf("mean_error = %f \n", error_sum / num_runs); - + printf(" \n"); } - - } - - - -void testQuantization(){ +void testQuantization() { printf("***** TensorQuantize ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + float *host_ptr = (float *)((struct Tensor *)input)->host_data; host_ptr[0] = -0.1; host_ptr[1] = -25; host_ptr[2] = 0.2; @@ -566,13 +514,12 @@ void testQuantization(){ host_ptr[9] = 7.2; host_ptr[10] = 2.5; host_ptr[11] = 3; - - void* quantize_result1 = quantizeTensorPromise(input, -4, 6); + void *quantize_result1 = quantizeTensorPromise(input, -4, 6); - printf ("\n ** quantizing with range min = %d max = %d \n", -4, 6); + printf("\n ** quantizing with range min = %d max = %d \n", -4, 6); printTensorValues(quantize_result1); - + host_ptr[0] = -0.1; host_ptr[1] = -25; host_ptr[2] = 0.2; @@ -586,9 +533,9 @@ void testQuantization(){ host_ptr[10] = 2.5; host_ptr[11] = 3; - void* quantize_result2 = quantizeTensorPromise(input, -2, 2); + void *quantize_result2 = quantizeTensorPromise(input, -2, 2); - printf ("\n ** quantizing with range min = %d max = %d \n", -2, 2); + printf("\n ** quantizing with range min = %d max = %d \n", -2, 2); printTensorValues(quantize_result2); host_ptr[0] = -0.1; @@ -604,13 +551,12 @@ void testQuantization(){ host_ptr[10] = 2.5; host_ptr[11] = 3; + void *quantize_result3 = quantizeTensorPromise(input, -25, 8); - void* quantize_result3 = quantizeTensorPromise(input, -25, 8); - - 
printf ("\n ** quantizing with range min = %d max = %d \n", -25, 8); + printf("\n ** quantizing with range min = %d max = %d \n", -25, 8); printTensorValues(quantize_result3); - printf ("\n ** quantizing with range min = %d max = %d \n", -10, 10); + printf("\n ** quantizing with range min = %d max = %d \n", -10, 10); host_ptr[0] = -0.1; host_ptr[1] = -25; @@ -625,30 +571,26 @@ void testQuantization(){ host_ptr[10] = 2.5; host_ptr[11] = 3; - - void* quantize_result4 = quantizeTensorPromise(input, -10, 10); + void *quantize_result4 = quantizeTensorPromise(input, -10, 10); printTensorValues(quantize_result4); - - void* quantize_result5 = quantizeTensorPromise(input, -10, 10); + void *quantize_result5 = quantizeTensorPromise(input, -10, 10); printTensorValues(quantize_result5); - - //void* error_out = addPromiseError(quantize_result, 1); - //printTensorValues(error_out); + // void* error_out = addPromiseError(quantize_result, 1); + // printTensorValues(error_out); } - - - -void testSampleFilter(){ +void testSampleFilter() { printf("***** Tensor Sample Filter ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); - //fillTensorWithVal(input, 3); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + // fillTensorWithVal(input, 3); fillWithOnesAndTwos(input); - - Tensor* input2 = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, 2, 32, 32); + + Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, + 3, 2, 32, 32); fillTensorWithVal(input2, 1); /* float* host_ptr = (float*) ((struct Tensor*) input)->host_data; @@ -667,7 +609,7 @@ void testSampleFilter(){ /* printf("\n\n"); hpvm_request_tensor(input, DEVICE); - + sampleFilter(input, 2, 1); hpvm_request_tensor(input, HOST); @@ -675,116 +617,81 @@ void testSampleFilter(){ printTensorValues(input); */ - void* exact_res = tensorConvolution(input2, input, 0, 0, - 1, 1, 1, 1); + void *exact_res = 
tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1); printTensorValues(exact_res); - - void* res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0); - - //void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3); - - printTensorValues(res); - -} + void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0); + // void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3); + printTensorValues(res); +} - -void testPerforationCalls(void* input, void* filter, - int pad_h, int pad_w, - int stride_h, int stride_w, - int row, int col){ - +void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w, + int stride_h, int stride_w, int row, int col) { float interpolation_rate = 1.0; - for (int offset = 0; offset < 2; offset++){ - - printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d row = %d col = %d offset= %d \n\n", - pad_h, pad_w, stride_h, stride_w, row, col, offset); - - - void* res_exact = tensorConvolution(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1); - - printf ("tensorConvolution Result :"); - printTensorValues(res_exact); + for (int offset = 0; offset < 2; offset++) { + printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " + "row = %d col = %d offset= %d \n\n", + pad_h, pad_w, stride_h, stride_w, row, col, offset); - void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); + void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1); - printf ("\nBaseline Result :"); - printTensorValues(res_exact2); + printf("tensorConvolution Result :"); + printTensorValues(res_exact); + void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, 1, 1); - void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); - convertToFP32((struct Tensor*) res_exact3); + printf("\nBaseline Result :"); 
+ printTensorValues(res_exact2); - printf ("\nFP16_Baseline Result :"); - printTensorValues(res_exact3); + void *res_exact3 = tensorConvApproxHalf2( + input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); + convertToFP32((struct Tensor *)res_exact3); - - void* res_sim = tensorConvPerfCuda(input, filter, - pad_h, pad_w, - stride_h, stride_w, - 1, 1, - row, col, - offset); + printf("\nFP16_Baseline Result :"); + printTensorValues(res_exact3); - printf ("\nConvPerfCuda Result :"); - printTensorValues(res_sim); + void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, offset); - - void* res = tensorConvApprox(input, filter, - pad_h, pad_w, - stride_h, stride_w, - 1, 1, - row, col, - 1, offset); + printf("\nConvPerfCuda Result :"); + printTensorValues(res_sim); + void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, 1, offset); - printf ("\nConvApprox Result :"); - printTensorValues(res); + printf("\nConvApprox Result :"); + printTensorValues(res); + void *res_half = + tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, + 1, 1, row, col, 1, offset); - void* res_half = tensorConvApproxHalf2(input, filter, - pad_h, pad_w, - stride_h, stride_w, - 1, 1, - row, col, - 1, offset); + convertToFP32((struct Tensor *)res_half); - convertToFP32((struct Tensor*) res_half); - - printf ("\nConvApproxHalf2 Result :"); - printTensorValues(res_half); + printf("\nConvApproxHalf2 Result :"); + printTensorValues(res_half); + } - } - - - printf ("\n\n\n--- End of Test \n\n\n"); + printf("\n\n\n--- End of Test \n\n\n"); } - - - - /**** Tests Perforation for a set of different inputs */ -void testPerforation(UnitTestResults& unitTestResults){ +void testPerforation(UnitTestResults &unitTestResults) { - printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + Tensor *input 
= + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); fillTensorWithVal(input, 1); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); /* float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; @@ -803,43 +710,33 @@ void testPerforation(UnitTestResults& unitTestResults){ host_ptr[24] = 2; host_ptr[26] = 2; */ - testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2); testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1); - testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3); testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1); - testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4); testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1); - } - - - - - - - - -void testSampling(){ +void testSampling() { printf("***** Testing Sampling ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); fillTensorWithVal(input, 1); - //fillWithOnesAndTwos(input); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + // fillWithOnesAndTwos(input); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); fillTensorWithVal(filter, 1); - float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; + float *host_ptr = (float *)((struct Tensor *)filter)->host_data; host_ptr[0] = 2; host_ptr[2] = 2; host_ptr[4] = 2; @@ -854,144 +751,124 @@ void testSampling(){ host_ptr[22] = 2; host_ptr[24] = 2; host_ptr[26] = 2; - //printTensorValues(input); + // printTensorValues(input); + + void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - void* res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1); - printTensorValues(res); + void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1); - void* res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1); - printTensorValues(res2); + void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0); - void* res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0); - printTensorValues(res2_sim); - - void* res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0); - + void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0); + printTensorValues(res3); + void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - void* res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - printTensorValues(res4); + void *res4_half = + tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - void* res4_half = tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - - convertToFP32((struct Tensor*) res4_half); + convertToFP32((struct Tensor *)res4_half); printTensorValues(res4_half); - } - - - -void testSamplingCalls(void* input, void* filter, - int pad_h, int pad_w, - int stride_h, int stride_w, - int skip_every, UnitTestResults& unitTestResults){ - +void testSamplingCalls(void *input, void *filter, int pad_h, int pad_w, + int stride_h, int stride_w, int skip_every, + UnitTestResults &unitTestResults) { float interpolation_rate = 1.0; - for (int offset = 0; offset < 2; offset++){ - - - printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d skip_every = %d offset= %d interpolation_rate = %f \n\n", - pad_h, pad_w, stride_h, stride_w, skip_every, offset, interpolation_rate); - - - void* res_exact = tensorConvolution(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1); + for (int offset = 0; offset < 2; offset++) { - printf ("tensorConvolution Result :"); - printTensorValues(res_exact); + printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " + 
"skip_every = %d offset= %d interpolation_rate = %f \n\n", + pad_h, pad_w, stride_h, stride_w, skip_every, offset, + interpolation_rate); + void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1); - void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); + printf("tensorConvolution Result :"); + printTensorValues(res_exact); - printf ("\nBaseline Result :"); - printTensorValues(res_exact2); + void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, 1, 1); + printf("\nBaseline Result :"); + printTensorValues(res_exact2); - void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); - convertToFP32((struct Tensor*) res_exact3); + void *res_exact3 = tensorConvApproxHalf2( + input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); + convertToFP32((struct Tensor *)res_exact3); - printf ("\nFP16_Baseline Result :"); - printTensorValues(res_exact3); + printf("\nFP16_Baseline Result :"); + printTensorValues(res_exact3); - - void* res_sim = tensorConvSampSim2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, skip_every, offset, interpolation_rate); + void *res_sim = + tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1, + 1, skip_every, offset, interpolation_rate); - printf ("\nConvSampSim Result :"); - printTensorValues(res_sim); + printf("\nConvSampSim Result :"); + printTensorValues(res_sim); - - void* res = tensorConvApprox(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, skip_every, offset); + void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, skip_every, offset); + printf("\nConvApprox Result :"); + printTensorValues(res); - printf ("\nConvApprox Result :"); - printTensorValues(res); + void *res_half = + tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, + 1, 1, 1, 1, 
skip_every, offset); + convertToFP32((struct Tensor *)res_half); - void* res_half = tensorConvApproxHalf2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, skip_every, offset); + printf("\nConvApproxHalf2 Result :"); + printTensorValues(res_half); - convertToFP32((struct Tensor*) res_half); + std::string suffix = + std::string(" pad_h = ") + std::to_string(pad_h) + + std::string(" pad_w = ") + std::to_string(pad_w) + + std::string(" stride_h = ") + std::to_string(stride_h) + + std::string(" stride_w = ") + std::to_string(stride_w) + + std::string(" skip_every = ") + std::to_string(skip_every) + + std::string(" offset = ") + std::to_string(offset); - printf ("\nConvApproxHalf2 Result :"); - printTensorValues(res_half); + std::string test_name = std::string("SAMP_FP32 ") + suffix; - std::string suffix = std::string(" pad_h = ") + std::to_string(pad_h) - + std::string(" pad_w = ") + std::to_string(pad_w) - + std::string(" stride_h = ") + std::to_string(stride_h) - + std::string(" stride_w = ") + std::to_string(stride_w) - + std::string(" skip_every = ") + std::to_string(skip_every) - + std::string(" offset = ") + std::to_string(offset); + unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.01, + test_name); - std::string test_name = std::string("SAMP_FP32 ") + suffix; - - unitTestResults.compareTensors((Tensor*) res, (Tensor*) res_sim, 0.01, test_name); + std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; + unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.04, + fp16_test_name); + } - std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; - unitTestResults.compareTensors((Tensor*) res_half, (Tensor*) res_sim, 0.04, fp16_test_name); - } - - - printf ("\n\n\n --- End of Test \n\n\n"); + printf("\n\n\n --- End of Test \n\n\n"); } - - /**** Tests Sample for a sample 3 * 3 Filter */ -void testSampling_3_3(UnitTestResults& unitTestResults){ +void testSampling_3_3(UnitTestResults &unitTestResults) { 
- printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); fillTensorWithVal(input, 1); - //fillWithOnesAndTwos(input); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); + // fillWithOnesAndTwos(input); + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); - float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; + float *host_ptr = (float *)((struct Tensor *)filter)->host_data; host_ptr[0] = 2; host_ptr[2] = 2; host_ptr[4] = 2; @@ -1007,7 +884,6 @@ void testSampling_3_3(UnitTestResults& unitTestResults){ host_ptr[24] = 2; host_ptr[26] = 2; - // Tests with padding = 0 stride = 1 testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults); @@ -1028,27 +904,19 @@ void testSampling_3_3(UnitTestResults& unitTestResults){ testSamplingCalls(input, filter, 1, 1, 2, 2, 3, unitTestResults); testSamplingCalls(input, filter, 1, 1, 2, 2, 4, unitTestResults); - - } - - - - - - /**** Tests Sample for a sample 1 * 1 Filter */ -void testSampling_1_1(UnitTestResults& unitTestResults){ +void testSampling_1_1(UnitTestResults &unitTestResults) { - - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2); fillTensorWithVal(input, 2); - //fillWithOnesAndTwos(input); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1); + // fillWithOnesAndTwos(input); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1); fillTensorWithVal(filter, 2); - // Tests with padding = 0 stride = 1 testSamplingCalls(input, filter, 0, 0, 1, 1, 
2, unitTestResults); @@ -1057,25 +925,20 @@ void testSampling_1_1(UnitTestResults& unitTestResults){ testSamplingCalls(input, filter, 0, 0, 1, 1, 4, unitTestResults); - // Tests with padding = 1 stride = 1 testSamplingCalls(input, filter, 1, 1, 1, 1, 2, unitTestResults); testSamplingCalls(input, filter, 1, 1, 1, 1, 3, unitTestResults); testSamplingCalls(input, filter, 1, 1, 1, 1, 4, unitTestResults); - - } +void *testTensorArgMax() { + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1); - -void* testTensorArgMax(){ - - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1); - - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + float *host_ptr = (float *)((struct Tensor *)input)->host_data; // Input 0 host_ptr[0] = 1; @@ -1097,37 +960,34 @@ void* testTensorArgMax(){ host_ptr[10] = 2; host_ptr[11] = 8; - void* argmax_out = tensorArgMax(input); - - // Expect Output of call below to be: + void *argmax_out = tensorArgMax(input); + + // Expect Output of call below to be: // 1 2 2 0 printTensorValues(argmax_out); - return argmax_out; + return argmax_out; } +void *testTensorSelect(void *argmax_out) { - -void* testTensorSelect(void* argmax_out){ - - void* select_out = tensorSelect(argmax_out, 2); - printf ("***** tensorSelect output \n"); + void *select_out = tensorSelect(argmax_out, 2); + printf("***** tensorSelect output \n"); printTensorValues(select_out); - return select_out; - + return select_out; } +void testTensorContract(void *select_out) { -void testTensorContract(void* select_out){ - - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1); + float *host_ptr = (float *)((struct Tensor *)input)->host_data; // Input 0 host_ptr[0] = 1; - host_ptr[1] = 1; + host_ptr[1] = 1; host_ptr[2] 
= 1; host_ptr[3] = 1; @@ -1136,51 +996,38 @@ void testTensorContract(void* select_out){ host_ptr[5] = 2; host_ptr[6] = 2; host_ptr[7] = 2; - + // Input 2 host_ptr[8] = 3; host_ptr[9] = 3; - host_ptr[10] = 3; - host_ptr[11] = 3; + host_ptr[10] = 3; + host_ptr[11] = 3; // Input 3 - host_ptr[12] = 4; + host_ptr[12] = 4; host_ptr[13] = 4; host_ptr[14] = 4; host_ptr[15] = 4; - - void* contract_out = tensorContract(input, select_out); - printf ("***** tensorContract output \n"); + void *contract_out = tensorContract(input, select_out); + printf("***** tensorContract output \n"); printTensorValues(contract_out); - } +void testNewTensorOps() { - -void testNewTensorOps(){ - - void* argmax_out = testTensorArgMax(); - void* select_out = testTensorSelect(argmax_out); + void *argmax_out = testTensorArgMax(); + void *select_out = testTensorSelect(argmax_out); testTensorContract(select_out); - } - - - - - - - -int main(){ +int main() { llvm_hpvm_initTensorRt(0); - UnitTestResults unitTestResults; - + // Function call per unit test testTensorHgemm(unitTestResults); testTensorSgemm(unitTestResults); @@ -1199,31 +1046,26 @@ int main(){ testTensorHalfPooling(); */ - + testSampling_3_3(unitTestResults); testSampling_1_1(unitTestResults); testPerforation(unitTestResults); - - unitTestResults.printSummary(); - // testTensorError(); - // testQuantization(); + // testQuantization(); // testTensorGemm(); // testTensorGemmGPU(); - // testTensorGemmBias(); + // testTensorGemmBias(); // testTensorConv2(); // testTensorConv3(); // testLRN(); // testSampleFilter(); - // testNewTensorOps(); + // testNewTensorOps(); // testQuantization(); // testPromiseError(); - - + return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h index 98d6d63eadc44b171b54bd09a9096d072c4be10d..1ca90cf6f724b5e42f3b8c774b23c25f7d294437 100644 --- 
a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -14,10 +14,10 @@ __global__ void convToGemmApproxHalf( // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -58,7 +58,7 @@ convToGemmPerfRow(float *const __restrict__ output, // number const int h = tx % (H_eff * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) + const int w = tx % W_out; // output width index (col number) int past_start = (h % (x - 1) >= (x - 1 - start)); const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride - V_pad; // input height index (row number) @@ -135,7 +135,7 @@ convToGemmPerfCol(float *const __restrict__ output, // number const int h = tx % (H_out * W_eff) / W_eff; // output height index (row // number) - const int w = tx % W_eff; // output width index (col number) + const int w = tx % W_eff; // output width index (col number) int past_start = (w % (x - 1)) >= (x - 1 - start); const int inH = h * V_stride - V_pad; // input height index (row number) const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride - @@ -394,7 +394,7 @@ __global__ void convToGemmPerfRowHalf( // number const int h = tx % (H_eff * W_out) / W_out; // output height index (row // 
number) - const int w = tx % W_out; // output width index (col number) + const int w = tx % W_out; // output width index (col number) int past_start = (h % (x - 1) >= (x - 1 - start)); const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride - V_pad; // input height index (row number) @@ -469,7 +469,7 @@ __global__ void convToGemmPerfColHalf( // number const int h = tx % (H_out * W_eff) / W_eff; // output height index (row // number) - const int w = tx % W_eff; // output width index (col number) + const int w = tx % W_eff; // output width index (col number) int past_start = (w % (x - 1)) >= (x - 1 - start); const int inH = h * V_stride - V_pad; // input height index (row number) const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride - @@ -557,10 +557,10 @@ __global__ void convToGemmApproxHalfN( // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -832,10 +832,10 @@ convToGemmHalfInput(__half *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? 
+ const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -873,10 +873,10 @@ convToGemmHalfInput2(__half *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? const int filter_elem_num = c * KH * KW; for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) { int i = l / KW; @@ -1044,10 +1044,10 @@ convToGemmFullInput(float *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -1085,10 +1085,10 @@ convToGemmFullInput2(float *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? const int filter_elem_num = c * KH * KW; for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) { int i = l / KW; diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h deleted file mode 100644 index 2545f07b48ddabfa6793f1d9eb01911542f4198e..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h +++ /dev/null @@ -1,324 +0,0 @@ -#ifndef APPROXHPVM_IMG_RUNTIME_UTILS -#define APPROXHPVM_IMG_RUNTIME_UTILS - -#include "configuration.h" -#include "hpvm-rt-controller.h" - -#include "img_tensor_runtime.h" - -// Utilities header for ApproxHPVM image runtime API (wrapper runtime API) - -void *handleTensorFftApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, bool inverse) { - - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; - int param = approxTuples[0].second; - switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = 
tensorFft(input, inverse); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorFftHalf(input, inverse); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorFftHalf", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorFftHalf", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } - return NULL; -} - -void *handleTensorReduceApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, size_t axis, MathOp func) { - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; - int param = approxTuples[0].second; - switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorReduce(input, axis, func, 0.0f); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorReduceHalf(input, axis, func, 0.0f); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - 
RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorReduceHalf", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING: { - void *t_out; - float skip_ratio; - bool is_half; - switch (param) { - case 41: - skip_ratio = 0.5f; - is_half = false; - break; - case 42: - skip_ratio = 0.5f; - is_half = true; - break; - case 43: - skip_ratio = 0.4f; - is_half = false; - break; - case 44: - skip_ratio = 0.4f; - is_half = true; - break; - case 45: - skip_ratio = 0.25f; - is_half = false; - break; - case 46: - skip_ratio = 0.25f; - is_half = true; - break; - default: - DEBUG("Unsupported Option: Select default, skip_ratio = 0.0.\n"); - skip_ratio = 0.0f; - is_half = false; - break; - } - RC->resume_profiler(); - if (is_half) - t_out = tensorReduceHalf(input, axis, func, skip_ratio); - else - t_out = tensorReduce(input, axis, func, skip_ratio); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - if (is_half) { - RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorReduceHalf", - pinfo.second); - } else { - RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second); - } - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } - return NULL; -} - -void *handleTensorProjectiveTApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - void *input, void *transformation) { - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = 
approxTuples[0].first; - int param = approxTuples[0].second; - switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorProjectiveT(input, transformation); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } - return NULL; -} - -void *handleTensorMap1ApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - MathOp func, void *input) { - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; - int param = approxTuples[0].second; - switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorMap1(func, input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorMap1Half(func, input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorMap1Half", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorMap1Half", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - 
ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } - return NULL; -} - -void *handleTensorMap2ApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - MathOp func, void *input1, void *input2) { - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; - int param = approxTuples[0].second; - switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorMap2(func, input1, input2); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorMap2Half(func, input1, input2); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorMap2Half", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorMap2Half", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } - return NULL; -} - -void *handleTensorMap3ApproximationTuples( - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, - MathOp func, void *input1, void *input2, void *input3) { - if (approxTuples.size() == 1) { - enum GPUNodeConfiguration::APPROX approx = 
approxTuples[0].first; - int param = approxTuples[0].second; - switch (approx) { - case GPUNodeConfiguration::APPROX::FP32: { - void *t_out; - RC->resume_profiler(); - t_out = tensorMap3(func, input1, input2, input3); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16: { - void *t_out; - RC->resume_profiler(); - t_out = tensorMap3Half(func, input1, input2, input3); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorMap3Half", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorMap3Half", pinfo.second); - return t_out; - } - default: - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); - abort(); - } - return NULL; -} - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h index 330d97600e6cdcf44bb93dbf28625cca8051c3ec..c318a8fb6aba604282cf709d09b6a6ef1a771f0e 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h @@ -3,7 +3,6 @@ #ifndef APPROXHPVM_RUNTIME_UTILS #define APPROXHPVM_RUNTIME_UTILS - #include "tensor_runtime.h" #include "tensor_cpu_runtime.h" #include "configuration.h" @@ -17,30 +16,29 @@ //--- CPU Approximation handling ---// //----------------------------------------------------------------------------// -void* 
handleTensorAddApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* bias) { +void *handleTensorAddApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *bias) { -if (approxTuples.size() == 1) { + if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorAddCPU(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorAddCPU(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -53,32 +51,31 @@ if (approxTuples.size() == 1) { return NULL; } -void* handleTensorMulApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* lhs, void* rhs) { +void *handleTensorMulApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *lhs, void *rhs) { if (approxTuples.size() == 1) { enum 
CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorGemmCPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorGemmCPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here - } + } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); abort(); @@ -89,79 +86,72 @@ void* handleTensorMulApproximationTuples_CPU( return NULL; } -void* handleTensorConvApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w) { +void *handleTensorConvApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case 
CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxCPU(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, 1, 1); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); - return t_out; - } - case CPUNodeConfiguration::APPROX::PERFORATION : - { - PerfParams params = perfParamSet->getPerfParams(param); - INFO("perforation param = %i\n", param); - INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", - params.row, params.col, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxCPU(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, 1, params.skip_offset); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", pinfo.second); - return t_out; - } - case CPUNodeConfiguration::APPROX::INPUT_SAMPLING : - { - SampParams params = sampParamSet->getSampParams(param); - INFO("sampling param = %i\n", param); - INFO("params.skip_rate = %i, params.skip_offset = %i\n", - params.skip_rate, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxCPU(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, - params.skip_rate, params.skip_offset); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", pinfo.second); - return t_out; - } - 
default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = + tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); + return t_out; + } + case CPUNodeConfiguration::APPROX::PERFORATION: { + PerfParams params = perfParamSet->getPerfParams(param); + INFO("perforation param = %i\n", param); + INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", + params.row, params.col, params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.row, params.col, 1, params.skip_offset); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", + pinfo.second); + return t_out; + } + case CPUNodeConfiguration::APPROX::INPUT_SAMPLING: { + SampParams params = sampParamSet->getSampParams(param); + INFO("sampling param = %i\n", param); + INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, + params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", + 
pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -174,75 +164,73 @@ void* handleTensorConvApproximationTuples_CPU( return NULL; } -void* handleTensorGroupConvApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups) { +void *handleTensorGroupConvApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int conv_groups) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvCutlassCPU(input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void 
*t_out; + RC->resume_profiler(); + t_out = tensorConvCutlassCPU(input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorBatchNormApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon) { +void *handleTensorBatchNormApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, + void *variance_ptr, double epsilon) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); 
- // TODO additional approx methods implemented here + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); @@ -254,161 +242,154 @@ void* handleTensorBatchNormApproximationTuples_CPU( return NULL; } -void* handleTensorReluApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorReluApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorReluCPU(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case 
CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorReluCPU(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorClippedReluApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, float min, float max) { +void *handleTensorClippedReluApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, float min, float max) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorRelu2CPU(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + 
t_out = tensorRelu2CPU(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorTanhApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorTanhApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorTanhCPU(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorTanhCPU(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + 
RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorPoolingApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { +void *handleTensorPoolingApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, int poolFunction, int window_height, int window_width, + int vertical_pad, int horizontal_pad, int vertical_stride, + int horizontal_stride) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorPoolingCPU(input_ptr, - poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); 
- abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorPoolingCPU(input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorSoftmaxApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr) { - void* t_out; +void *handleTensorSoftmaxApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr) { + void *t_out; RC->resume_profiler(); t_out = tensorSoftmaxCPU(input_ptr); RC->pause_profiler(); @@ -423,42 +404,40 @@ void* handleTensorSoftmaxApproximationTuples_CPU( //--- GPU Approximation handling ---// //----------------------------------------------------------------------------// -void* handleTensorAddApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* bias) { +void *handleTensorAddApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *bias) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* 
t_out; - RC->resume_profiler(); - t_out = tensorAdd(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfAdd(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorAdd(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfAdd(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -471,44 +450,42 @@ void* handleTensorAddApproximationTuples( return NULL; } -void* handleTensorMulApproximationTuples( - std::vector< 
std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* lhs, void* rhs) { +void *handleTensorMulApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *lhs, void *rhs) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorGemmGPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfGemmGPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorGemmGPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfGemmGPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); + 
RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here - } + } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); abort(); @@ -519,100 +496,88 @@ void* handleTensorMulApproximationTuples( return NULL; } -void* handleTensorConvApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w) { +void *handleTensorConvApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApprox(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, 1, 1); - - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, 1, 1); - - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", 
pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::PERFORATION : - case GPUNodeConfiguration::APPROX::PERFORATION_HP : - { - PerfParams params = perfParamSet->getPerfParams(param); - INFO("perforation param = %i\n", param); - INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", - params.row, params.col, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, 1, params.skip_offset); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::INPUT_SAMPLING : - case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP : - { - SampParams params = sampParamSet->getSampParams(param); - INFO("sampling param = %i\n", param); - INFO("params.skip_rate = %i, params.skip_offset = %i\n", - params.skip_rate, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, - params.skip_rate, params.skip_offset); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); + 
+ RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = + tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", + pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::PERFORATION: + case GPUNodeConfiguration::APPROX::PERFORATION_HP: { + PerfParams params = perfParamSet->getPerfParams(param); + INFO("perforation param = %i\n", param); + INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", + params.row, params.col, params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.row, params.col, 1, params.skip_offset); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", + pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::INPUT_SAMPLING: + case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: { + SampParams params = sampParamSet->getSampParams(param); + INFO("sampling param = %i\n", param); + INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, + params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2(input, filter, 
conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -625,103 +590,99 @@ void* handleTensorConvApproximationTuples( return NULL; } -void* handleTensorGroupConvApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups) { +void *handleTensorGroupConvApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int conv_groups) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvCutlass(input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - 
t_out = tensorHalfConvCutlass(input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, conv_mode, + conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported 
case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorBatchNormApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon) { +void *handleTensorBatchNormApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, + void *variance_ptr, double epsilon) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); + 
RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); @@ -733,215 +694,202 @@ void* handleTensorBatchNormApproximationTuples( return NULL; } -void* handleTensorReluApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorReluApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorRelu(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = 
tensorHalfRelu(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorRelu(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfRelu(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorClippedReluApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, float min, float max) { +void *handleTensorClippedReluApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void 
*input, float min, float max) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorRelu2(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfRelu2(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorRelu2(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfRelu2(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); + 
RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorTanhApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorTanhApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorTanh(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfTanh(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + 
RC->resume_profiler(); + t_out = tensorTanh(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfTanh(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorPoolingApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { +void *handleTensorPoolingApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, int poolFunction, int window_height, int window_width, + int vertical_pad, int horizontal_pad, int vertical_stride, + int horizontal_stride) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorPooling(input_ptr, - poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, 
horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfPooling(input_ptr, - poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorPooling(input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfPooling(input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + 
RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorSoftmaxApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr) { - //TODO: if approximation choices are added for softmax operation, +void *handleTensorSoftmaxApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr) { + // TODO: if approximation choices are added for softmax operation, // implement this like the other handle* functions - void* t_out; + void *t_out; RC->resume_profiler(); t_out = tensorSoftmax(input_ptr); RC->pause_profiler(); @@ -952,5 +900,4 @@ void* handleTensorSoftmaxApproximationTuples( return t_out; } - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h index b4f3d39fae77b214a46301ba7d6c95a5e651c44f..3b52cce9f62504753d63015a599d214194d48d98 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h @@ -144,7 +144,8 @@ public: // - energy // - accuracy (compared to golden output) // - accuracy loss (compared to baseline) -// - a hardware choice and set or operations-approximation choices, described in setup +// - a hardware choice and set or operations-approximation choices, described in +// setup struct Configuration { std::string name; float speedup; @@ -152,7 +153,7 @@ struct Configuration { float accuracy; float 
accuracyLoss; std::map<std::string, NodeConfiguration *> setup; - // map for mapping visc.node.id IDs to HPVM (fused) node approx-configurations + // map for mapping visc.node.id IDs to HPVM (fused) node approx-configurations std::map<int, NodeConfiguration *> idConfigMap; Configuration(std::string &n, float f, float e, float a, float al); @@ -171,8 +172,8 @@ struct Configuration { // Comparison operator definition, in increasing accuracy loss // (for std sort, used in pareto optimal computation) struct ConfigurationLessThan { - bool operator()( - const struct Configuration &a, const struct Configuration &b) const; + bool operator()(const struct Configuration &a, + const struct Configuration &b) const; }; // Comparison operator definition, in increasing accuracy loss diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h index 7724a49edf2465ee5e3d9ed5568ef2d87f943030..2c9f48203ba5d334e3c9bdd2409250cef47fa43b 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h @@ -5,6 +5,7 @@ #define LOG_DEBUG 1 // Sets the debug logging to true #define LOG_INFO 1 // Sets the info logging to true +#define LOG_ERROR 1 // Print Errors #define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro) #include "tensor.h" diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h index a3d51141acd9e45d3231689a39f43e97fbeb0a9f..8c9a711c8a8355eb7e0240cc6ed15b5c7ebd23c9 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/error.h @@ -45,21 +45,6 @@ __global__ void vecConstDiv(float *A, float div_factor, int n); __global__ void vecMul(float *A, float *B, int n); -/**** ERROR injecion routines ******/ -void initRandValues(Tensor *bias, int error_scale); - -void 
initRandValues2(Tensor *bias, int error_scale); - -void *addBitError(void *x_ptr, int error_scale); - -void randomCeilAndFloor(float *x, size_t num_elems); - -// Routine for Adding RoundOff Errors -void *addRoundError(void *x_ptr, int error_scale); - -// Routine for Adding Gaussian Error -void *addGaussianError(void *x_ptr, int error_scale); - void initPromiseRandValues(Tensor *bias, int error_scale); // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16 @@ -72,8 +57,6 @@ __global__ void quantizeAndClip(float *A, int n, float mul_factor, float min, __global__ void quantizeElem(float *A, int n, float mul_factor, float min); void *quantizeTensorPromise(void *input_ptr, float min, float max); - -void *tensorAddError(void *x_ptr, int error_scale); } #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h deleted file mode 100644 index 71099a89e4ff1c47a14c4652556838e55c3850ea..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h +++ /dev/null @@ -1,85 +0,0 @@ -/* -broadcast.h -Calculates shape of two tensors broadcasted together, using a numpy-like (but -weaker) rule. 
-*/ - -#ifndef FUNCTIONAL_BROADCAST_H -#define FUNCTIONAL_BROADCAST_H - -#include <algorithm> -#include <array> -#include <cstddef> -#include <type_traits> - -#include "common.h" -#include "debug.h" -#include "tensor.h" - -// TODO: don't accept N == 1 -template <size_t N, typename std::enable_if<N >= 1, int>::type = 0> -class BroadcastRemap { -public: - BroadcastRemap(const std::array<Tensor *, N> &tensors) - : out_sizes(), sizes() { - this->in_dims = tensors[0]->dims.num_dims; - for (size_t i = 0; i < N; i++) { - Tensor *t = tensors[i]; - this->sizes[i] = ::sizes(t); - if (this->in_dims != t->dims.num_dims) - ERROR("Broadcast tensors have different dimensions\n"); - this->tail_stride[i] = 1; - } - fill_broadcast_dims(); - } - - std::vector<size_t> getDim() const { return this->out_sizes; } - - const size_t *getStrides() const { return tail_stride; } - -private: - void fill_broadcast_dims() { - // Simplified broadcasting rule: - // 1. Tensors must have the same dimension that is greater than 1. - // 2. Dimension size being 1 (instead of equal) is only allowed for each - // tensor for a continuous N dimensions starting from the last one. 
- - // Assume all this->in_dims are 1, and compute - // out_dims is reverse-constructed - if (this->in_dims < 1) - ERROR("Broadcast tensors should have at least 1 dimension\n"); - bool broadcast_ended[N]{false}; - this->out_sizes.resize(this->in_dims, 1); - for (long i = this->in_dims - 1; i >= 0; i--) { - // First get tensors agree on dim size - for (size_t j = 0; j < N; j++) { - size_t this_size = this->sizes[j][i]; - if (this_size == 1) - continue; - if (this->out_sizes[i] != 1 && this->out_sizes[i] != this_size) - ERROR("Dimension size mismatch\n"); - this->out_sizes[i] = this_size; - } - } - for (size_t j = 0; j < N; j++) - for (long i = this->in_dims - 1; i >= 0; i--) { - // Check for continuity, calculate stride size - size_t this_size = this->sizes[j][i]; - if (this_size != 1) { - // Broadcast cannot go on anymore - broadcast_ended[j] = true; - continue; - } - if (this->out_sizes[i] != this_size && broadcast_ended[j]) - ERROR("Broadcast dims must be continuous\n"); - else - tail_stride[j] *= this->out_sizes[i]; - } - } - - size_t in_dims; - std::vector<size_t> out_sizes, sizes[N]; - size_t tail_stride[N]; -}; - -#endif // FUNCTIONAL_BROADCAST_H diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h deleted file mode 100644 index 00326bef03b78d905f5923ae3ab5a79f327c2e7b..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h +++ /dev/null @@ -1,119 +0,0 @@ -/* -common.h -Helper functions shared among functional/* header files and their corresponding -*.cu files. -These include util functions for CUDA, or functions on __device__, or Tensor -methods that really should be in `struct Tensor`. 
-*/ - -#ifndef FUNCTIONAL_COMMON_H -#define FUNCTIONAL_COMMON_H - -#include <cuda_fp16.h> -#include <cudnn.h> -#include <device_launch_parameters.h> -#include <typeinfo> -#include <vector> - -#include "debug.h" -#include "profiling.h" -#include "tensor.h" - -// Return ceil(a / b) for both host and device. -template <typename T> __host__ __device__ __forceinline__ T ceilDiv(T a, T b) { - return (a + b - 1) / b; -} - -// Profiles float -> half conversion, can be used like a guard. -template <typename T> class HFProfileGuard { - static const char *getEventName(bool end) { - if (typeid(T) == typeid(half) || typeid(T) == typeid(half2)) - return end ? "F2H_end" : "F2H_start"; - ERROR("Type not accepted\n"); - return ""; // For some compilers - } - - static bool needProfiling() { - // Only profile when given type T is half / half2. - // As this guard is often used in templated, scalar-type-agnostic - // implementation of an operator, this `T` is often that operator's scalar - // type. - return typeid(T) == typeid(half) || typeid(T) == typeid(half2); - } - -public: - HFProfileGuard() { - if (needProfiling()) - profileEvent(getEventName(false)); - } - - ~HFProfileGuard() { - if (needProfiling()) - profileEvent(getEventName(true)); - } -}; - -// Convert C++ type (given by template type T) to "tensor datatype", which is a -// enum that `struct Tensor` recognizes. -template <typename T> int getTensorType() { - if (typeid(T) == typeid(float)) - return (int)float_type; - else if (typeid(T) == typeid(half)) - return (int)half_type; - else if (typeid(T) == typeid(float2)) - return (int)float2_type; - else if (typeid(T) == typeid(half2)) - return (int)half2_type; - else { - ERROR("Unsupported type!\n"); - return 0; // For some compilers - } -} - -// Type-cast Tensor `t` to type `T` (regardless of what current type `t` has), -// and return a pointer to its underlying data on GPU (which can be t->gpu_data -// or t->gpu_half_data). 
-// This is specialized and implemented for float, float2 (float-complex), half, -// half2 (used for speeding up operations in half type) -template <typename T> T *convertAndGetGPUData(Tensor *t); - -template <> float *convertAndGetGPUData<float>(Tensor *t); - -template <> float2 *convertAndGetGPUData<float2>(Tensor *t); - -template <> half *convertAndGetGPUData<half>(Tensor *t); - -template <> half2 *convertAndGetGPUData<half2>(Tensor *t); - -// Like convertAndGetGPUData, but calls `convertToFP32_offline` instead of -// `convertToFP32`, which makes a difference when online / offline profiling is -// involved. -void convertToFloat2Offline(Tensor *t); - -// Return sizes of tensor with a vector. -std::vector<size_t> sizes(Tensor *t); - -std::vector<size_t> sizes(const Dimension &dim); - -// Return total number of element in a tensor. -size_t num_elems(const std::vector<size_t> &dim_sizes); - -size_t num_elems(const Dimension &dim); - -size_t num_elems(Tensor *t); - -// Checks equivalence of types t1 and t2 under the assumption that float=half -// and float2=half2, and returns the equalized type. -// 1. Define an equivalence operator (==): -// t == t = True -// float == half = True -// float2 == half2 = True -// otherwise = False -// and throws if t1 != t2. -// 2. Returns the same type `t`. But as float is not _actually_ the same thing -// as half, `get_half` determines wh which one to return. E.g. with t1 == -// float2, t2 == half, if get_half == true, half2 is returned, otherwise float2 -// is returned. 
-Tensor_type_t getCompatibleType(int t1, int t2, bool get_half); - -#endif // FUNCTIONAL_COMMON_H diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh deleted file mode 100644 index 74568d8183a7a64f48750b4d02a6286224cac817..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh +++ /dev/null @@ -1,119 +0,0 @@ -/* -map.cuh -Implementation of the map operator, including broadcast, the cuda kernel for -map, and a general map_n function in host code (which calls the kernel). -*/ -#ifndef FUNCTIONAL_MAP_H -#define FUNCTIONAL_MAP_H - -#include <array> -#include <cstddef> -#include <device_launch_parameters.h> -#include <type_traits> - -#include "broadcast.h" -#include "common.h" -#include "debug.h" -#include "map_typing.h" -#include "tensor.h" -#include "tensor_utils.h" - -// Checks dimension and data order of each map argument. -template <size_t N> void mapPrecheck(const std::array<Tensor *, N> &srcs) { - for (Tensor *src : srcs) { - if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW) - ERROR("Not supported\n"); - } -} - -// CUDA kernel for map_n. This is _actually_ mostly unused as specialization for -// float / half exists for performance benefit. -template <typename Scalar, size_t N> -__global__ void kernelMapBroadcast( - Scalar *target, unsigned num_rows, void *func, Scalar **srcs, - size_t *tail_strides) { - auto *n_ary_op = (NTo1MapF<Scalar, N>)func; - - unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x, - stride = gridDim.x * blockDim.x; - Scalar buf[N]; - for (unsigned row = threadId; row < num_rows; row += stride) { - for (size_t j = 0; j < N; j++) - buf[j] = srcs[j][row / tail_strides[j]]; - target[row] = call_on_c_array<Scalar, Scalar, N>(n_ary_op, buf); - } -} - -// Instantiate float to compare fairly to half. 
Implemented for N = 1...3 -template <size_t N> -__global__ void kernelMapBroadcast<float, N>( - half *target, unsigned num_rows, void *func, half **srcs, - size_t *tail_strides); - -// Half uses a different implementation. Implemented for N = 1, 2 -template <size_t N> -__global__ void kernelMapBroadcast<half, N>( - half *target, unsigned num_rows, void *func, half **srcs, - size_t *tail_strides); - -// Create parameter for cuda kernel by copying pointers to device (gpu). -// This function unwraps BroadcastRemap into a cuda array of size N -- one value -// for the broadcast stride of each map argument, and unwraps `srcs` into their -// gpu data pointers. -template <typename Scalar, size_t N> -std::tuple<size_t *, Scalar **> make_cuda_params( - const BroadcastRemap<N> &br, const std::array<Tensor *, N> &srcs) { - for (Tensor *t : srcs) - hostToDeviceCopy(t); - std::array<Scalar *, N> gpu_datas; - { - HFProfileGuard<Scalar> g; - std::transform(srcs.begin(), srcs.end(), gpu_datas.begin(), [](Tensor *t) { - return convertAndGetGPUData<Scalar>(t); - }); - } - size_t *cuda_strides; - Scalar **cuda_gpu_data; - cudaMalloc(&cuda_strides, N * sizeof(size_t)); - cudaMemcpy( - cuda_strides, br.getStrides(), N * sizeof(size_t), - cudaMemcpyHostToDevice); - cudaMalloc(&cuda_gpu_data, N * sizeof(Scalar *)); - cudaMemcpy( - cuda_gpu_data, gpu_datas.data(), N * sizeof(size_t), - cudaMemcpyHostToDevice); - return std::make_tuple(cuda_strides, cuda_gpu_data); -} - -// Host code for map_n that check and converts the parameters, and calls the -// cuda kernel. 
-template < - typename Scalar, size_t N, typename std::enable_if<N >= 1, int>::type = 0> -__host__ Tensor *mapGeneral(MathOp mop, const std::array<Tensor *, N> &srcs) { - mapPrecheck(srcs); - - auto br = BroadcastRemap<N>(srcs); - std::vector<size_t> dim_sizes = br.getDim(); - auto *target = (Tensor *)create4DTensor( - getTensorType<Scalar>(), CUDNN_TENSOR_NCHW, dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - changeTensorPlacement(target, DEVICE); - void *func_ptr = mathOpToFunc<Scalar>(mop); - - size_t *cuda_strides; - Scalar **gpu_data; - std::tie(cuda_strides, gpu_data) = make_cuda_params<Scalar, N>(br, srcs); - - unsigned n_elem = num_elems(dim_sizes); - unsigned max_threads = 512, max_grid = 2048; - unsigned threads = std::min(max_threads, n_elem); - unsigned grids = std::min(max_grid, ceilDiv(n_elem, threads)); - kernelMapBroadcast<Scalar, N><<<grids, threads>>>( - convertAndGetGPUData<Scalar>(target), n_elem, func_ptr, gpu_data, - cuda_strides); - cudaDeviceSynchronize(); - checkCUDA(cudaGetLastError()); - return target; -} - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h deleted file mode 100644 index 54d919b3346047285bb0b89c2c8d97f625738183..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h +++ /dev/null @@ -1,85 +0,0 @@ -/* -map_typing.h -Helper for metaprogramming used by map.cuh. -Defines some recursively templated types and functions. -*/ -#ifndef FUNCTIONAL_MAP_TYPING_H -#define FUNCTIONAL_MAP_TYPING_H - -#include <cstddef> -#include <device_launch_parameters.h> -#include <tuple> -#include <utility> - -namespace { -template <class T, size_t> using Type = T; - -template <typename, template <typename...> typename, typename> struct _RepNType; - -template <typename T, template <typename...> typename W, size_t... 
Is> -struct _RepNType<T, W, std::index_sequence<Is...>> { - using type = W<Type<T, Is>...>; -}; - -// Constructs type W<T, T, T, T, ... (N times)> from T and N -// RepNType T W N = W (T, T, T ... N times ..., T) -template <typename T, template <typename...> typename W, size_t N> -using RepNType = typename _RepNType<T, W, std::make_index_sequence<N>>::type; - -// Like std::function<Ret(Args...)> but denotes function raw pointer instead of -// lambda function -template <typename Ret, typename... Args> using FuncPtrT = Ret (*)(Args...); - -template <typename Ret, typename Arg, size_t N> struct _NAToBFunc { - template <typename... Args> using Wrapper = FuncPtrT<Ret, Args...>; - - using type = RepNType<Arg, Wrapper, N>; -}; -} // namespace - -// NAToBF Ret Arg N = Ret(*)(Arg, Arg, ...N times) -template <typename Ret, typename Arg, size_t N> -using NAToBF = typename _NAToBFunc<Ret, Arg, N>::type; - -// NTo1MapF Arg N = Arg(*)(Arg, Arg, ...N times) -// This denotes n-to-1 map: Arg x Arg x Arg x ... -> Arg. -template <typename Scalar, size_t N> using NTo1MapF = NAToBF<Scalar, Scalar, N>; - -// RepNTuple T N = std::tuple<Arg, Arg, ...N times> -template <typename T, size_t N> using RepNTuple = RepNType<T, std::tuple, N>; - -namespace { -template <typename TIterable, typename T, size_t... Is> -constexpr RepNTuple<T, sizeof...(Is)> as_tuple(TIterable arr, - std::index_sequence<Is...>) { - return std::make_tuple(arr[Is]...); -} - -template <typename Function, typename Tuple, size_t... I> -__device__ auto call(Function f, Tuple t, std::index_sequence<I...>) { - return f(std::get<I>(t)...); -} -} // namespace - -// Converts Iterable of type T and length N to (same-typed) tuple -// std::tuple<T, T, T, T, ...> -template <typename TIterable, typename T, size_t N> -constexpr RepNTuple<T, N> as_tuple(TIterable arr) { - return as_tuple<TIterable, T>(arr, std::make_index_sequence<N>{}); -} - -// Expands Tuple t into parameters of Function f, in python this would be -// f(*t). 
-template <typename Function, typename Tuple> -__device__ auto call_on_tuple(Function f, Tuple t) { - static constexpr auto size = std::tuple_size<Tuple>::value; - return call(f, t, std::make_index_sequence<size>{}); -} - -// Expands Array of type T and size N into parameters of Function -template <typename Ret, typename T, size_t N> -__device__ Ret call_on_c_array(NAToBF<Ret, T, N> f, const T arr[N]) { - return call_on_tuple(f, as_tuple<const T *, T, N>(arr)); -} - -#endif // FUNCTIONAL_MAP_TYPING_H diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh deleted file mode 100644 index 9f4fabfb5e0b75017e901c2cb4c60d8649b04f07..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh +++ /dev/null @@ -1,184 +0,0 @@ -/* -reduce.cuh -Implementation for reduce operator. -*/ -#include <device_launch_parameters.h> -#include <functional> -#include <numeric> - -#include "common.h" -#include "debug.h" -#include "tensor.h" -#include "tensor_utils.h" - -// Between CUDA compute capability 1.0 and 7.5, -// Least "max # threads per block" is 512, so 512 is used to be compatible; -// at most 2048 threads per multiprocessor, where # of cores varies greatly -// among devices. Titan X has 3072 cores, Quadro P1000 has 640. A bit of -// over-subscription doesn't hurt. These numbers will keep us compatible even -// for 1.0 devices. -constexpr size_t NThreads = 512, MaxNBlocks = 2048 / NThreads * 3072; -constexpr size_t MaxBlocksPerDim = 65535; - -constexpr size_t AlongDimTh = 16, CrossDimTh = 32; - -/* - * Reduce along one dimension with a single thread. 
- */ -template <typename K> -__device__ void reduceAlongDim( - K *target, K *src, K init, K identity, void *func, size_t num_irows, - size_t dim_size); - -template <> -__device__ void reduceAlongDim<float>( - float *target, float *src, float init, float identity, void *func, - size_t num_irows, size_t dim_size); - -template <> -__device__ void reduceAlongDim<half>( - half *target, half *src, half init, half identity, void *func, - size_t num_irows, size_t dim_size); - -/* - * Parallel reduce a dimension of tensor to a scalar value. - * Use `n_along_dim_threads` threads to sweep along the dim to be reduced. - * Intermediate values are collected in a divide-and-conquer manner, - * with thread 0 finally writing back the result. - */ -template <typename K> -__device__ void parallelReduceAlongDim( - K *target, K *src, K init, K identity, void *func, size_t num_irows, - size_t dim_size, size_t along_dim_tid, size_t n_along_dim_threads); - -template <> -__device__ void parallelReduceAlongDim<float>( - float *target, float *src, float init, float identity, void *func, - size_t num_irows, size_t dim_size, size_t along_dim_tid, - size_t n_along_dim_threads); - -template <> -__device__ void parallelReduceAlongDim<half>( - half *target, half *src, half init, half identity, void *func, - size_t num_irows, size_t dim_size, size_t along_dim_tid, - size_t n_along_dim_threads); - -/* - * Reduce the whole tensor with parallelism only on output. - * The reduce axis is reduced sequentially. - * Block is 2D, thread is 1D; block.y covers outer rows, block.x * thread.x - * covers inner rows. 
- */ -template <typename K> -__global__ void kernelReduceDimSeq( - K *target_, K *src_, K init, K identity, void *func, size_t num_irows, - size_t num_orows, size_t row_size, size_t approx_row_size) { - size_t start_orow = blockIdx.y, - start_irow = blockIdx.x * blockDim.x + threadIdx.x; - size_t orow_stride = gridDim.y, irow_stride = gridDim.x * blockDim.x; - for (size_t orow = start_orow; orow < num_orows; orow += orow_stride) { - for (size_t irow = start_irow; irow < num_irows; irow += irow_stride) { - K *src = src_ + orow * row_size * num_irows + irow; - K *target = target_ + orow * num_irows + irow; - reduceAlongDim( - target, src, init, identity, func, num_irows, approx_row_size); - } - } -} - -/* - * Reduce the whole tensor with parallelism on output and reduce axis. - * I.e., the reduce axis is reduced parallel. - * Block is 2D, thread is 2D; - * thread.x covers reduce axis, - * block.x * thread.y covers inner rows, - * and block.y covers outer rows. - */ -template <typename K> -__global__ void __launch_bounds__(NThreads) kernelReduceDimParallel( - K *target_, K *src_, K init, K identity, void *func, size_t num_irows, - size_t num_orows, size_t row_size, size_t approx_row_size) { - size_t start_orow = blockIdx.y, - start_irow = blockIdx.x * blockDim.y + threadIdx.y; - size_t orow_stride = gridDim.y, irow_stride = gridDim.x * blockDim.y; - for (size_t orow = start_orow; orow < num_orows; orow += orow_stride) { - for (size_t irow = start_irow; irow < num_irows; irow += irow_stride) { - K *src = src_ + orow * row_size * num_irows + irow; - K *target = target_ + orow * num_irows + irow; - parallelReduceAlongDim( - target, src, init, identity, func, num_irows, approx_row_size, - threadIdx.x, blockDim.x); - } - } -} - -/* Entry point for `reduce` implementation. Calls the right version of reduction - * kernel as needed. 
*/ -template <typename Scalar> -__host__ Tensor *reduceDim( - Tensor *src, const Scalar &init, MathOp op, size_t axis, float skip_rate) { - // Copy input over - hostToDeviceCopy(src); - - // Prepare output - std::vector<size_t> in_sizes = sizes(src), out_sizes = in_sizes; - out_sizes[axis] = 1; - auto *target = (Tensor *)create4DTensor( - getTensorType<Scalar>(), CUDNN_TENSOR_NCHW, out_sizes[0], out_sizes[1], - out_sizes[2], out_sizes[3]); - changeTensorPlacement(target, DEVICE); - - // Calculate schedule parameters - size_t num_orows = std::accumulate( - in_sizes.begin(), in_sizes.begin() + axis, 1, std::multiplies<>()); - size_t row_size = in_sizes[axis]; - size_t num_irows = std::accumulate( - in_sizes.begin() + axis + 1, in_sizes.end(), 1, std::multiplies<>()); - size_t num_rows = num_irows * num_orows; - - // Calculate approximation parameters - if (skip_rate != 0.0f) - INFO("Approximation happening...\n"); - size_t approx_row_size = (size_t)((1 - skip_rate) * row_size); - - void *func = mathOpToFunc<Scalar>(op); - Scalar identity = reduceOpToIdentity<Scalar>(op); - Scalar *src_data; - { - HFProfileGuard<Scalar> g; - src_data = convertAndGetGPUData<Scalar>(src); - } - - // If # of output entries is small, and row_size is enough for 16 threads, - // reduce in parallel. - // Remember if reducing dim in parallel, threads must be (16, 32). 
- if (num_rows < NThreads * MaxNBlocks && row_size >= AlongDimTh * 8) { - DEBUG( - "Reducing in parallel, row size = %lu, actually using %lu\n", row_size, - approx_row_size); - size_t grid_x = std::min(MaxBlocksPerDim, ceilDiv(num_irows, 32ul)); - size_t grid_y = std::min( - std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x)); - dim3 threads(AlongDimTh, CrossDimTh); - dim3 grid(grid_x, grid_y); - kernelReduceDimParallel<Scalar><<<grid, threads>>>( - convertAndGetGPUData<Scalar>(target), src_data, init, identity, func, - num_irows, num_orows, row_size, approx_row_size); - } else { - DEBUG( - "Reducing sequentially, row size = %lu, actually using %lu\n", row_size, - approx_row_size); - // Reduce sequentially. - size_t threads = std::min(NThreads, num_irows); - size_t grid_x = std::min(MaxBlocksPerDim, ceilDiv(num_irows, threads)); - size_t grid_y = std::min( - std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x)); - dim3 grid(grid_x, grid_y); - kernelReduceDimSeq<Scalar><<<grid, threads>>>( - convertAndGetGPUData<Scalar>(target), src_data, init, identity, func, - num_irows, num_orows, row_size, approx_row_size); - } - cudaDeviceSynchronize(); - checkCUDA(cudaGetLastError()); - return target; -} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h index f859b83e94ecc2b4e103792b977279613f119d71..49c1725336ab4242ba4ed852a10ba3cde0d1c5d7 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h @@ -20,14 +20,8 @@ #include "approx_knob_utils.h" #include "tensor.h" -#define ERROR_INJECTION_ENABLED 0 #define PROMISE_MODE 1 -#ifdef NO_INJECTION -#undef ERROR_INJECTION_ENABLED -#endif - -//#define ERROR_INJECTION_ENABLED 1 /* Data declarations */ extern cudnnHandle_t cudnnHandle; extern cublasHandle_t cublasHandle; diff --git 
a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h deleted file mode 100644 index da7337008d8d39d65a45ab906155ed409b35a991..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image.h +++ /dev/null @@ -1,8258 +0,0 @@ -/* stb_image - v2.23 - public domain image loader - http://nothings.org/stb - no warranty implied; use at your own risk - - Do this: - #define STB_IMAGE_IMPLEMENTATION - before you include this file in *one* C or C++ file to create the -implementation. - - // i.e. it should look like this: - #include ... - #include ... - #include ... - #define STB_IMAGE_IMPLEMENTATION - #include "stb_image.h" - - You can #define STBI_ASSERT(x) before the #include to avoid using assert.h. - And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using -malloc,realloc,free - - - QUICK NOTES: - Primarily of interest to game developers and other people who can - avoid problematic images and only need the trivial interface - - JPEG baseline & progressive (12 bpc/arithmetic not supported, same as -stock IJG lib) PNG 1/2/4/8/16-bit-per-channel - - TGA (not sure what subset, if a subset) - BMP non-1bpp, non-RLE - PSD (composited view only, no extra channels, 8/16 bit-per-channel) - - GIF (*comp always reports as 4-channel) - HDR (radiance rgbE format) - PIC (Softimage PIC) - PNM (PPM and PGM binary only) - - Animated GIF still needs a proper API, but here's one way to do it: - http://gist.github.com/urraka/685d9a6340b26b830d49 - - - decode from memory or through FILE (define STBI_NO_STDIO to remove code) - - decode from arbitrary I/O callbacks - - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) - - Full documentation under "DOCUMENTATION" below. - - -LICENSE - - See end of file for license information. 
- -RECENT REVISION HISTORY: - - 2.23 (2019-08-11) fix clang static analysis warning - 2.22 (2019-03-04) gif fixes, fix warnings - 2.21 (2019-02-25) fix typo in comment - 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and -platform ifdefs 2.19 (2018-02-11) fix warning 2.18 (2018-01-30) fix warnings - 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings - 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; -bugfixes 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE -detection on GCC 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for -Imagenet JPGs 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; -fixes 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes 2.11 -(2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 RGB-format JPEG; remove -white matting in PSD; allocate large structures on the stack; correct channel -count for PNG & BMP 2.10 (2016-01-22) avoid warning introduced in 2.09 2.09 -(2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED - - See end of file for full revision history. 
- - - ============================ Contributors ========================= - - Image formats Extensions, features - Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) - Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) - Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) - Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) - Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) - Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) - Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) - github:urraka (animated gif) Junggon Kim (PNM comments) - Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) - socks-the-fox (16-bit PNG) - Jeremy Sawicki (handle all ImageNet -JPGs) Optimizations & bugfixes Mikhail Morozov (1-bit BMP) - Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) - Arseny Kapoulkine - John-Mark Allen - Carmelo J Fdez-Aguera - - Bug & warning fixes - Marc LeBlanc David Woo Guillaume George Martins -Mozeiko Christpher Lloyd Jerry Jansson Joseph Thomson Phil -Jordan Dave Moore Roy Eltham Hayaki Saito Nathan Reed - Won Chun Luke Graham Johan Duparc Nick Verigakis - the Horde3D community Thomas Ruf Ronny Chevalier github:rlyeh - Janez Zemva John Bartholomew Michal Cichon github:romigrou - Jonathan Blow Ken Hamada Tero Hanninen github:svdijk - Laurent Gomila Cort Stratton Sergio Gonzalez github:snagar - Aruelien Pocheville Thibault Reuille Cass Everitt github:Zelex - Ryamond Barbiero Paul Du Bois Engin Manap github:grim210 - Aldo Culquicondor Philipp Wiesemann Dale Weiler github:sammyhw - Oriol Ferrer Mesia Josh Tobin Matthew Gregan github:phprus - Julian Raschke Gregory Mullen Baldur Karlsson -github:poppolopoppo Christian Floisand Kevin Schmidt JR Smith -github:darealshinji Blazej Dariusz Roszkowski github:Michaelangel007 -*/ - -#ifndef STBI_INCLUDE_STB_IMAGE_H -#define STBI_INCLUDE_STB_IMAGE_H - -// DOCUMENTATION -// -// Limitations: -// - no 12-bit-per-channel JPEG -// - no JPEGs with arithmetic coding -// - GIF always 
returns *comp=4 -// -// Basic usage (see HDR discussion below for HDR usage): -// int x,y,n; -// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); -// // ... process data if not NULL ... -// // ... x = width, y = height, n = # 8-bit components per pixel ... -// // ... replace '0' with '1'..'4' to force that many components per pixel -// // ... but 'n' will always be the number that it would have been if you -// said 0 stbi_image_free(data) -// -// Standard parameters: -// int *x -- outputs image width in pixels -// int *y -- outputs image height in pixels -// int *channels_in_file -- outputs # of image components in image file -// int desired_channels -- if non-zero, # of image components requested in -// result -// -// The return value from an image loader is an 'unsigned char *' which points -// to the pixel data, or NULL on an allocation failure or if the image is -// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, -// with each pixel consisting of N interleaved 8-bit components; the first -// pixel pointed to is top-left-most in the image. There is no padding between -// image scanlines or between pixels, regardless of format. The number of -// components N is 'desired_channels' if desired_channels is non-zero, or -// *channels_in_file otherwise. If desired_channels is non-zero, -// *channels_in_file has the number of components that _would_ have been -// output otherwise. E.g. if you set desired_channels to 4, you will always -// get RGBA output, but you can check *channels_in_file to see if it's trivially -// opaque because e.g. there were only 3 channels in the source image. -// -// An output image with N components has the following components interleaved -// in this order in each pixel: -// -// N=#comp components -// 1 grey -// 2 grey, alpha -// 3 red, green, blue -// 4 red, green, blue, alpha -// -// If image loading fails for any reason, the return value will be NULL, -// and *x, *y, *channels_in_file will be unchanged. 
The function -// stbi_failure_reason() can be queried for an extremely brief, end-user -// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS -// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get -// slightly more user-friendly ones. -// -// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. -// -// =========================================================================== -// -// UNICODE: -// -// If compiling for Windows and you wish to use Unicode filenames, compile -// with -// #define STBI_WINDOWS_UTF8 -// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert -// Windows wchar_t filenames to utf8. -// -// =========================================================================== -// -// Philosophy -// -// stb libraries are designed with the following priorities: -// -// 1. easy to use -// 2. easy to maintain -// 3. good performance -// -// Sometimes I let "good performance" creep up in priority over "easy to -// maintain", and for best performance I may provide less-easy-to-use APIs that -// give higher performance, in addition to the easy-to-use ones. Nevertheless, -// it's important to keep in mind that from the standpoint of you, a client of -// this library, all you care about is #1 and #3, and stb libraries DO NOT -// emphasize #3 above all. -// -// Some secondary priorities arise directly from the first two, some of which -// provide more explicit reasons why performance can't be emphasized. -// -// - Portable ("ease of use") -// - Small source code footprint ("easy to maintain") -// - No dependencies ("ease of use") -// -// =========================================================================== -// -// I/O callbacks -// -// I/O callbacks allow you to read from arbitrary sources, like packaged -// files or some other source. Data read from callbacks are processed -// through a small internal buffer (currently 128 bytes) to try to reduce -// overhead. 
-// -// The three functions you must define are "read" (reads some bytes of data), -// "skip" (skips some bytes of data), "eof" (reports if the stream is at the -// end). -// -// =========================================================================== -// -// SIMD support -// -// The JPEG decoder will try to automatically use SIMD kernels on x86 when -// supported by the compiler. For ARM Neon support, you must explicitly -// request it. -// -// (The old do-it-yourself SIMD API is no longer supported in the current -// code.) -// -// On x86, SSE2 will automatically be used when available based on a run-time -// test; if not, the generic C versions are used as a fall-back. On ARM targets, -// the typical path is to have separate builds for NEON and non-NEON devices -// (at least this is true for iOS and Android). Therefore, the NEON support is -// toggled by a build flag: define STBI_NEON to get NEON loops. -// -// If for some reason you do not want to use any of SIMD code, or if -// you have issues compiling it, you can disable it entirely by -// defining STBI_NO_SIMD. -// -// =========================================================================== -// -// HDR image support (disable by defining STBI_NO_HDR) -// -// stb_image supports loading HDR images in general, and currently the Radiance -// .HDR file format specifically. You can still load any file through the -// existing interface; if you attempt to load an HDR file, it will be -// automatically remapped to LDR, assuming gamma 2.2 and an arbitrary scale -// factor defaulting to 1; both of these constants can be reconfigured through -// this interface: -// -// stbi_hdr_to_ldr_gamma(2.2f); -// stbi_hdr_to_ldr_scale(1.0f); -// -// (note, do not use _inverse_ constants; stbi_image will invert them -// appropriately). 
-// -// Additionally, there is a new, parallel interface for loading files as -// (linear) floats to preserve the full dynamic range: -// -// float *data = stbi_loadf(filename, &x, &y, &n, 0); -// -// If you load LDR images through this interface, those images will -// be promoted to floating point values, run through the inverse of -// constants corresponding to the above: -// -// stbi_ldr_to_hdr_scale(1.0f); -// stbi_ldr_to_hdr_gamma(2.2f); -// -// Finally, given a filename (or an open file or memory block--see header -// file for details) containing image data, you can query for the "most -// appropriate" interface to use (that is, whether the image is HDR or -// not), using: -// -// stbi_is_hdr(char *filename); -// -// =========================================================================== -// -// iPhone PNG support: -// -// By default we convert iphone-formatted PNGs back to RGB, even though -// they are internally encoded differently. You can disable this conversion -// by calling stbi_convert_iphone_png_to_rgb(0), in which case -// you will always just get the native iphone "format" through (which -// is BGR stored in RGB). -// -// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per -// pixel to remove any premultiplied alpha *only* if the image file explicitly -// says there's premultiplied data (currently only happens in iPhone images, -// and only if iPhone convert-to-rgb processing is on). -// -// =========================================================================== -// -// ADDITIONAL CONFIGURATION -// -// - You can suppress implementation of any of the decoders to reduce -// your code footprint by #defining one or more of the following -// symbols before creating the implementation. 
-// -// STBI_NO_JPEG -// STBI_NO_PNG -// STBI_NO_BMP -// STBI_NO_PSD -// STBI_NO_TGA -// STBI_NO_GIF -// STBI_NO_HDR -// STBI_NO_PIC -// STBI_NO_PNM (.ppm and .pgm) -// -// - You can request *only* certain decoders and suppress all other ones -// (this will be more forward-compatible, as addition of new decoders -// doesn't require you to disable them explicitly): -// -// STBI_ONLY_JPEG -// STBI_ONLY_PNG -// STBI_ONLY_BMP -// STBI_ONLY_PSD -// STBI_ONLY_TGA -// STBI_ONLY_GIF -// STBI_ONLY_HDR -// STBI_ONLY_PIC -// STBI_ONLY_PNM (.ppm and .pgm) -// -// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still -// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB -// - -#ifndef STBI_NO_STDIO -#include <stdio.h> -#endif // STBI_NO_STDIO - -#define STBI_VERSION 1 - -enum { - STBI_default = 0, // only used for desired_channels - - STBI_grey = 1, - STBI_grey_alpha = 2, - STBI_rgb = 3, - STBI_rgb_alpha = 4 -}; - -#include <stdlib.h> -typedef unsigned char stbi_uc; -typedef unsigned short stbi_us; - -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef STBIDEF -#ifdef STB_IMAGE_STATIC -#define STBIDEF static -#else -#define STBIDEF extern -#endif -#endif - -////////////////////////////////////////////////////////////////////////////// -// -// PRIMARY API - works on images of any type -// - -// -// load image by filename, open file, or memory buffer -// - -typedef struct { - int (*read)(void *user, char *data, - int size); // fill 'data' with 'size' bytes. 
return number of - // bytes actually read - void (*skip)(void *user, int n); // skip the next 'n' bytes, or 'unget' the - // last -n bytes if negative - int (*eof)(void *user); // returns nonzero if we are at end of file/data -} stbi_io_callbacks; - -//////////////////////////////////// -// -// 8-bits-per-channel interface -// - -STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, - int *y, int *channels_in_file, - int desired_channels); -STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, - void *user, int *x, int *y, - int *channels_in_file, - int desired_channels); - -#ifndef STBI_NO_STDIO -STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, - int *channels_in_file, int desired_channels); -STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, - int *channels_in_file, - int desired_channels); -// for stbi_load_from_file, file pointer is left pointing immediately after -// image -#endif - -#ifndef STBI_NO_GIF -STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, - int **delays, int *x, int *y, int *z, - int *comp, int req_comp); -#endif - -#ifdef STBI_WINDOWS_UTF8 -STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, - const wchar_t *input); -#endif - -//////////////////////////////////// -// -// 16-bits-per-channel interface -// - -STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, - int *x, int *y, int *channels_in_file, - int desired_channels); -STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, - void *user, int *x, int *y, - int *channels_in_file, - int desired_channels); - -#ifndef STBI_NO_STDIO -STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, - int *channels_in_file, int desired_channels); -STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, - int *channels_in_file, - int desired_channels); -#endif - -//////////////////////////////////// -// -// float-per-channel 
interface -// -#ifndef STBI_NO_LINEAR -STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, - int *y, int *channels_in_file, - int desired_channels); -STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, - void *user, int *x, int *y, - int *channels_in_file, - int desired_channels); - -#ifndef STBI_NO_STDIO -STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, - int *channels_in_file, int desired_channels); -STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, - int *channels_in_file, - int desired_channels); -#endif -#endif - -#ifndef STBI_NO_HDR -STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); -STBIDEF void stbi_hdr_to_ldr_scale(float scale); -#endif // STBI_NO_HDR - -#ifndef STBI_NO_LINEAR -STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); -STBIDEF void stbi_ldr_to_hdr_scale(float scale); -#endif // STBI_NO_LINEAR - -// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR -STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, - void *user); -STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); -#ifndef STBI_NO_STDIO -STBIDEF int stbi_is_hdr(char const *filename); -STBIDEF int stbi_is_hdr_from_file(FILE *f); -#endif // STBI_NO_STDIO - -// get a VERY brief reason for failure -// NOT THREADSAFE -STBIDEF const char *stbi_failure_reason(void); - -// free the loaded image -- this is just free() -STBIDEF void stbi_image_free(void *retval_from_stbi_load); - -// get image dimensions & components without fully decoding -STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, - int *y, int *comp); -STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, - int *x, int *y, int *comp); -STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); -STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, - void *user); - -#ifndef STBI_NO_STDIO -STBIDEF int stbi_info(char const *filename, int *x, 
int *y, int *comp); -STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp); -STBIDEF int stbi_is_16_bit(char const *filename); -STBIDEF int stbi_is_16_bit_from_file(FILE *f); -#endif - -// for image formats that explicitly notate that they have premultiplied alpha, -// we just return the colors as stored in the file. set this flag to force -// unpremultiplication. results are undefined if the unpremultiply overflow. -STBIDEF void -stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); - -// indicate whether we should process iphone images back to canonical format, -// or just pass them through "as-is" -STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); - -// flip the image vertically, so the first pixel in the output array is the -// bottom left -STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); - -// ZLIB client - used by PNG, available for other purposes - -STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, - int initial_size, int *outlen); -STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, - int len, - int initial_size, - int *outlen, - int parse_header); -STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); -STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, - const char *ibuffer, int ilen); - -STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, - int *outlen); -STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, - const char *ibuffer, int ilen); - -#ifdef __cplusplus -} -#endif - -// -// -//// end header file ///////////////////////////////////////////////////// -#endif // STBI_INCLUDE_STB_IMAGE_H - -#ifdef STB_IMAGE_IMPLEMENTATION - -#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || \ - defined(STBI_ONLY_BMP) || defined(STBI_ONLY_TGA) || \ - defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) || \ - defined(STBI_ONLY_HDR) || 
defined(STBI_ONLY_PIC) || \ - defined(STBI_ONLY_PNM) || defined(STBI_ONLY_ZLIB) -#ifndef STBI_ONLY_JPEG -#define STBI_NO_JPEG -#endif -#ifndef STBI_ONLY_PNG -#define STBI_NO_PNG -#endif -#ifndef STBI_ONLY_BMP -#define STBI_NO_BMP -#endif -#ifndef STBI_ONLY_PSD -#define STBI_NO_PSD -#endif -#ifndef STBI_ONLY_TGA -#define STBI_NO_TGA -#endif -#ifndef STBI_ONLY_GIF -#define STBI_NO_GIF -#endif -#ifndef STBI_ONLY_HDR -#define STBI_NO_HDR -#endif -#ifndef STBI_ONLY_PIC -#define STBI_NO_PIC -#endif -#ifndef STBI_ONLY_PNM -#define STBI_NO_PNM -#endif -#endif - -#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && \ - !defined(STBI_NO_ZLIB) -#define STBI_NO_ZLIB -#endif - -#include <limits.h> -#include <stdarg.h> -#include <stddef.h> // ptrdiff_t on osx -#include <stdlib.h> -#include <string.h> - -#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) -#include <math.h> // ldexp, pow -#endif - -#ifndef STBI_NO_STDIO -#include <stdio.h> -#endif - -#ifndef STBI_ASSERT -#include <assert.h> -#define STBI_ASSERT(x) assert(x) -#endif - -#ifdef __cplusplus -#define STBI_EXTERN extern "C" -#else -#define STBI_EXTERN extern -#endif - -#ifndef _MSC_VER -#ifdef __cplusplus -#define stbi_inline inline -#else -#define stbi_inline -#endif -#else -#define stbi_inline __forceinline -#endif - -#ifdef _MSC_VER -typedef unsigned short stbi__uint16; -typedef signed short stbi__int16; -typedef unsigned int stbi__uint32; -typedef signed int stbi__int32; -#else -#include <stdint.h> -typedef uint16_t stbi__uint16; -typedef int16_t stbi__int16; -typedef uint32_t stbi__uint32; -typedef int32_t stbi__int32; -#endif - -// should produce compiler error if size is wrong -typedef unsigned char validate_uint32[sizeof(stbi__uint32) == 4 ? 
1 : -1]; - -#ifdef _MSC_VER -#define STBI_NOTUSED(v) (void)(v) -#else -#define STBI_NOTUSED(v) (void)sizeof(v) -#endif - -#ifdef _MSC_VER -#define STBI_HAS_LROTL -#endif - -#ifdef STBI_HAS_LROTL -#define stbi_lrot(x, y) _lrotl(x, y) -#else -#define stbi_lrot(x, y) (((x) << (y)) | ((x) >> (32 - (y)))) -#endif - -#if defined(STBI_MALLOC) && defined(STBI_FREE) && \ - (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) -// ok -#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && \ - !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED) -// ok -#else -#error \ - "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)." -#endif - -#ifndef STBI_MALLOC -#define STBI_MALLOC(sz) malloc(sz) -#define STBI_REALLOC(p, newsz) realloc(p, newsz) -#define STBI_FREE(p) free(p) -#endif - -#ifndef STBI_REALLOC_SIZED -#define STBI_REALLOC_SIZED(p, oldsz, newsz) STBI_REALLOC(p, newsz) -#endif - -// x86/x64 detection -#if defined(__x86_64__) || defined(_M_X64) -#define STBI__X64_TARGET -#elif defined(__i386) || defined(_M_IX86) -#define STBI__X86_TARGET -#endif - -#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && \ - !defined(STBI_NO_SIMD) -// gcc doesn't support sse2 intrinsics unless you compile with -msse2, -// which in turn means it gets to use SSE2 everywhere. This is unfortunate, -// but previous attempts to provide the SSE2 functions with runtime -// detection caused numerous issues. The way architecture extensions are -// exposed in GCC/Clang is, sadly, not really suited for one-file libs. -// New behavior: if compiled with -msse2, we use SSE2 without any -// detection; if not, we don't use it at all. 
-#define STBI_NO_SIMD -#endif - -#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && \ - !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD) -// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid -// STBI__X64_TARGET -// -// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the -// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant. -// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not -// simultaneously enabling "-mstackrealign". -// -// See https://github.com/nothings/stb/issues/81 for more information. -// -// So default to no SSE2 on 32-bit MinGW. If you've read this far and added -// -mstackrealign to your build settings, feel free to #define -// STBI_MINGW_ENABLE_SSE2. -#define STBI_NO_SIMD -#endif - -#if !defined(STBI_NO_SIMD) && \ - (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) -#define STBI_SSE2 -#include <emmintrin.h> - -#ifdef _MSC_VER - -#if _MSC_VER >= 1400 // not VC6 -#include <intrin.h> // __cpuid -static int stbi__cpuid3(void) { - int info[4]; - __cpuid(info, 1); - return info[3]; -} -#else -static int stbi__cpuid3(void) { - int res; - __asm { - mov eax,1 - cpuid - mov res,edx - } - return res; -} -#endif - -#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name - -#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) -static int stbi__sse2_available(void) { - int info3 = stbi__cpuid3(); - return ((info3 >> 26) & 1) != 0; -} -#endif - -#else // assume GCC-style if not VC++ -#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) - -#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) -static int stbi__sse2_available(void) { - // If we're even attempting to compile this on GCC/Clang, that means - // -msse2 is on, which means the compiler is allowed to use SSE2 - // instructions at will, and so are we. 
- return 1; -} -#endif - -#endif -#endif - -// ARM NEON -#if defined(STBI_NO_SIMD) && defined(STBI_NEON) -#undef STBI_NEON -#endif - -#ifdef STBI_NEON -#include <arm_neon.h> -// assume GCC or Clang on ARM targets -#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) -#endif - -#ifndef STBI_SIMD_ALIGN -#define STBI_SIMD_ALIGN(type, name) type name -#endif - -/////////////////////////////////////////////// -// -// stbi__context struct and start_xxx functions - -// stbi__context structure is our basic context used by all images, so it -// contains all the IO context, plus some basic image information -typedef struct { - stbi__uint32 img_x, img_y; - int img_n, img_out_n; - - stbi_io_callbacks io; - void *io_user_data; - - int read_from_callbacks; - int buflen; - stbi_uc buffer_start[128]; - - stbi_uc *img_buffer, *img_buffer_end; - stbi_uc *img_buffer_original, *img_buffer_original_end; -} stbi__context; - -static void stbi__refill_buffer(stbi__context *s); - -// initialize a memory-decode context -static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) { - s->io.read = NULL; - s->read_from_callbacks = 0; - s->img_buffer = s->img_buffer_original = (stbi_uc *)buffer; - s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *)buffer + len; -} - -// initialize a callback-based context -static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, - void *user) { - s->io = *c; - s->io_user_data = user; - s->buflen = sizeof(s->buffer_start); - s->read_from_callbacks = 1; - s->img_buffer_original = s->buffer_start; - stbi__refill_buffer(s); - s->img_buffer_original_end = s->img_buffer_end; -} - -#ifndef STBI_NO_STDIO - -static int stbi__stdio_read(void *user, char *data, int size) { - return (int)fread(data, 1, size, (FILE *)user); -} - -static void stbi__stdio_skip(void *user, int n) { - fseek((FILE *)user, n, SEEK_CUR); -} - -static int stbi__stdio_eof(void *user) { return feof((FILE *)user); } - -static 
stbi_io_callbacks stbi__stdio_callbacks = { - stbi__stdio_read, - stbi__stdio_skip, - stbi__stdio_eof, -}; - -static void stbi__start_file(stbi__context *s, FILE *f) { - stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *)f); -} - -// static void stop_file(stbi__context *s) { } - -#endif // !STBI_NO_STDIO - -static void stbi__rewind(stbi__context *s) { - // conceptually rewind SHOULD rewind to the beginning of the stream, - // but we just rewind to the beginning of the initial buffer, because - // we only use it after doing 'test', which only ever looks at at most 92 - // bytes - s->img_buffer = s->img_buffer_original; - s->img_buffer_end = s->img_buffer_original_end; -} - -enum { STBI_ORDER_RGB, STBI_ORDER_BGR }; - -typedef struct { - int bits_per_channel; - int num_channels; - int channel_order; -} stbi__result_info; - -#ifndef STBI_NO_JPEG -static int stbi__jpeg_test(stbi__context *s); -static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -#ifndef STBI_NO_PNG -static int stbi__png_test(stbi__context *s); -static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); -static int stbi__png_is16(stbi__context *s); -#endif - -#ifndef STBI_NO_BMP -static int stbi__bmp_test(stbi__context *s); -static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -#ifndef STBI_NO_TGA -static int stbi__tga_test(stbi__context *s); -static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -#ifndef STBI_NO_PSD -static int 
stbi__psd_test(stbi__context *s); -static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri, int bpc); -static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); -static int stbi__psd_is16(stbi__context *s); -#endif - -#ifndef STBI_NO_HDR -static int stbi__hdr_test(stbi__context *s); -static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -#ifndef STBI_NO_PIC -static int stbi__pic_test(stbi__context *s); -static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -#ifndef STBI_NO_GIF -static int stbi__gif_test(stbi__context *s); -static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, - int *z, int *comp, int req_comp); -static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -#ifndef STBI_NO_PNM -static int stbi__pnm_test(stbi__context *s); -static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri); -static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); -#endif - -// this is not threadsafe -static const char *stbi__g_failure_reason; - -STBIDEF const char *stbi_failure_reason(void) { return stbi__g_failure_reason; } - -static int stbi__err(const char *str) { - stbi__g_failure_reason = str; - return 0; -} - -static void *stbi__malloc(size_t size) { return STBI_MALLOC(size); } - -// stb_image uses ints pervasively, including for offset calculations. -// therefore the largest decoded image size we can support with the -// current code, even on 64-bit targets, is INT_MAX. 
this is not a -// significant limitation for the intended use case. -// -// we do, however, need to make sure our size calculations don't -// overflow. hence a few helper functions for size calculations that -// multiply integers together, making sure that they're non-negative -// and no overflow occurs. - -// return 1 if the sum is valid, 0 on overflow. -// negative terms are considered invalid. -static int stbi__addsizes_valid(int a, int b) { - if (b < 0) - return 0; - // now 0 <= b <= INT_MAX, hence also - // 0 <= INT_MAX - b <= INTMAX. - // And "a + b <= INT_MAX" (which might overflow) is the - // same as a <= INT_MAX - b (no overflow) - return a <= INT_MAX - b; -} - -// returns 1 if the product is valid, 0 on overflow. -// negative factors are considered invalid. -static int stbi__mul2sizes_valid(int a, int b) { - if (a < 0 || b < 0) - return 0; - if (b == 0) - return 1; // mul-by-0 is always safe - // portable way to check for no overflows in a*b - return a <= INT_MAX / b; -} - -// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow -static int stbi__mad2sizes_valid(int a, int b, int add) { - return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add); -} - -// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow -static int stbi__mad3sizes_valid(int a, int b, int c, int add) { - return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && - stbi__addsizes_valid(a * b * c, add); -} - -// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't -// overflow -#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) -static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { - return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) && - stbi__mul2sizes_valid(a * b * c, d) && - stbi__addsizes_valid(a * b * c * d, add); -} -#endif - -// mallocs with size overflow checking -static void *stbi__malloc_mad2(int a, int b, int add) { - if 
(!stbi__mad2sizes_valid(a, b, add)) - return NULL; - return stbi__malloc(a * b + add); -} - -static void *stbi__malloc_mad3(int a, int b, int c, int add) { - if (!stbi__mad3sizes_valid(a, b, c, add)) - return NULL; - return stbi__malloc(a * b * c + add); -} - -#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) -static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) { - if (!stbi__mad4sizes_valid(a, b, c, d, add)) - return NULL; - return stbi__malloc(a * b * c * d + add); -} -#endif - -// stbi__err - error -// stbi__errpf - error returning pointer to float -// stbi__errpuc - error returning pointer to unsigned char - -#ifdef STBI_NO_FAILURE_STRINGS -#define stbi__err(x, y) 0 -#elif defined(STBI_FAILURE_USERMSG) -#define stbi__err(x, y) stbi__err(y) -#else -#define stbi__err(x, y) stbi__err(x) -#endif - -#define stbi__errpf(x, y) ((float *)(size_t)(stbi__err(x, y) ? NULL : NULL)) -#define stbi__errpuc(x, y) \ - ((unsigned char *)(size_t)(stbi__err(x, y) ? NULL : NULL)) - -STBIDEF void stbi_image_free(void *retval_from_stbi_load) { - STBI_FREE(retval_from_stbi_load); -} - -#ifndef STBI_NO_LINEAR -static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); -#endif - -#ifndef STBI_NO_HDR -static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); -#endif - -static int stbi__vertically_flip_on_load = 0; - -STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) { - stbi__vertically_flip_on_load = flag_true_if_should_flip; -} - -static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri, int bpc) { - memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields - ri->bits_per_channel = - 8; // default is 8 so most paths don't have to be changed - ri->channel_order = - STBI_ORDER_RGB; // all current input & output are this, but this is here - // so we can add BGR order - ri->num_channels = 0; - -#ifndef STBI_NO_JPEG - if (stbi__jpeg_test(s)) - 
return stbi__jpeg_load(s, x, y, comp, req_comp, ri); -#endif -#ifndef STBI_NO_PNG - if (stbi__png_test(s)) - return stbi__png_load(s, x, y, comp, req_comp, ri); -#endif -#ifndef STBI_NO_BMP - if (stbi__bmp_test(s)) - return stbi__bmp_load(s, x, y, comp, req_comp, ri); -#endif -#ifndef STBI_NO_GIF - if (stbi__gif_test(s)) - return stbi__gif_load(s, x, y, comp, req_comp, ri); -#endif -#ifndef STBI_NO_PSD - if (stbi__psd_test(s)) - return stbi__psd_load(s, x, y, comp, req_comp, ri, bpc); -#endif -#ifndef STBI_NO_PIC - if (stbi__pic_test(s)) - return stbi__pic_load(s, x, y, comp, req_comp, ri); -#endif -#ifndef STBI_NO_PNM - if (stbi__pnm_test(s)) - return stbi__pnm_load(s, x, y, comp, req_comp, ri); -#endif - -#ifndef STBI_NO_HDR - if (stbi__hdr_test(s)) { - float *hdr = stbi__hdr_load(s, x, y, comp, req_comp, ri); - return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); - } -#endif - -#ifndef STBI_NO_TGA - // test tga last because it's a crappy test! - if (stbi__tga_test(s)) - return stbi__tga_load(s, x, y, comp, req_comp, ri); -#endif - - return stbi__errpuc("unknown image type", - "Image not of any known type, or corrupt"); -} - -static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, - int channels) { - int i; - int img_len = w * h * channels; - stbi_uc *reduced; - - reduced = (stbi_uc *)stbi__malloc(img_len); - if (reduced == NULL) - return stbi__errpuc("outofmem", "Out of memory"); - - for (i = 0; i < img_len; ++i) - reduced[i] = - (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient - // approx of 16->8 bit scaling - - STBI_FREE(orig); - return reduced; -} - -static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, - int channels) { - int i; - int img_len = w * h * channels; - stbi__uint16 *enlarged; - - enlarged = (stbi__uint16 *)stbi__malloc(img_len * 2); - if (enlarged == NULL) - return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory"); - - for (i = 0; i < img_len; ++i) - enlarged[i] = 
(stbi__uint16)( - (orig[i] << 8) + - orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff - - STBI_FREE(orig); - return enlarged; -} - -static void stbi__vertical_flip(void *image, int w, int h, - int bytes_per_pixel) { - int row; - size_t bytes_per_row = (size_t)w * bytes_per_pixel; - stbi_uc temp[2048]; - stbi_uc *bytes = (stbi_uc *)image; - - for (row = 0; row < (h >> 1); row++) { - stbi_uc *row0 = bytes + row * bytes_per_row; - stbi_uc *row1 = bytes + (h - row - 1) * bytes_per_row; - // swap row0 with row1 - size_t bytes_left = bytes_per_row; - while (bytes_left) { - size_t bytes_copy = - (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); - memcpy(temp, row0, bytes_copy); - memcpy(row0, row1, bytes_copy); - memcpy(row1, temp, bytes_copy); - row0 += bytes_copy; - row1 += bytes_copy; - bytes_left -= bytes_copy; - } - } -} - -#ifndef STBI_NO_GIF -static void stbi__vertical_flip_slices(void *image, int w, int h, int z, - int bytes_per_pixel) { - int slice; - int slice_size = w * h * bytes_per_pixel; - - stbi_uc *bytes = (stbi_uc *)image; - for (slice = 0; slice < z; ++slice) { - stbi__vertical_flip(bytes, w, h, bytes_per_pixel); - bytes += slice_size; - } -} -#endif - -static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, - int *y, int *comp, - int req_comp) { - stbi__result_info ri; - void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); - - if (result == NULL) - return NULL; - - if (ri.bits_per_channel != 8) { - STBI_ASSERT(ri.bits_per_channel == 16); - result = stbi__convert_16_to_8((stbi__uint16 *)result, *x, *y, - req_comp == 0 ? *comp : req_comp); - ri.bits_per_channel = 8; - } - - // @TODO: move stbi__convert_format to here - - if (stbi__vertically_flip_on_load) { - int channels = req_comp ? 
req_comp : *comp; - stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); - } - - return (unsigned char *)result; -} - -static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, - int *y, int *comp, - int req_comp) { - stbi__result_info ri; - void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); - - if (result == NULL) - return NULL; - - if (ri.bits_per_channel != 16) { - STBI_ASSERT(ri.bits_per_channel == 8); - result = stbi__convert_8_to_16((stbi_uc *)result, *x, *y, - req_comp == 0 ? *comp : req_comp); - ri.bits_per_channel = 16; - } - - // @TODO: move stbi__convert_format16 to here - // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to - // keep more precision - - if (stbi__vertically_flip_on_load) { - int channels = req_comp ? req_comp : *comp; - stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); - } - - return (stbi__uint16 *)result; -} - -#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) -static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, - int req_comp) { - if (stbi__vertically_flip_on_load && result != NULL) { - int channels = req_comp ? 
req_comp : *comp; - stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); - } -} -#endif - -#ifndef STBI_NO_STDIO - -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) -STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar( - unsigned int cp, unsigned long flags, const char *str, int cbmb, - wchar_t *widestr, int cchwide); -STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte( - unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, - char *str, int cbmb, const char *defchar, int *used_default); -#endif - -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) -STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, - const wchar_t *input) { - return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, - (int)bufferlen, NULL, NULL); -} -#endif - -static FILE *stbi__fopen(char const *filename, char const *mode) { - FILE *f; -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) - wchar_t wMode[64]; - wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, - sizeof(wFilename))) - return 0; - - if (0 == - MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode))) - return 0; - -#if _MSC_VER >= 1400 - if (0 != _wfopen_s(&f, wFilename, wMode)) - f = 0; -#else - f = _wfopen(wFilename, wMode); -#endif - -#elif defined(_MSC_VER) && _MSC_VER >= 1400 - if (0 != fopen_s(&f, filename, mode)) - f = 0; -#else - f = fopen(filename, mode); -#endif - return f; -} - -STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, - int req_comp) { - FILE *f = stbi__fopen(filename, "rb"); - unsigned char *result; - if (!f) - return stbi__errpuc("can't fopen", "Unable to open file"); - result = stbi_load_from_file(f, x, y, comp, req_comp); - fclose(f); - return result; -} - -STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, - int req_comp) { - unsigned char *result; - stbi__context s; - stbi__start_file(&s, f); - 
result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); - if (result) { - // need to 'unget' all the characters in the IO buffer - fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); - } - return result; -} - -STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, - int req_comp) { - stbi__uint16 *result; - stbi__context s; - stbi__start_file(&s, f); - result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp); - if (result) { - // need to 'unget' all the characters in the IO buffer - fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR); - } - return result; -} - -STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, - int req_comp) { - FILE *f = stbi__fopen(filename, "rb"); - stbi__uint16 *result; - if (!f) - return (stbi_us *)stbi__errpuc("can't fopen", "Unable to open file"); - result = stbi_load_from_file_16(f, x, y, comp, req_comp); - fclose(f); - return result; -} - -#endif //! STBI_NO_STDIO - -STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, - int *x, int *y, int *channels_in_file, - int desired_channels) { - stbi__context s; - stbi__start_mem(&s, buffer, len); - return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, - desired_channels); -} - -STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, - void *user, int *x, int *y, - int *channels_in_file, - int desired_channels) { - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); - return stbi__load_and_postprocess_16bit(&s, x, y, channels_in_file, - desired_channels); -} - -STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, - int *y, int *comp, int req_comp) { - stbi__context s; - stbi__start_mem(&s, buffer, len); - return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); -} - -STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, - void *user, int *x, int *y, int *comp, - int 
req_comp) { - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); - return stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp); -} - -#ifndef STBI_NO_GIF -STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, - int **delays, int *x, int *y, int *z, - int *comp, int req_comp) { - unsigned char *result; - stbi__context s; - stbi__start_mem(&s, buffer, len); - - result = - (unsigned char *)stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); - if (stbi__vertically_flip_on_load) { - stbi__vertical_flip_slices(result, *x, *y, *z, *comp); - } - - return result; -} -#endif - -#ifndef STBI_NO_LINEAR -static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, - int req_comp) { - unsigned char *data; -#ifndef STBI_NO_HDR - if (stbi__hdr_test(s)) { - stbi__result_info ri; - float *hdr_data = stbi__hdr_load(s, x, y, comp, req_comp, &ri); - if (hdr_data) - stbi__float_postprocess(hdr_data, x, y, comp, req_comp); - return hdr_data; - } -#endif - data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); - if (data) - return stbi__ldr_to_hdr(data, *x, *y, req_comp ? 
req_comp : *comp); - return stbi__errpf("unknown image type", - "Image not of any known type, or corrupt"); -} - -STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, - int *y, int *comp, int req_comp) { - stbi__context s; - stbi__start_mem(&s, buffer, len); - return stbi__loadf_main(&s, x, y, comp, req_comp); -} - -STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, - void *user, int *x, int *y, int *comp, - int req_comp) { - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); - return stbi__loadf_main(&s, x, y, comp, req_comp); -} - -#ifndef STBI_NO_STDIO -STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, - int req_comp) { - float *result; - FILE *f = stbi__fopen(filename, "rb"); - if (!f) - return stbi__errpf("can't fopen", "Unable to open file"); - result = stbi_loadf_from_file(f, x, y, comp, req_comp); - fclose(f); - return result; -} - -STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, - int req_comp) { - stbi__context s; - stbi__start_file(&s, f); - return stbi__loadf_main(&s, x, y, comp, req_comp); -} -#endif // !STBI_NO_STDIO - -#endif // !STBI_NO_LINEAR - -// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is -// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always -// reports false! 
- -STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) { -#ifndef STBI_NO_HDR - stbi__context s; - stbi__start_mem(&s, buffer, len); - return stbi__hdr_test(&s); -#else - STBI_NOTUSED(buffer); - STBI_NOTUSED(len); - return 0; -#endif -} - -#ifndef STBI_NO_STDIO -STBIDEF int stbi_is_hdr(char const *filename) { - FILE *f = stbi__fopen(filename, "rb"); - int result = 0; - if (f) { - result = stbi_is_hdr_from_file(f); - fclose(f); - } - return result; -} - -STBIDEF int stbi_is_hdr_from_file(FILE *f) { -#ifndef STBI_NO_HDR - long pos = ftell(f); - int res; - stbi__context s; - stbi__start_file(&s, f); - res = stbi__hdr_test(&s); - fseek(f, pos, SEEK_SET); - return res; -#else - STBI_NOTUSED(f); - return 0; -#endif -} -#endif // !STBI_NO_STDIO - -STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, - void *user) { -#ifndef STBI_NO_HDR - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); - return stbi__hdr_test(&s); -#else - STBI_NOTUSED(clbk); - STBI_NOTUSED(user); - return 0; -#endif -} - -#ifndef STBI_NO_LINEAR -static float stbi__l2h_gamma = 2.2f, stbi__l2h_scale = 1.0f; - -STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } -STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } -#endif - -static float stbi__h2l_gamma_i = 1.0f / 2.2f, stbi__h2l_scale_i = 1.0f; - -STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { - stbi__h2l_gamma_i = 1 / gamma; -} -STBIDEF void stbi_hdr_to_ldr_scale(float scale) { - stbi__h2l_scale_i = 1 / scale; -} - -////////////////////////////////////////////////////////////////////////////// -// -// Common code used by all image loaders -// - -enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header }; - -static void stbi__refill_buffer(stbi__context *s) { - int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen); - if (n == 0) { - // at end of file, treat same as if from memory, but need to handle case - // where 
s->img_buffer isn't pointing to safe memory, e.g. 0-byte file - s->read_from_callbacks = 0; - s->img_buffer = s->buffer_start; - s->img_buffer_end = s->buffer_start + 1; - *s->img_buffer = 0; - } else { - s->img_buffer = s->buffer_start; - s->img_buffer_end = s->buffer_start + n; - } -} - -stbi_inline static stbi_uc stbi__get8(stbi__context *s) { - if (s->img_buffer < s->img_buffer_end) - return *s->img_buffer++; - if (s->read_from_callbacks) { - stbi__refill_buffer(s); - return *s->img_buffer++; - } - return 0; -} - -stbi_inline static int stbi__at_eof(stbi__context *s) { - if (s->io.read) { - if (!(s->io.eof)(s->io_user_data)) - return 0; - // if feof() is true, check if buffer = end - // special case: we've only got the special 0 character at the end - if (s->read_from_callbacks == 0) - return 1; - } - - return s->img_buffer >= s->img_buffer_end; -} - -static void stbi__skip(stbi__context *s, int n) { - if (n < 0) { - s->img_buffer = s->img_buffer_end; - return; - } - if (s->io.read) { - int blen = (int)(s->img_buffer_end - s->img_buffer); - if (blen < n) { - s->img_buffer = s->img_buffer_end; - (s->io.skip)(s->io_user_data, n - blen); - return; - } - } - s->img_buffer += n; -} - -static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) { - if (s->io.read) { - int blen = (int)(s->img_buffer_end - s->img_buffer); - if (blen < n) { - int res, count; - - memcpy(buffer, s->img_buffer, blen); - - count = (s->io.read)(s->io_user_data, (char *)buffer + blen, n - blen); - res = (count == (n - blen)); - s->img_buffer = s->img_buffer_end; - return res; - } - } - - if (s->img_buffer + n <= s->img_buffer_end) { - memcpy(buffer, s->img_buffer, n); - s->img_buffer += n; - return 1; - } else - return 0; -} - -static int stbi__get16be(stbi__context *s) { - int z = stbi__get8(s); - return (z << 8) + stbi__get8(s); -} - -static stbi__uint32 stbi__get32be(stbi__context *s) { - stbi__uint32 z = stbi__get16be(s); - return (z << 16) + stbi__get16be(s); -} - -#if 
defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) -// nothing -#else -static int stbi__get16le(stbi__context *s) { - int z = stbi__get8(s); - return z + (stbi__get8(s) << 8); -} -#endif - -#ifndef STBI_NO_BMP -static stbi__uint32 stbi__get32le(stbi__context *s) { - stbi__uint32 z = stbi__get16le(s); - return z + (stbi__get16le(s) << 16); -} -#endif - -#define STBI__BYTECAST(x) \ - ((stbi_uc)((x)&255)) // truncate int to byte without warnings - -////////////////////////////////////////////////////////////////////////////// -// -// generic converter from built-in img_n to req_comp -// individual types do this automatically as much as possible (e.g. jpeg -// does all cases internally since it needs to colorspace convert anyway, -// and it never has alpha, so very few cases ). png can automatically -// interleave an alpha=255 channel, but falls back to this for other cases -// -// assume data buffer is malloced, so malloc a new one and free that one -// only failure mode is malloc failing - -static stbi_uc stbi__compute_y(int r, int g, int b) { - return (stbi_uc)(((r * 77) + (g * 150) + (29 * b)) >> 8); -} - -static unsigned char *stbi__convert_format(unsigned char *data, int img_n, - int req_comp, unsigned int x, - unsigned int y) { - int i, j; - unsigned char *good; - - if (req_comp == img_n) - return data; - STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - - good = (unsigned char *)stbi__malloc_mad3(req_comp, x, y, 0); - if (good == NULL) { - STBI_FREE(data); - return stbi__errpuc("outofmem", "Out of memory"); - } - - for (j = 0; j < (int)y; ++j) { - unsigned char *src = data + j * x * img_n; - unsigned char *dest = good + j * x * req_comp; - -#define STBI__COMBO(a, b) ((a)*8 + (b)) -#define STBI__CASE(a, b) \ - case STBI__COMBO(a, b): \ - for (i = x - 1; i >= 0; --i, src += a, dest += b) - // convert source image with img_n components to one with req_comp - // components; avoid switch per pixel, so use switch per scanline and - // massive macros - 
switch (STBI__COMBO(img_n, req_comp)) { - STBI__CASE(1, 2) { - dest[0] = src[0]; - dest[1] = 255; - } - break; - STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } - break; - STBI__CASE(1, 4) { - dest[0] = dest[1] = dest[2] = src[0]; - dest[3] = 255; - } - break; - STBI__CASE(2, 1) { dest[0] = src[0]; } - break; - STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } - break; - STBI__CASE(2, 4) { - dest[0] = dest[1] = dest[2] = src[0]; - dest[3] = src[1]; - } - break; - STBI__CASE(3, 4) { - dest[0] = src[0]; - dest[1] = src[1]; - dest[2] = src[2]; - dest[3] = 255; - } - break; - STBI__CASE(3, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); } - break; - STBI__CASE(3, 2) { - dest[0] = stbi__compute_y(src[0], src[1], src[2]); - dest[1] = 255; - } - break; - STBI__CASE(4, 1) { dest[0] = stbi__compute_y(src[0], src[1], src[2]); } - break; - STBI__CASE(4, 2) { - dest[0] = stbi__compute_y(src[0], src[1], src[2]); - dest[1] = src[3]; - } - break; - STBI__CASE(4, 3) { - dest[0] = src[0]; - dest[1] = src[1]; - dest[2] = src[2]; - } - break; - default: - STBI_ASSERT(0); - } -#undef STBI__CASE - } - - STBI_FREE(data); - return good; -} - -static stbi__uint16 stbi__compute_y_16(int r, int g, int b) { - return (stbi__uint16)(((r * 77) + (g * 150) + (29 * b)) >> 8); -} - -static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, - int req_comp, unsigned int x, - unsigned int y) { - int i, j; - stbi__uint16 *good; - - if (req_comp == img_n) - return data; - STBI_ASSERT(req_comp >= 1 && req_comp <= 4); - - good = (stbi__uint16 *)stbi__malloc(req_comp * x * y * 2); - if (good == NULL) { - STBI_FREE(data); - return (stbi__uint16 *)stbi__errpuc("outofmem", "Out of memory"); - } - - for (j = 0; j < (int)y; ++j) { - stbi__uint16 *src = data + j * x * img_n; - stbi__uint16 *dest = good + j * x * req_comp; - -#define STBI__COMBO(a, b) ((a)*8 + (b)) -#define STBI__CASE(a, b) \ - case STBI__COMBO(a, b): \ - for (i = x - 1; i >= 0; --i, src += a, dest += 
b) - // convert source image with img_n components to one with req_comp - // components; avoid switch per pixel, so use switch per scanline and - // massive macros - switch (STBI__COMBO(img_n, req_comp)) { - STBI__CASE(1, 2) { - dest[0] = src[0]; - dest[1] = 0xffff; - } - break; - STBI__CASE(1, 3) { dest[0] = dest[1] = dest[2] = src[0]; } - break; - STBI__CASE(1, 4) { - dest[0] = dest[1] = dest[2] = src[0]; - dest[3] = 0xffff; - } - break; - STBI__CASE(2, 1) { dest[0] = src[0]; } - break; - STBI__CASE(2, 3) { dest[0] = dest[1] = dest[2] = src[0]; } - break; - STBI__CASE(2, 4) { - dest[0] = dest[1] = dest[2] = src[0]; - dest[3] = src[1]; - } - break; - STBI__CASE(3, 4) { - dest[0] = src[0]; - dest[1] = src[1]; - dest[2] = src[2]; - dest[3] = 0xffff; - } - break; - STBI__CASE(3, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); } - break; - STBI__CASE(3, 2) { - dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); - dest[1] = 0xffff; - } - break; - STBI__CASE(4, 1) { dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); } - break; - STBI__CASE(4, 2) { - dest[0] = stbi__compute_y_16(src[0], src[1], src[2]); - dest[1] = src[3]; - } - break; - STBI__CASE(4, 3) { - dest[0] = src[0]; - dest[1] = src[1]; - dest[2] = src[2]; - } - break; - default: - STBI_ASSERT(0); - } -#undef STBI__CASE - } - - STBI_FREE(data); - return good; -} - -#ifndef STBI_NO_LINEAR -static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) { - int i, k, n; - float *output; - if (!data) - return NULL; - output = (float *)stbi__malloc_mad4(x, y, comp, sizeof(float), 0); - if (output == NULL) { - STBI_FREE(data); - return stbi__errpf("outofmem", "Out of memory"); - } - // compute number of non-alpha components - if (comp & 1) - n = comp; - else - n = comp - 1; - for (i = 0; i < x * y; ++i) { - for (k = 0; k < n; ++k) { - output[i * comp + k] = - (float)(pow(data[i * comp + k] / 255.0f, stbi__l2h_gamma) * - stbi__l2h_scale); - } - } - if (n < comp) { - for (i = 0; i < x * y; ++i) { 
- output[i * comp + n] = data[i * comp + n] / 255.0f; - } - } - STBI_FREE(data); - return output; -} -#endif - -#ifndef STBI_NO_HDR -#define stbi__float2int(x) ((int)(x)) -static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) { - int i, k, n; - stbi_uc *output; - if (!data) - return NULL; - output = (stbi_uc *)stbi__malloc_mad3(x, y, comp, 0); - if (output == NULL) { - STBI_FREE(data); - return stbi__errpuc("outofmem", "Out of memory"); - } - // compute number of non-alpha components - if (comp & 1) - n = comp; - else - n = comp - 1; - for (i = 0; i < x * y; ++i) { - for (k = 0; k < n; ++k) { - float z = (float)pow(data[i * comp + k] * stbi__h2l_scale_i, - stbi__h2l_gamma_i) * - 255 + - 0.5f; - if (z < 0) - z = 0; - if (z > 255) - z = 255; - output[i * comp + k] = (stbi_uc)stbi__float2int(z); - } - if (k < comp) { - float z = data[i * comp + k] * 255 + 0.5f; - if (z < 0) - z = 0; - if (z > 255) - z = 255; - output[i * comp + k] = (stbi_uc)stbi__float2int(z); - } - } - STBI_FREE(data); - return output; -} -#endif - -////////////////////////////////////////////////////////////////////////////// -// -// "baseline" JPEG/JFIF decoder -// -// simple implementation -// - doesn't support delayed output of y-dimension -// - simple interface (only one output format: 8-bit interleaved RGB) -// - doesn't try to recover corrupt jpegs -// - doesn't allow partial loading, loading multiple at once -// - still fast on x86 (copying globals into locals doesn't help x86) -// - allocates lots of intermediate memory (full size of all components) -// - non-interleaved case requires this anyway -// - allows good upsampling (see next) -// high-quality -// - upsampled channels are bilinearly interpolated, even across blocks -// - quality integer IDCT derived from IJG's 'slow' -// performance -// - fast huffman; reasonable integer IDCT -// - some SIMD kernels for common paths on targets with SSE2/NEON -// - uses a lot of intermediate memory, could cache poorly - -#ifndef 
STBI_NO_JPEG - -// huffman decoding acceleration -#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache - -typedef struct { - stbi_uc fast[1 << FAST_BITS]; - // weirdly, repacking this into AoS is a 10% speed loss, instead of a win - stbi__uint16 code[256]; - stbi_uc values[256]; - stbi_uc size[257]; - unsigned int maxcode[18]; - int delta[17]; // old 'firstsymbol' - old 'firstcode' -} stbi__huffman; - -typedef struct { - stbi__context *s; - stbi__huffman huff_dc[4]; - stbi__huffman huff_ac[4]; - stbi__uint16 dequant[4][64]; - stbi__int16 fast_ac[4][1 << FAST_BITS]; - - // sizes for components, interleaved MCUs - int img_h_max, img_v_max; - int img_mcu_x, img_mcu_y; - int img_mcu_w, img_mcu_h; - - // definition of jpeg image component - struct { - int id; - int h, v; - int tq; - int hd, ha; - int dc_pred; - - int x, y, w2, h2; - stbi_uc *data; - void *raw_data, *raw_coeff; - stbi_uc *linebuf; - short *coeff; // progressive only - int coeff_w, coeff_h; // number of 8x8 coefficient blocks - } img_comp[4]; - - stbi__uint32 code_buffer; // jpeg entropy-coded buffer - int code_bits; // number of valid bits - unsigned char marker; // marker seen while filling entropy buffer - int nomore; // flag if we saw a marker so must stop - - int progressive; - int spec_start; - int spec_end; - int succ_high; - int succ_low; - int eob_run; - int jfif; - int app14_color_transform; // Adobe APP14 tag - int rgb; - - int scan_n, order[4]; - int restart_interval, todo; - - // kernels - void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); - void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, - const stbi_uc *pcb, const stbi_uc *pcr, int count, - int step); - stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, - stbi_uc *in_far, int w, int hs); -} stbi__jpeg; - -static int stbi__build_huffman(stbi__huffman *h, int *count) { - int i, j, k = 0; - unsigned int code; - // build size list for each symbol (from JPEG spec) - for 
(i = 0; i < 16; ++i) - for (j = 0; j < count[i]; ++j) - h->size[k++] = (stbi_uc)(i + 1); - h->size[k] = 0; - - // compute actual symbols (from jpeg spec) - code = 0; - k = 0; - for (j = 1; j <= 16; ++j) { - // compute delta to add to code to compute symbol id - h->delta[j] = k - code; - if (h->size[k] == j) { - while (h->size[k] == j) - h->code[k++] = (stbi__uint16)(code++); - if (code - 1 >= (1u << j)) - return stbi__err("bad code lengths", "Corrupt JPEG"); - } - // compute largest code + 1 for this size, preshifted as needed later - h->maxcode[j] = code << (16 - j); - code <<= 1; - } - h->maxcode[j] = 0xffffffff; - - // build non-spec acceleration table; 255 is flag for not-accelerated - memset(h->fast, 255, 1 << FAST_BITS); - for (i = 0; i < k; ++i) { - int s = h->size[i]; - if (s <= FAST_BITS) { - int c = h->code[i] << (FAST_BITS - s); - int m = 1 << (FAST_BITS - s); - for (j = 0; j < m; ++j) { - h->fast[c + j] = (stbi_uc)i; - } - } - } - return 1; -} - -// build a table that decodes both magnitude and value of small ACs in -// one go. -static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) { - int i; - for (i = 0; i < (1 << FAST_BITS); ++i) { - stbi_uc fast = h->fast[i]; - fast_ac[i] = 0; - if (fast < 255) { - int rs = h->values[fast]; - int run = (rs >> 4) & 15; - int magbits = rs & 15; - int len = h->size[fast]; - - if (magbits && len + magbits <= FAST_BITS) { - // magnitude code followed by receive_extend code - int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); - int m = 1 << (magbits - 1); - if (k < m) - k += (~0U << magbits) + 1; - // if the result is small enough, we can fit it in fast_ac table - if (k >= -128 && k <= 127) - fast_ac[i] = (stbi__int16)((k * 256) + (run * 16) + (len + magbits)); - } - } - } -} - -static void stbi__grow_buffer_unsafe(stbi__jpeg *j) { - do { - unsigned int b = j->nomore ? 
0 : stbi__get8(j->s); - if (b == 0xff) { - int c = stbi__get8(j->s); - while (c == 0xff) - c = stbi__get8(j->s); // consume fill bytes - if (c != 0) { - j->marker = (unsigned char)c; - j->nomore = 1; - return; - } - } - j->code_buffer |= b << (24 - j->code_bits); - j->code_bits += 8; - } while (j->code_bits <= 24); -} - -// (1 << n) - 1 -static const stbi__uint32 stbi__bmask[17] = { - 0, 1, 3, 7, 15, 31, 63, 127, 255, - 511, 1023, 2047, 4095, 8191, 16383, 32767, 65535}; - -// decode a jpeg huffman value from the bitstream -stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) { - unsigned int temp; - int c, k; - - if (j->code_bits < 16) - stbi__grow_buffer_unsafe(j); - - // look at the top FAST_BITS and determine what symbol ID it is, - // if the code is <= FAST_BITS - c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); - k = h->fast[c]; - if (k < 255) { - int s = h->size[k]; - if (s > j->code_bits) - return -1; - j->code_buffer <<= s; - j->code_bits -= s; - return h->values[k]; - } - - // naive test is to shift the code_buffer down so k bits are - // valid, then test against maxcode. To speed this up, we've - // preshifted maxcode left so that it has (16-k) 0s at the - // end; in other words, regardless of the number of bits, it - // wants to be compared against something shifted to have 16; - // that way we don't need to shift inside the loop. - temp = j->code_buffer >> 16; - for (k = FAST_BITS + 1;; ++k) - if (temp < h->maxcode[k]) - break; - if (k == 17) { - // error! 
code not found - j->code_bits -= 16; - return -1; - } - - if (k > j->code_bits) - return -1; - - // convert the huffman code to the symbol id - c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; - STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & - stbi__bmask[h->size[c]]) == h->code[c]); - - // convert the id to a symbol - j->code_bits -= k; - j->code_buffer <<= k; - return h->values[c]; -} - -// bias[n] = (-1<<n) + 1 -static const int stbi__jbias[16] = {0, -1, -3, -7, -15, -31, - -63, -127, -255, -511, -1023, -2047, - -4095, -8191, -16383, -32767}; - -// combined JPEG 'receive' and JPEG 'extend', since baseline -// always extends everything it receives. -stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) { - unsigned int k; - int sgn; - if (j->code_bits < n) - stbi__grow_buffer_unsafe(j); - - sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB - k = stbi_lrot(j->code_buffer, n); - STBI_ASSERT(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask))); - j->code_buffer = k & ~stbi__bmask[n]; - k &= stbi__bmask[n]; - j->code_bits -= n; - return k + (stbi__jbias[n] & ~sgn); -} - -// get some unsigned bits -stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) { - unsigned int k; - if (j->code_bits < n) - stbi__grow_buffer_unsafe(j); - k = stbi_lrot(j->code_buffer, n); - j->code_buffer = k & ~stbi__bmask[n]; - k &= stbi__bmask[n]; - j->code_bits -= n; - return k; -} - -stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) { - unsigned int k; - if (j->code_bits < 1) - stbi__grow_buffer_unsafe(j); - k = j->code_buffer; - j->code_buffer <<= 1; - --j->code_bits; - return k & 0x80000000; -} - -// given a value that's at position X in the zigzag stream, -// where does it appear in the 8x8 matrix coded as row-major? 
-static const stbi_uc stbi__jpeg_dezigzag[64 + 15] = { - 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, - 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, - 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, - 54, 47, 55, 62, 63, - // let corrupt input sample past end - 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63}; - -// decode one 64-entry block-- -static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], - stbi__huffman *hdc, stbi__huffman *hac, - stbi__int16 *fac, int b, - stbi__uint16 *dequant) { - int diff, dc, k; - int t; - - if (j->code_bits < 16) - stbi__grow_buffer_unsafe(j); - t = stbi__jpeg_huff_decode(j, hdc); - if (t < 0) - return stbi__err("bad huffman code", "Corrupt JPEG"); - - // 0 all the ac values now so we can do it 32-bits at a time - memset(data, 0, 64 * sizeof(data[0])); - - diff = t ? stbi__extend_receive(j, t) : 0; - dc = j->img_comp[b].dc_pred + diff; - j->img_comp[b].dc_pred = dc; - data[0] = (short)(dc * dequant[0]); - - // decode AC components, see JPEG spec - k = 1; - do { - unsigned int zig; - int c, r, s; - if (j->code_bits < 16) - stbi__grow_buffer_unsafe(j); - c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); - r = fac[c]; - if (r) { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length - j->code_buffer <<= s; - j->code_bits -= s; - // decode into unzigzag'd location - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short)((r >> 8) * dequant[zig]); - } else { - int rs = stbi__jpeg_huff_decode(j, hac); - if (rs < 0) - return stbi__err("bad huffman code", "Corrupt JPEG"); - s = rs & 15; - r = rs >> 4; - if (s == 0) { - if (rs != 0xf0) - break; // end block - k += 16; - } else { - k += r; - // decode into unzigzag'd location - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]); - } - } - } while (k < 64); - return 1; -} - -static int 
stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], - stbi__huffman *hdc, int b) { - int diff, dc; - int t; - if (j->spec_end != 0) - return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - - if (j->code_bits < 16) - stbi__grow_buffer_unsafe(j); - - if (j->succ_high == 0) { - // first scan for DC coefficient, must be first - memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now - t = stbi__jpeg_huff_decode(j, hdc); - diff = t ? stbi__extend_receive(j, t) : 0; - - dc = j->img_comp[b].dc_pred + diff; - j->img_comp[b].dc_pred = dc; - data[0] = (short)(dc << j->succ_low); - } else { - // refinement scan for DC coefficient - if (stbi__jpeg_get_bit(j)) - data[0] += (short)(1 << j->succ_low); - } - return 1; -} - -// @OPTIMIZE: store non-zigzagged during the decode passes, -// and only de-zigzag when dequantizing -static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], - stbi__huffman *hac, - stbi__int16 *fac) { - int k; - if (j->spec_start == 0) - return stbi__err("can't merge dc and ac", "Corrupt JPEG"); - - if (j->succ_high == 0) { - int shift = j->succ_low; - - if (j->eob_run) { - --j->eob_run; - return 1; - } - - k = j->spec_start; - do { - unsigned int zig; - int c, r, s; - if (j->code_bits < 16) - stbi__grow_buffer_unsafe(j); - c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1); - r = fac[c]; - if (r) { // fast-AC path - k += (r >> 4) & 15; // run - s = r & 15; // combined length - j->code_buffer <<= s; - j->code_bits -= s; - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short)((r >> 8) << shift); - } else { - int rs = stbi__jpeg_huff_decode(j, hac); - if (rs < 0) - return stbi__err("bad huffman code", "Corrupt JPEG"); - s = rs & 15; - r = rs >> 4; - if (s == 0) { - if (r < 15) { - j->eob_run = (1 << r); - if (r) - j->eob_run += stbi__jpeg_get_bits(j, r); - --j->eob_run; - break; - } - k += 16; - } else { - k += r; - zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short)(stbi__extend_receive(j, s) << 
shift); - } - } - } while (k <= j->spec_end); - } else { - // refinement scan for these AC coefficients - - short bit = (short)(1 << j->succ_low); - - if (j->eob_run) { - --j->eob_run; - for (k = j->spec_start; k <= j->spec_end; ++k) { - short *p = &data[stbi__jpeg_dezigzag[k]]; - if (*p != 0) - if (stbi__jpeg_get_bit(j)) - if ((*p & bit) == 0) { - if (*p > 0) - *p += bit; - else - *p -= bit; - } - } - } else { - k = j->spec_start; - do { - int r, s; - int rs = stbi__jpeg_huff_decode( - j, hac); // @OPTIMIZE see if we can use the fast path here, - // advance-by-r is so slow, eh - if (rs < 0) - return stbi__err("bad huffman code", "Corrupt JPEG"); - s = rs & 15; - r = rs >> 4; - if (s == 0) { - if (r < 15) { - j->eob_run = (1 << r) - 1; - if (r) - j->eob_run += stbi__jpeg_get_bits(j, r); - r = 64; // force end of block - } else { - // r=15 s=0 should write 16 0s, so we just do - // a run of 15 0s and then write s (which is 0), - // so we don't have to do anything special here - } - } else { - if (s != 1) - return stbi__err("bad huffman code", "Corrupt JPEG"); - // sign bit - if (stbi__jpeg_get_bit(j)) - s = bit; - else - s = -bit; - } - - // advance by r - while (k <= j->spec_end) { - short *p = &data[stbi__jpeg_dezigzag[k++]]; - if (*p != 0) { - if (stbi__jpeg_get_bit(j)) - if ((*p & bit) == 0) { - if (*p > 0) - *p += bit; - else - *p -= bit; - } - } else { - if (r == 0) { - *p = (short)s; - break; - } - --r; - } - } - } while (k <= j->spec_end); - } - } - return 1; -} - -// take a -128..127 value and stbi__clamp it and convert to 0..255 -stbi_inline static stbi_uc stbi__clamp(int x) { - // trick to use a single test to catch both cases - if ((unsigned int)x > 255) { - if (x < 0) - return 0; - if (x > 255) - return 255; - } - return (stbi_uc)x; -} - -#define stbi__f2f(x) ((int)(((x)*4096 + 0.5))) -#define stbi__fsh(x) ((x)*4096) - -// derived from jidctint -- DCT_ISLOW -#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \ - int t0, t1, t2, t3, p1, p2, p3, p4, 
p5, x0, x1, x2, x3; \ - p2 = s2; \ - p3 = s6; \ - p1 = (p2 + p3) * stbi__f2f(0.5411961f); \ - t2 = p1 + p3 * stbi__f2f(-1.847759065f); \ - t3 = p1 + p2 * stbi__f2f(0.765366865f); \ - p2 = s0; \ - p3 = s4; \ - t0 = stbi__fsh(p2 + p3); \ - t1 = stbi__fsh(p2 - p3); \ - x0 = t0 + t3; \ - x3 = t0 - t3; \ - x1 = t1 + t2; \ - x2 = t1 - t2; \ - t0 = s7; \ - t1 = s5; \ - t2 = s3; \ - t3 = s1; \ - p3 = t0 + t2; \ - p4 = t1 + t3; \ - p1 = t0 + t3; \ - p2 = t1 + t2; \ - p5 = (p3 + p4) * stbi__f2f(1.175875602f); \ - t0 = t0 * stbi__f2f(0.298631336f); \ - t1 = t1 * stbi__f2f(2.053119869f); \ - t2 = t2 * stbi__f2f(3.072711026f); \ - t3 = t3 * stbi__f2f(1.501321110f); \ - p1 = p5 + p1 * stbi__f2f(-0.899976223f); \ - p2 = p5 + p2 * stbi__f2f(-2.562915447f); \ - p3 = p3 * stbi__f2f(-1.961570560f); \ - p4 = p4 * stbi__f2f(-0.390180644f); \ - t3 += p1 + p4; \ - t2 += p2 + p3; \ - t1 += p2 + p4; \ - t0 += p1 + p3; - -static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) { - int i, val[64], *v = val; - stbi_uc *o; - short *d = data; - - // columns - for (i = 0; i < 8; ++i, ++d, ++v) { - // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing - if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 && - d[48] == 0 && d[56] == 0) { - // no shortcut 0 seconds - // (1|2|3|4|5|6|7)==0 0 seconds - // all separate -0.047 seconds - // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds - int dcterm = d[0] * 4; - v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; - } else { - STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56]) - // constants scaled things up by 1<<12; let's bring them back - // down, but keep 2 extra bits of precision - x0 += 512; - x1 += 512; - x2 += 512; - x3 += 512; - v[0] = (x0 + t3) >> 10; - v[56] = (x0 - t3) >> 10; - v[8] = (x1 + t2) >> 10; - v[48] = (x1 - t2) >> 10; - v[16] = (x2 + t1) >> 10; - v[40] = (x2 - t1) >> 10; - v[24] = (x3 + t0) >> 10; - v[32] = (x3 - t0) >> 10; - } - } - - for (i = 0, v = 
val, o = out; i < 8; ++i, v += 8, o += out_stride) { - // no fast case since the first 1D IDCT spread components out - STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]) - // constants scaled things up by 1<<12, plus we had 1<<2 from first - // loop, plus horizontal and vertical each scale by sqrt(8) so together - // we've got an extra 1<<3, so 1<<17 total we need to remove. - // so we want to round that, which means adding 0.5 * 1<<17, - // aka 65536. Also, we'll end up with -128 to 127 that we want - // to encode as 0..255 by adding 128, so we'll add that before the shift - x0 += 65536 + (128 << 17); - x1 += 65536 + (128 << 17); - x2 += 65536 + (128 << 17); - x3 += 65536 + (128 << 17); - // tried computing the shifts into temps, or'ing the temps to see - // if any were out of range, but that was slower - o[0] = stbi__clamp((x0 + t3) >> 17); - o[7] = stbi__clamp((x0 - t3) >> 17); - o[1] = stbi__clamp((x1 + t2) >> 17); - o[6] = stbi__clamp((x1 - t2) >> 17); - o[2] = stbi__clamp((x2 + t1) >> 17); - o[5] = stbi__clamp((x2 - t1) >> 17); - o[3] = stbi__clamp((x3 + t0) >> 17); - o[4] = stbi__clamp((x3 - t0) >> 17); - } -} - -#ifdef STBI_SSE2 -// sse2 integer IDCT. not the fastest possible implementation but it -// produces bit-identical results to the generic C version so it's -// fully "transparent". -static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) { - // This is constructed to match our regular (generic) integer IDCT exactly. 
- __m128i row0, row1, row2, row3, row4, row5, row6, row7; - __m128i tmp; - -// dot product constant: even elems=x, odd elems=y -#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y)) - -// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) -// out(1) = c1[even]*x + c1[odd]*y -#define dct_rot(out0, out1, x, y, c0, c1) \ - __m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \ - __m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \ - __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ - __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ - __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ - __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) - -// out = in << 12 (in 16-bit, out 32-bit) -#define dct_widen(out, in) \ - __m128i out##_l = \ - _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ - __m128i out##_h = \ - _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) - -// wide add -#define dct_wadd(out, a, b) \ - __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ - __m128i out##_h = _mm_add_epi32(a##_h, b##_h) - -// wide sub -#define dct_wsub(out, a, b) \ - __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ - __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) - -// butterfly a/b, add bias, then shift by "s" and pack -#define dct_bfly32o(out0, out1, a, b, bias, s) \ - { \ - __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ - __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ - dct_wadd(sum, abiased, b); \ - dct_wsub(dif, abiased, b); \ - out0 = \ - _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ - out1 = \ - _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ - } - -// 8-bit interleave step (for transposes) -#define dct_interleave8(a, b) \ - tmp = a; \ - a = _mm_unpacklo_epi8(a, b); \ - b = _mm_unpackhi_epi8(tmp, b) - -// 16-bit interleave step (for transposes) -#define dct_interleave16(a, b) \ - tmp = a; \ - a = _mm_unpacklo_epi16(a, b); \ - b = _mm_unpackhi_epi16(tmp, b) - -#define dct_pass(bias, 
shift) \ - { \ - /* even part */ \ - dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \ - __m128i sum04 = _mm_add_epi16(row0, row4); \ - __m128i dif04 = _mm_sub_epi16(row0, row4); \ - dct_widen(t0e, sum04); \ - dct_widen(t1e, dif04); \ - dct_wadd(x0, t0e, t3e); \ - dct_wsub(x3, t0e, t3e); \ - dct_wadd(x1, t1e, t2e); \ - dct_wsub(x2, t1e, t2e); \ - /* odd part */ \ - dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \ - dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \ - __m128i sum17 = _mm_add_epi16(row1, row7); \ - __m128i sum35 = _mm_add_epi16(row3, row5); \ - dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \ - dct_wadd(x4, y0o, y4o); \ - dct_wadd(x5, y1o, y5o); \ - dct_wadd(x6, y2o, y5o); \ - dct_wadd(x7, y3o, y4o); \ - dct_bfly32o(row0, row7, x0, x7, bias, shift); \ - dct_bfly32o(row1, row6, x1, x6, bias, shift); \ - dct_bfly32o(row2, row5, x2, x5, bias, shift); \ - dct_bfly32o(row3, row4, x3, x4, bias, shift); \ - } - - __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), - stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); - __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f), - stbi__f2f(0.5411961f)); - __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), - stbi__f2f(1.175875602f)); - __m128i rot1_1 = - dct_const(stbi__f2f(1.175875602f), - stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); - __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f), - stbi__f2f(-1.961570560f)); - __m128i rot2_1 = - dct_const(stbi__f2f(-1.961570560f), - stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f)); - __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f), - stbi__f2f(-0.390180644f)); - __m128i rot3_1 = - dct_const(stbi__f2f(-0.390180644f), - stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f)); - - // rounding biases in column/row passes, see stbi__idct_block for explanation. 
- __m128i bias_0 = _mm_set1_epi32(512); - __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17)); - - // load - row0 = _mm_load_si128((const __m128i *)(data + 0 * 8)); - row1 = _mm_load_si128((const __m128i *)(data + 1 * 8)); - row2 = _mm_load_si128((const __m128i *)(data + 2 * 8)); - row3 = _mm_load_si128((const __m128i *)(data + 3 * 8)); - row4 = _mm_load_si128((const __m128i *)(data + 4 * 8)); - row5 = _mm_load_si128((const __m128i *)(data + 5 * 8)); - row6 = _mm_load_si128((const __m128i *)(data + 6 * 8)); - row7 = _mm_load_si128((const __m128i *)(data + 7 * 8)); - - // column pass - dct_pass(bias_0, 10); - - { - // 16bit 8x8 transpose pass 1 - dct_interleave16(row0, row4); - dct_interleave16(row1, row5); - dct_interleave16(row2, row6); - dct_interleave16(row3, row7); - - // transpose pass 2 - dct_interleave16(row0, row2); - dct_interleave16(row1, row3); - dct_interleave16(row4, row6); - dct_interleave16(row5, row7); - - // transpose pass 3 - dct_interleave16(row0, row1); - dct_interleave16(row2, row3); - dct_interleave16(row4, row5); - dct_interleave16(row6, row7); - } - - // row pass - dct_pass(bias_1, 17); - - { - // pack - __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 - __m128i p1 = _mm_packus_epi16(row2, row3); - __m128i p2 = _mm_packus_epi16(row4, row5); - __m128i p3 = _mm_packus_epi16(row6, row7); - - // 8bit 8x8 transpose pass 1 - dct_interleave8(p0, p2); // a0e0a1e1... - dct_interleave8(p1, p3); // c0g0c1g1... - - // transpose pass 2 - dct_interleave8(p0, p1); // a0c0e0g0... - dct_interleave8(p2, p3); // b0d0f0h0... - - // transpose pass 3 - dct_interleave8(p0, p2); // a0b0c0d0... - dct_interleave8(p1, p3); // a4b4c4d4... 
- - // store - _mm_storel_epi64((__m128i *)out, p0); - out += out_stride; - _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e)); - out += out_stride; - _mm_storel_epi64((__m128i *)out, p2); - out += out_stride; - _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e)); - out += out_stride; - _mm_storel_epi64((__m128i *)out, p1); - out += out_stride; - _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e)); - out += out_stride; - _mm_storel_epi64((__m128i *)out, p3); - out += out_stride; - _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e)); - } - -#undef dct_const -#undef dct_rot -#undef dct_widen -#undef dct_wadd -#undef dct_wsub -#undef dct_bfly32o -#undef dct_interleave8 -#undef dct_interleave16 -#undef dct_pass -} - -#endif // STBI_SSE2 - -#ifdef STBI_NEON - -// NEON integer IDCT. should produce bit-identical -// results to the generic C version. -static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) { - int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; - - int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); - int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); - int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f)); - int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f)); - int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); - int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); - int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); - int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); - int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f)); - int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f)); - int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f)); - int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f)); - -#define dct_long_mul(out, inq, coeff) \ - int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ - int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) - -#define dct_long_mac(out, acc, inq, coeff) \ - int32x4_t out##_l = vmlal_s16(acc##_l, 
vget_low_s16(inq), coeff); \ - int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) - -#define dct_widen(out, inq) \ - int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ - int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) - -// wide add -#define dct_wadd(out, a, b) \ - int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ - int32x4_t out##_h = vaddq_s32(a##_h, b##_h) - -// wide sub -#define dct_wsub(out, a, b) \ - int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ - int32x4_t out##_h = vsubq_s32(a##_h, b##_h) - -// butterfly a/b, then shift using "shiftop" by "s" and pack -#define dct_bfly32o(out0, out1, a, b, shiftop, s) \ - { \ - dct_wadd(sum, a, b); \ - dct_wsub(dif, a, b); \ - out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ - out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ - } - -#define dct_pass(shiftop, shift) \ - { \ - /* even part */ \ - int16x8_t sum26 = vaddq_s16(row2, row6); \ - dct_long_mul(p1e, sum26, rot0_0); \ - dct_long_mac(t2e, p1e, row6, rot0_1); \ - dct_long_mac(t3e, p1e, row2, rot0_2); \ - int16x8_t sum04 = vaddq_s16(row0, row4); \ - int16x8_t dif04 = vsubq_s16(row0, row4); \ - dct_widen(t0e, sum04); \ - dct_widen(t1e, dif04); \ - dct_wadd(x0, t0e, t3e); \ - dct_wsub(x3, t0e, t3e); \ - dct_wadd(x1, t1e, t2e); \ - dct_wsub(x2, t1e, t2e); \ - /* odd part */ \ - int16x8_t sum15 = vaddq_s16(row1, row5); \ - int16x8_t sum17 = vaddq_s16(row1, row7); \ - int16x8_t sum35 = vaddq_s16(row3, row5); \ - int16x8_t sum37 = vaddq_s16(row3, row7); \ - int16x8_t sumodd = vaddq_s16(sum17, sum35); \ - dct_long_mul(p5o, sumodd, rot1_0); \ - dct_long_mac(p1o, p5o, sum17, rot1_1); \ - dct_long_mac(p2o, p5o, sum35, rot1_2); \ - dct_long_mul(p3o, sum37, rot2_0); \ - dct_long_mul(p4o, sum15, rot2_1); \ - dct_wadd(sump13o, p1o, p3o); \ - dct_wadd(sump24o, p2o, p4o); \ - dct_wadd(sump23o, p2o, p3o); \ - dct_wadd(sump14o, p1o, p4o); \ - dct_long_mac(x4, sump13o, row7, rot3_0); \ - dct_long_mac(x5, sump24o, row5, rot3_1); \ - 
dct_long_mac(x6, sump23o, row3, rot3_2); \ - dct_long_mac(x7, sump14o, row1, rot3_3); \ - dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \ - dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \ - dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \ - dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \ - } - - // load - row0 = vld1q_s16(data + 0 * 8); - row1 = vld1q_s16(data + 1 * 8); - row2 = vld1q_s16(data + 2 * 8); - row3 = vld1q_s16(data + 3 * 8); - row4 = vld1q_s16(data + 4 * 8); - row5 = vld1q_s16(data + 5 * 8); - row6 = vld1q_s16(data + 6 * 8); - row7 = vld1q_s16(data + 7 * 8); - - // add DC bias - row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); - - // column pass - dct_pass(vrshrn_n_s32, 10); - - // 16bit 8x8 transpose - { -// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. -// whether compilers actually get this is another story, sadly. -#define dct_trn16(x, y) \ - { \ - int16x8x2_t t = vtrnq_s16(x, y); \ - x = t.val[0]; \ - y = t.val[1]; \ - } -#define dct_trn32(x, y) \ - { \ - int32x4x2_t t = \ - vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \ - x = vreinterpretq_s16_s32(t.val[0]); \ - y = vreinterpretq_s16_s32(t.val[1]); \ - } -#define dct_trn64(x, y) \ - { \ - int16x8_t x0 = x; \ - int16x8_t y0 = y; \ - x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \ - y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \ - } - - // pass 1 - dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 - dct_trn16(row2, row3); - dct_trn16(row4, row5); - dct_trn16(row6, row7); - - // pass 2 - dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 - dct_trn32(row1, row3); - dct_trn32(row4, row6); - dct_trn32(row5, row7); - - // pass 3 - dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 - dct_trn64(row1, row5); - dct_trn64(row2, row6); - dct_trn64(row3, row7); - -#undef dct_trn16 -#undef dct_trn32 -#undef dct_trn64 - } - - // row pass - // vrshrn_n_s32 only supports shifts up to 16, we need - // 17. 
so do a non-rounding shift of 16 first then follow - // up with a rounding shift by 1. - dct_pass(vshrn_n_s32, 16); - - { - // pack and round - uint8x8_t p0 = vqrshrun_n_s16(row0, 1); - uint8x8_t p1 = vqrshrun_n_s16(row1, 1); - uint8x8_t p2 = vqrshrun_n_s16(row2, 1); - uint8x8_t p3 = vqrshrun_n_s16(row3, 1); - uint8x8_t p4 = vqrshrun_n_s16(row4, 1); - uint8x8_t p5 = vqrshrun_n_s16(row5, 1); - uint8x8_t p6 = vqrshrun_n_s16(row6, 1); - uint8x8_t p7 = vqrshrun_n_s16(row7, 1); - - // again, these can translate into one instruction, but often don't. -#define dct_trn8_8(x, y) \ - { \ - uint8x8x2_t t = vtrn_u8(x, y); \ - x = t.val[0]; \ - y = t.val[1]; \ - } -#define dct_trn8_16(x, y) \ - { \ - uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \ - x = vreinterpret_u8_u16(t.val[0]); \ - y = vreinterpret_u8_u16(t.val[1]); \ - } -#define dct_trn8_32(x, y) \ - { \ - uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \ - x = vreinterpret_u8_u32(t.val[0]); \ - y = vreinterpret_u8_u32(t.val[1]); \ - } - - // sadly can't use interleaved stores here since we only write - // 8 bytes to each scan line! 
- - // 8x8 8-bit transpose pass 1 - dct_trn8_8(p0, p1); - dct_trn8_8(p2, p3); - dct_trn8_8(p4, p5); - dct_trn8_8(p6, p7); - - // pass 2 - dct_trn8_16(p0, p2); - dct_trn8_16(p1, p3); - dct_trn8_16(p4, p6); - dct_trn8_16(p5, p7); - - // pass 3 - dct_trn8_32(p0, p4); - dct_trn8_32(p1, p5); - dct_trn8_32(p2, p6); - dct_trn8_32(p3, p7); - - // store - vst1_u8(out, p0); - out += out_stride; - vst1_u8(out, p1); - out += out_stride; - vst1_u8(out, p2); - out += out_stride; - vst1_u8(out, p3); - out += out_stride; - vst1_u8(out, p4); - out += out_stride; - vst1_u8(out, p5); - out += out_stride; - vst1_u8(out, p6); - out += out_stride; - vst1_u8(out, p7); - -#undef dct_trn8_8 -#undef dct_trn8_16 -#undef dct_trn8_32 - } - -#undef dct_long_mul -#undef dct_long_mac -#undef dct_widen -#undef dct_wadd -#undef dct_wsub -#undef dct_bfly32o -#undef dct_pass -} - -#endif // STBI_NEON - -#define STBI__MARKER_none 0xff -// if there's a pending marker from the entropy stream, return that -// otherwise, fetch from the stream and get a marker. if there's no -// marker, return 0xff, which is never a valid marker value -static stbi_uc stbi__get_marker(stbi__jpeg *j) { - stbi_uc x; - if (j->marker != STBI__MARKER_none) { - x = j->marker; - j->marker = STBI__MARKER_none; - return x; - } - x = stbi__get8(j->s); - if (x != 0xff) - return STBI__MARKER_none; - while (x == 0xff) - x = stbi__get8(j->s); // consume repeated 0xff fill bytes - return x; -} - -// in each scan, we'll have scan_n components, and the order -// of the components is specified by order[] -#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) - -// after a restart interval, stbi__jpeg_reset the entropy decoder and -// the dc prediction -static void stbi__jpeg_reset(stbi__jpeg *j) { - j->code_bits = 0; - j->code_buffer = 0; - j->nomore = 0; - j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = - j->img_comp[3].dc_pred = 0; - j->marker = STBI__MARKER_none; - j->todo = j->restart_interval ? 
j->restart_interval : 0x7fffffff; - j->eob_run = 0; - // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, - // since we don't even allow 1<<30 pixels -} - -static int stbi__parse_entropy_coded_data(stbi__jpeg *z) { - stbi__jpeg_reset(z); - if (!z->progressive) { - if (z->scan_n == 1) { - int i, j; - STBI_SIMD_ALIGN(short, data[64]); - int n = z->order[0]; - // non-interleaved data, we just need to process one block at a time, - // in trivial scanline order - // number of blocks to do just depends on how many actual "pixels" this - // component has, independent of interleaved MCU blocking and such - int w = (z->img_comp[n].x + 7) >> 3; - int h = (z->img_comp[n].y + 7) >> 3; - for (j = 0; j < h; ++j) { - for (i = 0; i < w; ++i) { - int ha = z->img_comp[n].ha; - if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, - z->huff_ac + ha, z->fast_ac[ha], n, - z->dequant[z->img_comp[n].tq])) - return 0; - z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + - i * 8, - z->img_comp[n].w2, data); - // every data block is an MCU, so countdown the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) - stbi__grow_buffer_unsafe(z); - // if it's NOT a restart, then just bail, so we get corrupt data - // rather than no data - if (!STBI__RESTART(z->marker)) - return 1; - stbi__jpeg_reset(z); - } - } - } - return 1; - } else { // interleaved - int i, j, k, x, y; - STBI_SIMD_ALIGN(short, data[64]); - for (j = 0; j < z->img_mcu_y; ++j) { - for (i = 0; i < z->img_mcu_x; ++i) { - // scan an interleaved mcu... 
process scan_n components in order - for (k = 0; k < z->scan_n; ++k) { - int n = z->order[k]; - // scan out an mcu's worth of this component; that's just determined - // by the basic H and V specified for the component - for (y = 0; y < z->img_comp[n].v; ++y) { - for (x = 0; x < z->img_comp[n].h; ++x) { - int x2 = (i * z->img_comp[n].h + x) * 8; - int y2 = (j * z->img_comp[n].v + y) * 8; - int ha = z->img_comp[n].ha; - if (!stbi__jpeg_decode_block(z, data, - z->huff_dc + z->img_comp[n].hd, - z->huff_ac + ha, z->fast_ac[ha], n, - z->dequant[z->img_comp[n].tq])) - return 0; - z->idct_block_kernel(z->img_comp[n].data + - z->img_comp[n].w2 * y2 + x2, - z->img_comp[n].w2, data); - } - } - } - // after all interleaved components, that's an interleaved MCU, - // so now count down the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) - stbi__grow_buffer_unsafe(z); - if (!STBI__RESTART(z->marker)) - return 1; - stbi__jpeg_reset(z); - } - } - } - return 1; - } - } else { - if (z->scan_n == 1) { - int i, j; - int n = z->order[0]; - // non-interleaved data, we just need to process one block at a time, - // in trivial scanline order - // number of blocks to do just depends on how many actual "pixels" this - // component has, independent of interleaved MCU blocking and such - int w = (z->img_comp[n].x + 7) >> 3; - int h = (z->img_comp[n].y + 7) >> 3; - for (j = 0; j < h; ++j) { - for (i = 0; i < w; ++i) { - short *data = - z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); - if (z->spec_start == 0) { - if (!stbi__jpeg_decode_block_prog_dc( - z, data, &z->huff_dc[z->img_comp[n].hd], n)) - return 0; - } else { - int ha = z->img_comp[n].ha; - if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], - z->fast_ac[ha])) - return 0; - } - // every data block is an MCU, so countdown the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) - stbi__grow_buffer_unsafe(z); - if (!STBI__RESTART(z->marker)) - return 1; - stbi__jpeg_reset(z); - } - 
} - } - return 1; - } else { // interleaved - int i, j, k, x, y; - for (j = 0; j < z->img_mcu_y; ++j) { - for (i = 0; i < z->img_mcu_x; ++i) { - // scan an interleaved mcu... process scan_n components in order - for (k = 0; k < z->scan_n; ++k) { - int n = z->order[k]; - // scan out an mcu's worth of this component; that's just determined - // by the basic H and V specified for the component - for (y = 0; y < z->img_comp[n].v; ++y) { - for (x = 0; x < z->img_comp[n].h; ++x) { - int x2 = (i * z->img_comp[n].h + x); - int y2 = (j * z->img_comp[n].v + y); - short *data = z->img_comp[n].coeff + - 64 * (x2 + y2 * z->img_comp[n].coeff_w); - if (!stbi__jpeg_decode_block_prog_dc( - z, data, &z->huff_dc[z->img_comp[n].hd], n)) - return 0; - } - } - } - // after all interleaved components, that's an interleaved MCU, - // so now count down the restart interval - if (--z->todo <= 0) { - if (z->code_bits < 24) - stbi__grow_buffer_unsafe(z); - if (!STBI__RESTART(z->marker)) - return 1; - stbi__jpeg_reset(z); - } - } - } - return 1; - } - } -} - -static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) { - int i; - for (i = 0; i < 64; ++i) - data[i] *= dequant[i]; -} - -static void stbi__jpeg_finish(stbi__jpeg *z) { - if (z->progressive) { - // dequantize and idct the data - int i, j, n; - for (n = 0; n < z->s->img_n; ++n) { - int w = (z->img_comp[n].x + 7) >> 3; - int h = (z->img_comp[n].y + 7) >> 3; - for (j = 0; j < h; ++j) { - for (i = 0; i < w; ++i) { - short *data = - z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); - stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); - z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + - i * 8, - z->img_comp[n].w2, data); - } - } - } - } -} - -static int stbi__process_marker(stbi__jpeg *z, int m) { - int L; - switch (m) { - case STBI__MARKER_none: // no marker found - return stbi__err("expected marker", "Corrupt JPEG"); - - case 0xDD: // DRI - specify restart interval - if 
(stbi__get16be(z->s) != 4) - return stbi__err("bad DRI len", "Corrupt JPEG"); - z->restart_interval = stbi__get16be(z->s); - return 1; - - case 0xDB: // DQT - define quantization table - L = stbi__get16be(z->s) - 2; - while (L > 0) { - int q = stbi__get8(z->s); - int p = q >> 4, sixteen = (p != 0); - int t = q & 15, i; - if (p != 0 && p != 1) - return stbi__err("bad DQT type", "Corrupt JPEG"); - if (t > 3) - return stbi__err("bad DQT table", "Corrupt JPEG"); - - for (i = 0; i < 64; ++i) - z->dequant[t][stbi__jpeg_dezigzag[i]] = - (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); - L -= (sixteen ? 129 : 65); - } - return L == 0; - - case 0xC4: // DHT - define huffman table - L = stbi__get16be(z->s) - 2; - while (L > 0) { - stbi_uc *v; - int sizes[16], i, n = 0; - int q = stbi__get8(z->s); - int tc = q >> 4; - int th = q & 15; - if (tc > 1 || th > 3) - return stbi__err("bad DHT header", "Corrupt JPEG"); - for (i = 0; i < 16; ++i) { - sizes[i] = stbi__get8(z->s); - n += sizes[i]; - } - L -= 17; - if (tc == 0) { - if (!stbi__build_huffman(z->huff_dc + th, sizes)) - return 0; - v = z->huff_dc[th].values; - } else { - if (!stbi__build_huffman(z->huff_ac + th, sizes)) - return 0; - v = z->huff_ac[th].values; - } - for (i = 0; i < n; ++i) - v[i] = stbi__get8(z->s); - if (tc != 0) - stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); - L -= n; - } - return L == 0; - } - - // check for comment block or APP blocks - if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { - L = stbi__get16be(z->s); - if (L < 2) { - if (m == 0xFE) - return stbi__err("bad COM len", "Corrupt JPEG"); - else - return stbi__err("bad APP len", "Corrupt JPEG"); - } - L -= 2; - - if (m == 0xE0 && L >= 5) { // JFIF APP0 segment - static const unsigned char tag[5] = {'J', 'F', 'I', 'F', '\0'}; - int ok = 1; - int i; - for (i = 0; i < 5; ++i) - if (stbi__get8(z->s) != tag[i]) - ok = 0; - L -= 5; - if (ok) - z->jfif = 1; - } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment - static const 
unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'}; - int ok = 1; - int i; - for (i = 0; i < 6; ++i) - if (stbi__get8(z->s) != tag[i]) - ok = 0; - L -= 6; - if (ok) { - stbi__get8(z->s); // version - stbi__get16be(z->s); // flags0 - stbi__get16be(z->s); // flags1 - z->app14_color_transform = stbi__get8(z->s); // color transform - L -= 6; - } - } - - stbi__skip(z->s, L); - return 1; - } - - return stbi__err("unknown marker", "Corrupt JPEG"); -} - -// after we see SOS -static int stbi__process_scan_header(stbi__jpeg *z) { - int i; - int Ls = stbi__get16be(z->s); - z->scan_n = stbi__get8(z->s); - if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n) - return stbi__err("bad SOS component count", "Corrupt JPEG"); - if (Ls != 6 + 2 * z->scan_n) - return stbi__err("bad SOS len", "Corrupt JPEG"); - for (i = 0; i < z->scan_n; ++i) { - int id = stbi__get8(z->s), which; - int q = stbi__get8(z->s); - for (which = 0; which < z->s->img_n; ++which) - if (z->img_comp[which].id == id) - break; - if (which == z->s->img_n) - return 0; // no match - z->img_comp[which].hd = q >> 4; - if (z->img_comp[which].hd > 3) - return stbi__err("bad DC huff", "Corrupt JPEG"); - z->img_comp[which].ha = q & 15; - if (z->img_comp[which].ha > 3) - return stbi__err("bad AC huff", "Corrupt JPEG"); - z->order[i] = which; - } - - { - int aa; - z->spec_start = stbi__get8(z->s); - z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 - aa = stbi__get8(z->s); - z->succ_high = (aa >> 4); - z->succ_low = (aa & 15); - if (z->progressive) { - if (z->spec_start > 63 || z->spec_end > 63 || - z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) - return stbi__err("bad SOS", "Corrupt JPEG"); - } else { - if (z->spec_start != 0) - return stbi__err("bad SOS", "Corrupt JPEG"); - if (z->succ_high != 0 || z->succ_low != 0) - return stbi__err("bad SOS", "Corrupt JPEG"); - z->spec_end = 63; - } - } - - return 1; -} - -static int stbi__free_jpeg_components(stbi__jpeg *z, int 
ncomp, int why) { - int i; - for (i = 0; i < ncomp; ++i) { - if (z->img_comp[i].raw_data) { - STBI_FREE(z->img_comp[i].raw_data); - z->img_comp[i].raw_data = NULL; - z->img_comp[i].data = NULL; - } - if (z->img_comp[i].raw_coeff) { - STBI_FREE(z->img_comp[i].raw_coeff); - z->img_comp[i].raw_coeff = 0; - z->img_comp[i].coeff = 0; - } - if (z->img_comp[i].linebuf) { - STBI_FREE(z->img_comp[i].linebuf); - z->img_comp[i].linebuf = NULL; - } - } - return why; -} - -static int stbi__process_frame_header(stbi__jpeg *z, int scan) { - stbi__context *s = z->s; - int Lf, p, i, q, h_max = 1, v_max = 1, c; - Lf = stbi__get16be(s); - if (Lf < 11) - return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG - p = stbi__get8(s); - if (p != 8) - return stbi__err("only 8-bit", - "JPEG format not supported: 8-bit only"); // JPEG baseline - s->img_y = stbi__get16be(s); - if (s->img_y == 0) - return stbi__err( - "no header height", - "JPEG format not supported: delayed height"); // Legal, but we don't - // handle it--but neither - // does IJG - s->img_x = stbi__get16be(s); - if (s->img_x == 0) - return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires - c = stbi__get8(s); - if (c != 3 && c != 1 && c != 4) - return stbi__err("bad component count", "Corrupt JPEG"); - s->img_n = c; - for (i = 0; i < c; ++i) { - z->img_comp[i].data = NULL; - z->img_comp[i].linebuf = NULL; - } - - if (Lf != 8 + 3 * s->img_n) - return stbi__err("bad SOF len", "Corrupt JPEG"); - - z->rgb = 0; - for (i = 0; i < s->img_n; ++i) { - static const unsigned char rgb[3] = {'R', 'G', 'B'}; - z->img_comp[i].id = stbi__get8(s); - if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) - ++z->rgb; - q = stbi__get8(s); - z->img_comp[i].h = (q >> 4); - if (!z->img_comp[i].h || z->img_comp[i].h > 4) - return stbi__err("bad H", "Corrupt JPEG"); - z->img_comp[i].v = q & 15; - if (!z->img_comp[i].v || z->img_comp[i].v > 4) - return stbi__err("bad V", "Corrupt JPEG"); - z->img_comp[i].tq = stbi__get8(s); - if (z->img_comp[i].tq > 
3) - return stbi__err("bad TQ", "Corrupt JPEG"); - } - - if (scan != STBI__SCAN_load) - return 1; - - if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) - return stbi__err("too large", "Image too large to decode"); - - for (i = 0; i < s->img_n; ++i) { - if (z->img_comp[i].h > h_max) - h_max = z->img_comp[i].h; - if (z->img_comp[i].v > v_max) - v_max = z->img_comp[i].v; - } - - // compute interleaved mcu info - z->img_h_max = h_max; - z->img_v_max = v_max; - z->img_mcu_w = h_max * 8; - z->img_mcu_h = v_max * 8; - // these sizes can't be more than 17 bits - z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w; - z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h; - - for (i = 0; i < s->img_n; ++i) { - // number of effective pixels (e.g. for non-interleaved MCU) - z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max; - z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max; - // to simplify generation, we'll allocate enough memory to decode - // the bogus oversized data from using interleaved MCUs and their - // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't - // discard the extra data until colorspace conversion - // - // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked - // earlier) so these muls can't overflow with 32-bit ints (which we require) - z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; - z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; - z->img_comp[i].coeff = 0; - z->img_comp[i].raw_coeff = 0; - z->img_comp[i].linebuf = NULL; - z->img_comp[i].raw_data = - stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); - if (z->img_comp[i].raw_data == NULL) - return stbi__free_jpeg_components(z, i + 1, - stbi__err("outofmem", "Out of memory")); - // align blocks for idct using mmx/sse - z->img_comp[i].data = - (stbi_uc *)(((size_t)z->img_comp[i].raw_data + 15) & ~15); - if (z->progressive) { - // w2, h2 are multiples of 8 (see above) - z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; - z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; - z->img_comp[i].raw_coeff = stbi__malloc_mad3( - z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); - if (z->img_comp[i].raw_coeff == NULL) - return stbi__free_jpeg_components( - z, i + 1, stbi__err("outofmem", "Out of memory")); - z->img_comp[i].coeff = - (short *)(((size_t)z->img_comp[i].raw_coeff + 15) & ~15); - } - } - - return 1; -} - -// use comparisons since in some cases we handle more than one case (e.g. 
SOF) -#define stbi__DNL(x) ((x) == 0xdc) -#define stbi__SOI(x) ((x) == 0xd8) -#define stbi__EOI(x) ((x) == 0xd9) -#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) -#define stbi__SOS(x) ((x) == 0xda) - -#define stbi__SOF_progressive(x) ((x) == 0xc2) - -static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) { - int m; - z->jfif = 0; - z->app14_color_transform = -1; // valid values are 0,1,2 - z->marker = STBI__MARKER_none; // initialize cached marker to empty - m = stbi__get_marker(z); - if (!stbi__SOI(m)) - return stbi__err("no SOI", "Corrupt JPEG"); - if (scan == STBI__SCAN_type) - return 1; - m = stbi__get_marker(z); - while (!stbi__SOF(m)) { - if (!stbi__process_marker(z, m)) - return 0; - m = stbi__get_marker(z); - while (m == STBI__MARKER_none) { - // some files have extra padding after their blocks, so ok, we'll scan - if (stbi__at_eof(z->s)) - return stbi__err("no SOF", "Corrupt JPEG"); - m = stbi__get_marker(z); - } - } - z->progressive = stbi__SOF_progressive(m); - if (!stbi__process_frame_header(z, scan)) - return 0; - return 1; -} - -// decode image to YCbCr format -static int stbi__decode_jpeg_image(stbi__jpeg *j) { - int m; - for (m = 0; m < 4; m++) { - j->img_comp[m].raw_data = NULL; - j->img_comp[m].raw_coeff = NULL; - } - j->restart_interval = 0; - if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) - return 0; - m = stbi__get_marker(j); - while (!stbi__EOI(m)) { - if (stbi__SOS(m)) { - if (!stbi__process_scan_header(j)) - return 0; - if (!stbi__parse_entropy_coded_data(j)) - return 0; - if (j->marker == STBI__MARKER_none) { - // handle 0s at the end of image data from IP Kamera 9060 - while (!stbi__at_eof(j->s)) { - int x = stbi__get8(j->s); - if (x == 255) { - j->marker = stbi__get8(j->s); - break; - } - } - // if we reach eof without hitting a marker, stbi__get_marker() below - // will fail and we'll eventually return 0 - } - } else if (stbi__DNL(m)) { - int Ld = stbi__get16be(j->s); - stbi__uint32 NL = stbi__get16be(j->s); - 
if (Ld != 4) - return stbi__err("bad DNL len", "Corrupt JPEG"); - if (NL != j->s->img_y) - return stbi__err("bad DNL height", "Corrupt JPEG"); - } else { - if (!stbi__process_marker(j, m)) - return 0; - } - m = stbi__get_marker(j); - } - if (j->progressive) - stbi__jpeg_finish(j); - return 1; -} - -// static jfif-centered resampling (across block boundaries) - -typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, - int w, int hs); - -#define stbi__div4(x) ((stbi_uc)((x) >> 2)) - -static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, - int w, int hs) { - STBI_NOTUSED(out); - STBI_NOTUSED(in_far); - STBI_NOTUSED(w); - STBI_NOTUSED(hs); - return in_near; -} - -static stbi_uc *stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, - stbi_uc *in_far, int w, int hs) { - // need to generate two samples vertically for every one in input - int i; - STBI_NOTUSED(hs); - for (i = 0; i < w; ++i) - out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2); - return out; -} - -static stbi_uc *stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, - stbi_uc *in_far, int w, int hs) { - // need to generate two samples horizontally for every one in input - int i; - stbi_uc *input = in_near; - - if (w == 1) { - // if only one sample, can't do any interpolation - out[0] = out[1] = input[0]; - return out; - } - - out[0] = input[0]; - out[1] = stbi__div4(input[0] * 3 + input[1] + 2); - for (i = 1; i < w - 1; ++i) { - int n = 3 * input[i] + 2; - out[i * 2 + 0] = stbi__div4(n + input[i - 1]); - out[i * 2 + 1] = stbi__div4(n + input[i + 1]); - } - out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2); - out[i * 2 + 1] = input[w - 1]; - - STBI_NOTUSED(in_far); - STBI_NOTUSED(hs); - - return out; -} - -#define stbi__div16(x) ((stbi_uc)((x) >> 4)) - -static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, - stbi_uc *in_far, int w, int hs) { - // need to generate 2x2 samples for every one in input - int i, t0, t1; - if (w 
== 1) { - out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); - return out; - } - - t1 = 3 * in_near[0] + in_far[0]; - out[0] = stbi__div4(t1 + 2); - for (i = 1; i < w; ++i) { - t0 = t1; - t1 = 3 * in_near[i] + in_far[i]; - out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8); - out[i * 2] = stbi__div16(3 * t1 + t0 + 8); - } - out[w * 2 - 1] = stbi__div4(t1 + 2); - - STBI_NOTUSED(hs); - - return out; -} - -#if defined(STBI_SSE2) || defined(STBI_NEON) -static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, - stbi_uc *in_far, int w, int hs) { - // need to generate 2x2 samples for every one in input - int i = 0, t0, t1; - - if (w == 1) { - out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2); - return out; - } - - t1 = 3 * in_near[0] + in_far[0]; - // process groups of 8 pixels for as long as we can. - // note we can't handle the last pixel in a row in this loop - // because we need to handle the filter boundary conditions. - for (; i < ((w - 1) & ~7); i += 8) { -#if defined(STBI_SSE2) - // load and perform the vertical filtering pass - // this uses 3*x + y = 4*x + (y - x) - __m128i zero = _mm_setzero_si128(); - __m128i farb = _mm_loadl_epi64((__m128i *)(in_far + i)); - __m128i nearb = _mm_loadl_epi64((__m128i *)(in_near + i)); - __m128i farw = _mm_unpacklo_epi8(farb, zero); - __m128i nearw = _mm_unpacklo_epi8(nearb, zero); - __m128i diff = _mm_sub_epi16(farw, nearw); - __m128i nears = _mm_slli_epi16(nearw, 2); - __m128i curr = _mm_add_epi16(nears, diff); // current row - - // horizontal filter works the same based on shifted vers of current - // row. "prev" is current row shifted right by 1 pixel; we need to - // insert the previous pixel value (from t1). - // "next" is current row shifted left by 1 pixel, with first pixel - // of next block of 8 pixels added in. 
- __m128i prv0 = _mm_slli_si128(curr, 2); - __m128i nxt0 = _mm_srli_si128(curr, 2); - __m128i prev = _mm_insert_epi16(prv0, t1, 0); - __m128i next = - _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7); - - // horizontal filter, polyphase implementation since it's convenient: - // even pixels = 3*cur + prev = cur*4 + (prev - cur) - // odd pixels = 3*cur + next = cur*4 + (next - cur) - // note the shared term. - __m128i bias = _mm_set1_epi16(8); - __m128i curs = _mm_slli_epi16(curr, 2); - __m128i prvd = _mm_sub_epi16(prev, curr); - __m128i nxtd = _mm_sub_epi16(next, curr); - __m128i curb = _mm_add_epi16(curs, bias); - __m128i even = _mm_add_epi16(prvd, curb); - __m128i odd = _mm_add_epi16(nxtd, curb); - - // interleave even and odd pixels, then undo scaling. - __m128i int0 = _mm_unpacklo_epi16(even, odd); - __m128i int1 = _mm_unpackhi_epi16(even, odd); - __m128i de0 = _mm_srli_epi16(int0, 4); - __m128i de1 = _mm_srli_epi16(int1, 4); - - // pack and write output - __m128i outv = _mm_packus_epi16(de0, de1); - _mm_storeu_si128((__m128i *)(out + i * 2), outv); -#elif defined(STBI_NEON) - // load and perform the vertical filtering pass - // this uses 3*x + y = 4*x + (y - x) - uint8x8_t farb = vld1_u8(in_far + i); - uint8x8_t nearb = vld1_u8(in_near + i); - int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); - int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); - int16x8_t curr = vaddq_s16(nears, diff); // current row - - // horizontal filter works the same based on shifted vers of current - // row. "prev" is current row shifted right by 1 pixel; we need to - // insert the previous pixel value (from t1). - // "next" is current row shifted left by 1 pixel, with first pixel - // of next block of 8 pixels added in. 
- int16x8_t prv0 = vextq_s16(curr, curr, 7); - int16x8_t nxt0 = vextq_s16(curr, curr, 1); - int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); - int16x8_t next = - vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7); - - // horizontal filter, polyphase implementation since it's convenient: - // even pixels = 3*cur + prev = cur*4 + (prev - cur) - // odd pixels = 3*cur + next = cur*4 + (next - cur) - // note the shared term. - int16x8_t curs = vshlq_n_s16(curr, 2); - int16x8_t prvd = vsubq_s16(prev, curr); - int16x8_t nxtd = vsubq_s16(next, curr); - int16x8_t even = vaddq_s16(curs, prvd); - int16x8_t odd = vaddq_s16(curs, nxtd); - - // undo scaling and round, then store with even/odd phases interleaved - uint8x8x2_t o; - o.val[0] = vqrshrun_n_s16(even, 4); - o.val[1] = vqrshrun_n_s16(odd, 4); - vst2_u8(out + i * 2, o); -#endif - - // "previous" value for next iter - t1 = 3 * in_near[i + 7] + in_far[i + 7]; - } - - t0 = t1; - t1 = 3 * in_near[i] + in_far[i]; - out[i * 2] = stbi__div16(3 * t1 + t0 + 8); - - for (++i; i < w; ++i) { - t0 = t1; - t1 = 3 * in_near[i] + in_far[i]; - out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8); - out[i * 2] = stbi__div16(3 * t1 + t0 + 8); - } - out[w * 2 - 1] = stbi__div4(t1 + 2); - - STBI_NOTUSED(hs); - - return out; -} -#endif - -static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, - stbi_uc *in_far, int w, int hs) { - // resample with nearest-neighbor - int i, j; - STBI_NOTUSED(in_far); - for (i = 0; i < w; ++i) - for (j = 0; j < hs; ++j) - out[i * hs + j] = in_near[i]; - return out; -} - -// this is a reduced-precision calculation of YCbCr-to-RGB introduced -// to make sure the code produces the same results in both SIMD and scalar -#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8) -static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, - const stbi_uc *pcb, const stbi_uc *pcr, - int count, int step) { - int i; - for (i = 0; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1 << 19); 
// rounding - int r, g, b; - int cr = pcr[i] - 128; - int cb = pcb[i] - 128; - r = y_fixed + cr * stbi__float2fixed(1.40200f); - g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) + - ((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000); - b = y_fixed + cb * stbi__float2fixed(1.77200f); - r >>= 20; - g >>= 20; - b >>= 20; - if ((unsigned)r > 255) { - if (r < 0) - r = 0; - else - r = 255; - } - if ((unsigned)g > 255) { - if (g < 0) - g = 0; - else - g = 255; - } - if ((unsigned)b > 255) { - if (b < 0) - b = 0; - else - b = 255; - } - out[0] = (stbi_uc)r; - out[1] = (stbi_uc)g; - out[2] = (stbi_uc)b; - out[3] = 255; - out += step; - } -} - -#if defined(STBI_SSE2) || defined(STBI_NEON) -static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, - stbi_uc const *pcb, stbi_uc const *pcr, - int count, int step) { - int i = 0; - -#ifdef STBI_SSE2 - // step == 3 is pretty ugly on the final interleave, and i'm not convinced - // it's useful in practice (you wouldn't use it for textures, for example). - // so just accelerate step == 4 case. - if (step == 4) { - // this is a fairly straightforward implementation and not super-optimized. 
- __m128i signflip = _mm_set1_epi8(-0x80); - __m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f)); - __m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f)); - __m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f)); - __m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f)); - __m128i y_bias = _mm_set1_epi8((char)(unsigned char)128); - __m128i xw = _mm_set1_epi16(255); // alpha channel - - for (; i + 7 < count; i += 8) { - // load - __m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i)); - __m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i)); - __m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i)); - __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 - __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 - - // unpack to short (and left-shift cr, cb by 8) - __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); - __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); - __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); - - // color transform - __m128i yws = _mm_srli_epi16(yw, 4); - __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); - __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); - __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); - __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); - __m128i rws = _mm_add_epi16(cr0, yws); - __m128i gwt = _mm_add_epi16(cb0, yws); - __m128i bws = _mm_add_epi16(yws, cb1); - __m128i gws = _mm_add_epi16(gwt, cr1); - - // descale - __m128i rw = _mm_srai_epi16(rws, 4); - __m128i bw = _mm_srai_epi16(bws, 4); - __m128i gw = _mm_srai_epi16(gws, 4); - - // back to byte, set up for transpose - __m128i brb = _mm_packus_epi16(rw, bw); - __m128i gxb = _mm_packus_epi16(gw, xw); - - // transpose to interleave channels - __m128i t0 = _mm_unpacklo_epi8(brb, gxb); - __m128i t1 = _mm_unpackhi_epi8(brb, gxb); - __m128i o0 = _mm_unpacklo_epi16(t0, t1); - __m128i o1 = _mm_unpackhi_epi16(t0, t1); - - // store - _mm_storeu_si128((__m128i *)(out + 0), o0); - 
_mm_storeu_si128((__m128i *)(out + 16), o1); - out += 32; - } - } -#endif - -#ifdef STBI_NEON - // in this version, step=3 support would be easy to add. but is there demand? - if (step == 4) { - // this is a fairly straightforward implementation and not super-optimized. - uint8x8_t signflip = vdup_n_u8(0x80); - int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f)); - int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f)); - int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f)); - int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f)); - - for (; i + 7 < count; i += 8) { - // load - uint8x8_t y_bytes = vld1_u8(y + i); - uint8x8_t cr_bytes = vld1_u8(pcr + i); - uint8x8_t cb_bytes = vld1_u8(pcb + i); - int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); - int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); - - // expand to s16 - int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); - int16x8_t crw = vshll_n_s8(cr_biased, 7); - int16x8_t cbw = vshll_n_s8(cb_biased, 7); - - // color transform - int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); - int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); - int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); - int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); - int16x8_t rws = vaddq_s16(yws, cr0); - int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); - int16x8_t bws = vaddq_s16(yws, cb1); - - // undo scaling, round, convert to byte - uint8x8x4_t o; - o.val[0] = vqrshrun_n_s16(rws, 4); - o.val[1] = vqrshrun_n_s16(gws, 4); - o.val[2] = vqrshrun_n_s16(bws, 4); - o.val[3] = vdup_n_u8(255); - - // store, interleaving r/g/b/a - vst4_u8(out, o); - out += 8 * 4; - } - } -#endif - - for (; i < count; ++i) { - int y_fixed = (y[i] << 20) + (1 << 19); // rounding - int r, g, b; - int cr = pcr[i] - 128; - int cb = pcb[i] - 128; - r = y_fixed + cr * stbi__float2fixed(1.40200f); - g = y_fixed + cr * -stbi__float2fixed(0.71414f) + - ((cb * 
-stbi__float2fixed(0.34414f)) & 0xffff0000); - b = y_fixed + cb * stbi__float2fixed(1.77200f); - r >>= 20; - g >>= 20; - b >>= 20; - if ((unsigned)r > 255) { - if (r < 0) - r = 0; - else - r = 255; - } - if ((unsigned)g > 255) { - if (g < 0) - g = 0; - else - g = 255; - } - if ((unsigned)b > 255) { - if (b < 0) - b = 0; - else - b = 255; - } - out[0] = (stbi_uc)r; - out[1] = (stbi_uc)g; - out[2] = (stbi_uc)b; - out[3] = 255; - out += step; - } -} -#endif - -// set up the kernels -static void stbi__setup_jpeg(stbi__jpeg *j) { - j->idct_block_kernel = stbi__idct_block; - j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; - j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; - -#ifdef STBI_SSE2 - if (stbi__sse2_available()) { - j->idct_block_kernel = stbi__idct_simd; - j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; - j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; - } -#endif - -#ifdef STBI_NEON - j->idct_block_kernel = stbi__idct_simd; - j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; - j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; -#endif -} - -// clean up the temporary component buffers -static void stbi__cleanup_jpeg(stbi__jpeg *j) { - stbi__free_jpeg_components(j, j->s->img_n, 0); -} - -typedef struct { - resample_row_func resample; - stbi_uc *line0, *line1; - int hs, vs; // expansion factor in each axis - int w_lores; // horizontal pixels pre-expansion - int ystep; // how far through vertical expansion we are - int ypos; // which pre-expansion row we're on -} stbi__resample; - -// fast 0..255 * 0..255 => 0..255 rounded multiplication -static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) { - unsigned int t = x * y + 128; - return (stbi_uc)((t + (t >> 8)) >> 8); -} - -static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, - int *comp, int req_comp) { - int n, decode_n, is_rgb; - z->s->img_n = 0; // make stbi__cleanup_jpeg safe - - // validate req_comp - if (req_comp < 0 || req_comp > 4) - return stbi__errpuc("bad 
req_comp", "Internal error"); - - // load a jpeg image from whichever source, but leave in YCbCr format - if (!stbi__decode_jpeg_image(z)) { - stbi__cleanup_jpeg(z); - return NULL; - } - - // determine actual number of components to generate - n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; - - is_rgb = z->s->img_n == 3 && - (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); - - if (z->s->img_n == 3 && n < 3 && !is_rgb) - decode_n = 1; - else - decode_n = z->s->img_n; - - // resample and color-convert - { - int k; - unsigned int i, j; - stbi_uc *output; - stbi_uc *coutput[4] = {NULL, NULL, NULL, NULL}; - - stbi__resample res_comp[4]; - - for (k = 0; k < decode_n; ++k) { - stbi__resample *r = &res_comp[k]; - - // allocate line buffer big enough for upsampling off the edges - // with upsample factor of 4 - z->img_comp[k].linebuf = (stbi_uc *)stbi__malloc(z->s->img_x + 3); - if (!z->img_comp[k].linebuf) { - stbi__cleanup_jpeg(z); - return stbi__errpuc("outofmem", "Out of memory"); - } - - r->hs = z->img_h_max / z->img_comp[k].h; - r->vs = z->img_v_max / z->img_comp[k].v; - r->ystep = r->vs >> 1; - r->w_lores = (z->s->img_x + r->hs - 1) / r->hs; - r->ypos = 0; - r->line0 = r->line1 = z->img_comp[k].data; - - if (r->hs == 1 && r->vs == 1) - r->resample = resample_row_1; - else if (r->hs == 1 && r->vs == 2) - r->resample = stbi__resample_row_v_2; - else if (r->hs == 2 && r->vs == 1) - r->resample = stbi__resample_row_h_2; - else if (r->hs == 2 && r->vs == 2) - r->resample = z->resample_row_hv_2_kernel; - else - r->resample = stbi__resample_row_generic; - } - - // can't error after this so, this is safe - output = (stbi_uc *)stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); - if (!output) { - stbi__cleanup_jpeg(z); - return stbi__errpuc("outofmem", "Out of memory"); - } - - // now go ahead and resample - for (j = 0; j < z->s->img_y; ++j) { - stbi_uc *out = output + n * z->s->img_x * j; - for (k = 0; k < decode_n; ++k) { - stbi__resample *r = &res_comp[k]; 
- int y_bot = r->ystep >= (r->vs >> 1); - coutput[k] = - r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, - y_bot ? r->line0 : r->line1, r->w_lores, r->hs); - if (++r->ystep >= r->vs) { - r->ystep = 0; - r->line0 = r->line1; - if (++r->ypos < z->img_comp[k].y) - r->line1 += z->img_comp[k].w2; - } - } - if (n >= 3) { - stbi_uc *y = coutput[0]; - if (z->s->img_n == 3) { - if (is_rgb) { - for (i = 0; i < z->s->img_x; ++i) { - out[0] = y[i]; - out[1] = coutput[1][i]; - out[2] = coutput[2][i]; - out[3] = 255; - out += n; - } - } else { - z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, - n); - } - } else if (z->s->img_n == 4) { - if (z->app14_color_transform == 0) { // CMYK - for (i = 0; i < z->s->img_x; ++i) { - stbi_uc m = coutput[3][i]; - out[0] = stbi__blinn_8x8(coutput[0][i], m); - out[1] = stbi__blinn_8x8(coutput[1][i], m); - out[2] = stbi__blinn_8x8(coutput[2][i], m); - out[3] = 255; - out += n; - } - } else if (z->app14_color_transform == 2) { // YCCK - z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, - n); - for (i = 0; i < z->s->img_x; ++i) { - stbi_uc m = coutput[3][i]; - out[0] = stbi__blinn_8x8(255 - out[0], m); - out[1] = stbi__blinn_8x8(255 - out[1], m); - out[2] = stbi__blinn_8x8(255 - out[2], m); - out += n; - } - } else { // YCbCr + alpha? 
Ignore the fourth channel for now - z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, - n); - } - } else - for (i = 0; i < z->s->img_x; ++i) { - out[0] = out[1] = out[2] = y[i]; - out[3] = 255; // not used if n==3 - out += n; - } - } else { - if (is_rgb) { - if (n == 1) - for (i = 0; i < z->s->img_x; ++i) - *out++ = - stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); - else { - for (i = 0; i < z->s->img_x; ++i, out += 2) { - out[0] = - stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); - out[1] = 255; - } - } - } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { - for (i = 0; i < z->s->img_x; ++i) { - stbi_uc m = coutput[3][i]; - stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); - stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); - stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); - out[0] = stbi__compute_y(r, g, b); - out[1] = 255; - out += n; - } - } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { - for (i = 0; i < z->s->img_x; ++i) { - out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); - out[1] = 255; - out += n; - } - } else { - stbi_uc *y = coutput[0]; - if (n == 1) - for (i = 0; i < z->s->img_x; ++i) - out[i] = y[i]; - else - for (i = 0; i < z->s->img_x; ++i) { - *out++ = y[i]; - *out++ = 255; - } - } - } - } - stbi__cleanup_jpeg(z); - *out_x = z->s->img_x; - *out_y = z->s->img_y; - if (comp) - *comp = - z->s->img_n >= 3 ? 
3 : 1; // report original components, not output - return output; - } -} - -static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - unsigned char *result; - stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg)); - STBI_NOTUSED(ri); - j->s = s; - stbi__setup_jpeg(j); - result = load_jpeg_image(j, x, y, comp, req_comp); - STBI_FREE(j); - return result; -} - -static int stbi__jpeg_test(stbi__context *s) { - int r; - stbi__jpeg *j = (stbi__jpeg *)stbi__malloc(sizeof(stbi__jpeg)); - j->s = s; - stbi__setup_jpeg(j); - r = stbi__decode_jpeg_header(j, STBI__SCAN_type); - stbi__rewind(s); - STBI_FREE(j); - return r; -} - -static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) { - if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { - stbi__rewind(j->s); - return 0; - } - if (x) - *x = j->s->img_x; - if (y) - *y = j->s->img_y; - if (comp) - *comp = j->s->img_n >= 3 ? 3 : 1; - return 1; -} - -static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) { - int result; - stbi__jpeg *j = (stbi__jpeg *)(stbi__malloc(sizeof(stbi__jpeg))); - j->s = s; - result = stbi__jpeg_info_raw(j, x, y, comp); - STBI_FREE(j); - return result; -} -#endif - -// public domain zlib decode v0.2 Sean Barrett 2006-11-18 -// simple implementation -// - all input must be provided in an upfront buffer -// - all output is written to a single output buffer (can malloc/realloc) -// performance -// - fast huffman - -#ifndef STBI_NO_ZLIB - -// fast-way is faster to check than jpeg huffman, but slow way is slower -#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables -#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) - -// zlib-style huffman encoding -// (jpegs packs from left, zlib from right, so can't share code) -typedef struct { - stbi__uint16 fast[1 << STBI__ZFAST_BITS]; - stbi__uint16 firstcode[16]; - int maxcode[17]; - stbi__uint16 firstsymbol[16]; - stbi_uc size[288]; - stbi__uint16 
value[288]; -} stbi__zhuffman; - -stbi_inline static int stbi__bitreverse16(int n) { - n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); - n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); - n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); - n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); - return n; -} - -stbi_inline static int stbi__bit_reverse(int v, int bits) { - STBI_ASSERT(bits <= 16); - // to bit reverse n bits, reverse 16 and shift - // e.g. 11 bits, bit reverse and shift away 5 - return stbi__bitreverse16(v) >> (16 - bits); -} - -static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, - int num) { - int i, k = 0; - int code, next_code[16], sizes[17]; - - // DEFLATE spec for generating codes - memset(sizes, 0, sizeof(sizes)); - memset(z->fast, 0, sizeof(z->fast)); - for (i = 0; i < num; ++i) - ++sizes[sizelist[i]]; - sizes[0] = 0; - for (i = 1; i < 16; ++i) - if (sizes[i] > (1 << i)) - return stbi__err("bad sizes", "Corrupt PNG"); - code = 0; - for (i = 1; i < 16; ++i) { - next_code[i] = code; - z->firstcode[i] = (stbi__uint16)code; - z->firstsymbol[i] = (stbi__uint16)k; - code = (code + sizes[i]); - if (sizes[i]) - if (code - 1 >= (1 << i)) - return stbi__err("bad codelengths", "Corrupt PNG"); - z->maxcode[i] = code << (16 - i); // preshift for inner loop - code <<= 1; - k += sizes[i]; - } - z->maxcode[16] = 0x10000; // sentinel - for (i = 0; i < num; ++i) { - int s = sizelist[i]; - if (s) { - int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; - stbi__uint16 fastv = (stbi__uint16)((s << 9) | i); - z->size[c] = (stbi_uc)s; - z->value[c] = (stbi__uint16)i; - if (s <= STBI__ZFAST_BITS) { - int j = stbi__bit_reverse(next_code[s], s); - while (j < (1 << STBI__ZFAST_BITS)) { - z->fast[j] = fastv; - j += (1 << s); - } - } - ++next_code[s]; - } - } - return 1; -} - -// zlib-from-memory implementation for PNG reading -// because PNG allows splitting the zlib stream arbitrarily, -// and it's annoying structurally to have PNG call ZLIB call PNG, 
-// we require PNG read all the IDATs and combine them into a single -// memory buffer - -typedef struct { - stbi_uc *zbuffer, *zbuffer_end; - int num_bits; - stbi__uint32 code_buffer; - - char *zout; - char *zout_start; - char *zout_end; - int z_expandable; - - stbi__zhuffman z_length, z_distance; -} stbi__zbuf; - -stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) { - if (z->zbuffer >= z->zbuffer_end) - return 0; - return *z->zbuffer++; -} - -static void stbi__fill_bits(stbi__zbuf *z) { - do { - STBI_ASSERT(z->code_buffer < (1U << z->num_bits)); - z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits; - z->num_bits += 8; - } while (z->num_bits <= 24); -} - -stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) { - unsigned int k; - if (z->num_bits < n) - stbi__fill_bits(z); - k = z->code_buffer & ((1 << n) - 1); - z->code_buffer >>= n; - z->num_bits -= n; - return k; -} - -static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) { - int b, s, k; - // not resolved by fast table, so compute it the slow way - // use jpeg approach, which requires MSbits at top - k = stbi__bit_reverse(a->code_buffer, 16); - for (s = STBI__ZFAST_BITS + 1;; ++s) - if (k < z->maxcode[s]) - break; - if (s == 16) - return -1; // invalid code! 
- // code size is s, so: - b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s]; - STBI_ASSERT(z->size[b] == s); - a->code_buffer >>= s; - a->num_bits -= s; - return z->value[b]; -} - -stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) { - int b, s; - if (a->num_bits < 16) - stbi__fill_bits(a); - b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; - if (b) { - s = b >> 9; - a->code_buffer >>= s; - a->num_bits -= s; - return b & 511; - } - return stbi__zhuffman_decode_slowpath(a, z); -} - -static int stbi__zexpand(stbi__zbuf *z, char *zout, - int n) // need to make room for n bytes -{ - char *q; - int cur, limit, old_limit __attribute__((unused)); - z->zout = zout; - if (!z->z_expandable) - return stbi__err("output buffer limit", "Corrupt PNG"); - cur = (int)(z->zout - z->zout_start); - limit = old_limit = (int)(z->zout_end - z->zout_start); - while (cur + n > limit) - limit *= 2; - q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); - STBI_NOTUSED(old_limit); - if (q == NULL) - return stbi__err("outofmem", "Out of memory"); - z->zout_start = q; - z->zout = q + cur; - z->zout_end = q + limit; - return 1; -} - -static const int stbi__zlength_base[31] = { - 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, - 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; - -static const int stbi__zlength_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, - 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, - 4, 4, 5, 5, 5, 5, 0, 0, 0}; - -static const int stbi__zdist_base[32] = { - 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, - 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, - 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0, 0}; - -static const int stbi__zdist_extra[32] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, - 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, - 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; - -static int stbi__parse_huffman_block(stbi__zbuf *a) { - char *zout = a->zout; - for (;;) { - int z = stbi__zhuffman_decode(a, &a->z_length); - if (z < 256) { - if 
(z < 0) - return stbi__err("bad huffman code", - "Corrupt PNG"); // error in huffman codes - if (zout >= a->zout_end) { - if (!stbi__zexpand(a, zout, 1)) - return 0; - zout = a->zout; - } - *zout++ = (char)z; - } else { - stbi_uc *p; - int len, dist; - if (z == 256) { - a->zout = zout; - return 1; - } - z -= 257; - len = stbi__zlength_base[z]; - if (stbi__zlength_extra[z]) - len += stbi__zreceive(a, stbi__zlength_extra[z]); - z = stbi__zhuffman_decode(a, &a->z_distance); - if (z < 0) - return stbi__err("bad huffman code", "Corrupt PNG"); - dist = stbi__zdist_base[z]; - if (stbi__zdist_extra[z]) - dist += stbi__zreceive(a, stbi__zdist_extra[z]); - if (zout - a->zout_start < dist) - return stbi__err("bad dist", "Corrupt PNG"); - if (zout + len > a->zout_end) { - if (!stbi__zexpand(a, zout, len)) - return 0; - zout = a->zout; - } - p = (stbi_uc *)(zout - dist); - if (dist == 1) { // run of one byte; common in images. - stbi_uc v = *p; - if (len) { - do - *zout++ = v; - while (--len); - } - } else { - if (len) { - do - *zout++ = *p++; - while (--len); - } - } - } - } -} - -static int stbi__compute_huffman_codes(stbi__zbuf *a) { - static const stbi_uc length_dezigzag[19] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; - stbi__zhuffman z_codelength; - stbi_uc lencodes[286 + 32 + 137]; // padding for maximum single op - stbi_uc codelength_sizes[19]; - int i, n; - - int hlit = stbi__zreceive(a, 5) + 257; - int hdist = stbi__zreceive(a, 5) + 1; - int hclen = stbi__zreceive(a, 4) + 4; - int ntot = hlit + hdist; - - memset(codelength_sizes, 0, sizeof(codelength_sizes)); - for (i = 0; i < hclen; ++i) { - int s = stbi__zreceive(a, 3); - codelength_sizes[length_dezigzag[i]] = (stbi_uc)s; - } - if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) - return 0; - - n = 0; - while (n < ntot) { - int c = stbi__zhuffman_decode(a, &z_codelength); - if (c < 0 || c >= 19) - return stbi__err("bad codelengths", "Corrupt PNG"); - if (c < 16) - 
lencodes[n++] = (stbi_uc)c; - else { - stbi_uc fill = 0; - if (c == 16) { - c = stbi__zreceive(a, 2) + 3; - if (n == 0) - return stbi__err("bad codelengths", "Corrupt PNG"); - fill = lencodes[n - 1]; - } else if (c == 17) - c = stbi__zreceive(a, 3) + 3; - else { - STBI_ASSERT(c == 18); - c = stbi__zreceive(a, 7) + 11; - } - if (ntot - n < c) - return stbi__err("bad codelengths", "Corrupt PNG"); - memset(lencodes + n, fill, c); - n += c; - } - } - if (n != ntot) - return stbi__err("bad codelengths", "Corrupt PNG"); - if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) - return 0; - if (!stbi__zbuild_huffman(&a->z_distance, lencodes + hlit, hdist)) - return 0; - return 1; -} - -static int stbi__parse_uncompressed_block(stbi__zbuf *a) { - stbi_uc header[4]; - int len, nlen, k; - if (a->num_bits & 7) - stbi__zreceive(a, a->num_bits & 7); // discard - // drain the bit-packed data into header - k = 0; - while (a->num_bits > 0) { - header[k++] = - (stbi_uc)(a->code_buffer & 255); // suppress MSVC run-time check - a->code_buffer >>= 8; - a->num_bits -= 8; - } - STBI_ASSERT(a->num_bits == 0); - // now fill header the normal way - while (k < 4) - header[k++] = stbi__zget8(a); - len = header[1] * 256 + header[0]; - nlen = header[3] * 256 + header[2]; - if (nlen != (len ^ 0xffff)) - return stbi__err("zlib corrupt", "Corrupt PNG"); - if (a->zbuffer + len > a->zbuffer_end) - return stbi__err("read past buffer", "Corrupt PNG"); - if (a->zout + len > a->zout_end) - if (!stbi__zexpand(a, a->zout, len)) - return 0; - memcpy(a->zout, a->zbuffer, len); - a->zbuffer += len; - a->zout += len; - return 1; -} - -static int stbi__parse_zlib_header(stbi__zbuf *a) { - int cmf = stbi__zget8(a); - int cm = cmf & 15; - /* int cinfo = cmf >> 4; */ - int flg = stbi__zget8(a); - if ((cmf * 256 + flg) % 31 != 0) - return stbi__err("bad zlib header", "Corrupt PNG"); // zlib spec - if (flg & 32) - return stbi__err("no preset dict", - "Corrupt PNG"); // preset dictionary not allowed in png - if 
(cm != 8) - return stbi__err("bad compression", - "Corrupt PNG"); // DEFLATE required for png - // window = 1 << (8 + cinfo)... but who cares, we fully buffer output - return 1; -} - -static const stbi_uc stbi__zdefault_length[288] = { - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8}; -static const stbi_uc stbi__zdefault_distance[32] = { - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; -/* -Init algorithm: -{ - int i; // use <= to match clearly with spec - for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; - for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; - for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; - for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; - - for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; -} -*/ - -static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) { - int final, type; - if (parse_header) - if (!stbi__parse_zlib_header(a)) - return 0; - a->num_bits = 0; - a->code_buffer = 0; - do { - final = stbi__zreceive(a, 1); - type = stbi__zreceive(a, 2); - if (type == 0) { - if (!stbi__parse_uncompressed_block(a)) - return 0; - } else if (type == 3) 
{ - return 0; - } else { - if (type == 1) { - // use fixed code lengths - if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288)) - return 0; - if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) - return 0; - } else { - if (!stbi__compute_huffman_codes(a)) - return 0; - } - if (!stbi__parse_huffman_block(a)) - return 0; - } - } while (!final); - return 1; -} - -static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, - int parse_header) { - a->zout_start = obuf; - a->zout = obuf; - a->zout_end = obuf + olen; - a->z_expandable = exp; - - return stbi__parse_zlib(a, parse_header); -} - -STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, - int initial_size, int *outlen) { - stbi__zbuf a; - char *p = (char *)stbi__malloc(initial_size); - if (p == NULL) - return NULL; - a.zbuffer = (stbi_uc *)buffer; - a.zbuffer_end = (stbi_uc *)buffer + len; - if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { - if (outlen) - *outlen = (int)(a.zout - a.zout_start); - return a.zout_start; - } else { - STBI_FREE(a.zout_start); - return NULL; - } -} - -STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, - int *outlen) { - return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); -} - -STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, - int len, - int initial_size, - int *outlen, - int parse_header) { - stbi__zbuf a; - char *p = (char *)stbi__malloc(initial_size); - if (p == NULL) - return NULL; - a.zbuffer = (stbi_uc *)buffer; - a.zbuffer_end = (stbi_uc *)buffer + len; - if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { - if (outlen) - *outlen = (int)(a.zout - a.zout_start); - return a.zout_start; - } else { - STBI_FREE(a.zout_start); - return NULL; - } -} - -STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, - char const *ibuffer, int ilen) { - stbi__zbuf a; - a.zbuffer = (stbi_uc *)ibuffer; - a.zbuffer_end = (stbi_uc *)ibuffer + ilen; - if 
(stbi__do_zlib(&a, obuffer, olen, 0, 1)) - return (int)(a.zout - a.zout_start); - else - return -1; -} - -STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, - int *outlen) { - stbi__zbuf a; - char *p = (char *)stbi__malloc(16384); - if (p == NULL) - return NULL; - a.zbuffer = (stbi_uc *)buffer; - a.zbuffer_end = (stbi_uc *)buffer + len; - if (stbi__do_zlib(&a, p, 16384, 1, 0)) { - if (outlen) - *outlen = (int)(a.zout - a.zout_start); - return a.zout_start; - } else { - STBI_FREE(a.zout_start); - return NULL; - } -} - -STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, - const char *ibuffer, int ilen) { - stbi__zbuf a; - a.zbuffer = (stbi_uc *)ibuffer; - a.zbuffer_end = (stbi_uc *)ibuffer + ilen; - if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) - return (int)(a.zout - a.zout_start); - else - return -1; -} -#endif - -// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 -// simple implementation -// - only 8-bit samples -// - no CRC checking -// - allocates lots of intermediate memory -// - avoids problem of streaming data between subsystems -// - avoids explicit window management -// performance -// - uses stb_zlib, a PD zlib implementation with fast huffman decoding - -#ifndef STBI_NO_PNG -typedef struct { - stbi__uint32 length; - stbi__uint32 type; -} stbi__pngchunk; - -static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) { - stbi__pngchunk c; - c.length = stbi__get32be(s); - c.type = stbi__get32be(s); - return c; -} - -static int stbi__check_png_header(stbi__context *s) { - static const stbi_uc png_sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; - int i; - for (i = 0; i < 8; ++i) - if (stbi__get8(s) != png_sig[i]) - return stbi__err("bad png sig", "Not a PNG"); - return 1; -} - -typedef struct { - stbi__context *s; - stbi_uc *idata, *expanded, *out; - int depth; -} stbi__png; - -enum { - STBI__F_none = 0, - STBI__F_sub = 1, - STBI__F_up = 2, - STBI__F_avg = 3, - STBI__F_paeth = 4, - // synthetic 
filters used for first scanline to avoid needing a dummy row of - // 0s - STBI__F_avg_first, - STBI__F_paeth_first -}; - -static stbi_uc first_row_filter[5] = {STBI__F_none, STBI__F_sub, STBI__F_none, - STBI__F_avg_first, STBI__F_paeth_first}; - -static int stbi__paeth(int a, int b, int c) { - int p = a + b - c; - int pa = abs(p - a); - int pb = abs(p - b); - int pc = abs(p - c); - if (pa <= pb && pa <= pc) - return a; - if (pb <= pc) - return b; - return c; -} - -static const stbi_uc stbi__depth_scale_table[9] = {0, 0xff, 0x55, 0, 0x11, - 0, 0, 0, 0x01}; - -// create the png data from post-deflated data -static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, - stbi__uint32 raw_len, int out_n, - stbi__uint32 x, stbi__uint32 y, int depth, - int color) { - int bytes = (depth == 16 ? 2 : 1); - stbi__context *s = a->s; - stbi__uint32 i, j, stride = x * out_n * bytes; - stbi__uint32 img_len, img_width_bytes; - int k; - int img_n = s->img_n; // copy it into a local for later - - int output_bytes = out_n * bytes; - int filter_bytes = img_n * bytes; - int width = x; - - STBI_ASSERT(out_n == s->img_n || out_n == s->img_n + 1); - a->out = (stbi_uc *)stbi__malloc_mad3( - x, y, output_bytes, 0); // extra bytes to write off the end into - if (!a->out) - return stbi__err("outofmem", "Out of memory"); - - if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) - return stbi__err("too large", "Corrupt PNG"); - img_width_bytes = (((img_n * x * depth) + 7) >> 3); - img_len = (img_width_bytes + 1) * y; - - // we used to check for exact match between raw_len and img_len on - // non-interlaced PNGs, but issue #276 reported a PNG in the wild that had - // extra data at the end (all zeros), so just check for raw_len < img_len - // always. 
- if (raw_len < img_len) - return stbi__err("not enough pixels", "Corrupt PNG"); - - for (j = 0; j < y; ++j) { - stbi_uc *cur = a->out + stride * j; - stbi_uc *prior; - int filter = *raw++; - - if (filter > 4) - return stbi__err("invalid filter", "Corrupt PNG"); - - if (depth < 8) { - STBI_ASSERT(img_width_bytes <= x); - cur += - x * out_n - img_width_bytes; // store output to the rightmost img_len - // bytes, so we can decode in place - filter_bytes = 1; - width = img_width_bytes; - } - prior = - cur - - stride; // bugfix: need to compute this after 'cur +=' computation above - - // if first row, use special filter that doesn't sample previous row - if (j == 0) - filter = first_row_filter[filter]; - - // handle first byte explicitly - for (k = 0; k < filter_bytes; ++k) { - switch (filter) { - case STBI__F_none: - cur[k] = raw[k]; - break; - case STBI__F_sub: - cur[k] = raw[k]; - break; - case STBI__F_up: - cur[k] = STBI__BYTECAST(raw[k] + prior[k]); - break; - case STBI__F_avg: - cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1)); - break; - case STBI__F_paeth: - cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0)); - break; - case STBI__F_avg_first: - cur[k] = raw[k]; - break; - case STBI__F_paeth_first: - cur[k] = raw[k]; - break; - } - } - - if (depth == 8) { - if (img_n != out_n) - cur[img_n] = 255; // first pixel - raw += img_n; - cur += out_n; - prior += out_n; - } else if (depth == 16) { - if (img_n != out_n) { - cur[filter_bytes] = 255; // first pixel top byte - cur[filter_bytes + 1] = 255; // first pixel bottom byte - } - raw += filter_bytes; - cur += output_bytes; - prior += output_bytes; - } else { - raw += 1; - cur += 1; - prior += 1; - } - - // this is a little gross, so that we don't switch per-pixel or - // per-component - if (depth < 8 || img_n == out_n) { - int nk = (width - 1) * filter_bytes; -#define STBI__CASE(f) \ - case f: \ - for (k = 0; k < nk; ++k) - switch (filter) { - // "none" filter turns into a memcpy here; make that explicit. 
- case STBI__F_none: - memcpy(cur, raw, nk); - break; - STBI__CASE(STBI__F_sub) { - cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]); - } - break; - STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } - break; - STBI__CASE(STBI__F_avg) { - cur[k] = STBI__BYTECAST(raw[k] + - ((prior[k] + cur[k - filter_bytes]) >> 1)); - } - break; - STBI__CASE(STBI__F_paeth) { - cur[k] = STBI__BYTECAST(raw[k] + - stbi__paeth(cur[k - filter_bytes], prior[k], - prior[k - filter_bytes])); - } - break; - STBI__CASE(STBI__F_avg_first) { - cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1)); - } - break; - STBI__CASE(STBI__F_paeth_first) { - cur[k] = - STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - filter_bytes], 0, 0)); - } - break; - } -#undef STBI__CASE - raw += nk; - } else { - STBI_ASSERT(img_n + 1 == out_n); -#define STBI__CASE(f) \ - case f: \ - for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, \ - cur += output_bytes, prior += output_bytes) \ - for (k = 0; k < filter_bytes; ++k) - switch (filter) { - STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } - break; - STBI__CASE(STBI__F_sub) { - cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]); - } - break; - STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } - break; - STBI__CASE(STBI__F_avg) { - cur[k] = STBI__BYTECAST(raw[k] + - ((prior[k] + cur[k - output_bytes]) >> 1)); - } - break; - STBI__CASE(STBI__F_paeth) { - cur[k] = STBI__BYTECAST(raw[k] + - stbi__paeth(cur[k - output_bytes], prior[k], - prior[k - output_bytes])); - } - break; - STBI__CASE(STBI__F_avg_first) { - cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1)); - } - break; - STBI__CASE(STBI__F_paeth_first) { - cur[k] = - STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0)); - } - break; - } -#undef STBI__CASE - - // the loop above sets the high byte of the pixels' alpha, but for - // 16 bit png files we also need the low byte set. we'll do that here. 
- if (depth == 16) { - cur = a->out + stride * j; // start at the beginning of the row again - for (i = 0; i < x; ++i, cur += output_bytes) { - cur[filter_bytes + 1] = 255; - } - } - } - } - - // we make a separate pass to expand bits to pixels; for performance, - // this could run two scanlines behind the above code, so it won't - // intefere with filtering but will still be in the cache. - if (depth < 8) { - for (j = 0; j < y; ++j) { - stbi_uc *cur = a->out + stride * j; - stbi_uc *in = a->out + stride * j + x * out_n - img_width_bytes; - // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common - // 8-bit path optimal at minimal cost for 1/2/4-bit png guarante byte - // alignment, if width is not multiple of 8/4/2 we'll decode dummy - // trailing data that will be skipped in the later loop - stbi_uc scale = (color == 0) - ? stbi__depth_scale_table[depth] - : 1; // scale grayscale values to 0..255 range - - // note that the final byte might overshoot and write more data than - // desired. we can allocate enough data that this never writes out of - // memory, but it could also overwrite the next scanline. can it overwrite - // non-empty data on the next scanline? yes, consider 1-pixel-wide - // scanlines with 1-bit-per-pixel. 
so we need to explicitly clamp the - // final ones - - if (depth == 4) { - for (k = x * img_n; k >= 2; k -= 2, ++in) { - *cur++ = scale * ((*in >> 4)); - *cur++ = scale * ((*in) & 0x0f); - } - if (k > 0) - *cur++ = scale * ((*in >> 4)); - } else if (depth == 2) { - for (k = x * img_n; k >= 4; k -= 4, ++in) { - *cur++ = scale * ((*in >> 6)); - *cur++ = scale * ((*in >> 4) & 0x03); - *cur++ = scale * ((*in >> 2) & 0x03); - *cur++ = scale * ((*in) & 0x03); - } - if (k > 0) - *cur++ = scale * ((*in >> 6)); - if (k > 1) - *cur++ = scale * ((*in >> 4) & 0x03); - if (k > 2) - *cur++ = scale * ((*in >> 2) & 0x03); - } else if (depth == 1) { - for (k = x * img_n; k >= 8; k -= 8, ++in) { - *cur++ = scale * ((*in >> 7)); - *cur++ = scale * ((*in >> 6) & 0x01); - *cur++ = scale * ((*in >> 5) & 0x01); - *cur++ = scale * ((*in >> 4) & 0x01); - *cur++ = scale * ((*in >> 3) & 0x01); - *cur++ = scale * ((*in >> 2) & 0x01); - *cur++ = scale * ((*in >> 1) & 0x01); - *cur++ = scale * ((*in) & 0x01); - } - if (k > 0) - *cur++ = scale * ((*in >> 7)); - if (k > 1) - *cur++ = scale * ((*in >> 6) & 0x01); - if (k > 2) - *cur++ = scale * ((*in >> 5) & 0x01); - if (k > 3) - *cur++ = scale * ((*in >> 4) & 0x01); - if (k > 4) - *cur++ = scale * ((*in >> 3) & 0x01); - if (k > 5) - *cur++ = scale * ((*in >> 2) & 0x01); - if (k > 6) - *cur++ = scale * ((*in >> 1) & 0x01); - } - if (img_n != out_n) { - int q; - // insert alpha = 255 - cur = a->out + stride * j; - if (img_n == 1) { - for (q = x - 1; q >= 0; --q) { - cur[q * 2 + 1] = 255; - cur[q * 2 + 0] = cur[q]; - } - } else { - STBI_ASSERT(img_n == 3); - for (q = x - 1; q >= 0; --q) { - cur[q * 4 + 3] = 255; - cur[q * 4 + 2] = cur[q * 3 + 2]; - cur[q * 4 + 1] = cur[q * 3 + 1]; - cur[q * 4 + 0] = cur[q * 3 + 0]; - } - } - } - } - } else if (depth == 16) { - // force the image data from big-endian to platform-native. 
- // this is done in a separate pass due to the decoding relying - // on the data being untouched, but could probably be done - // per-line during decode if care is taken. - stbi_uc *cur = a->out; - stbi__uint16 *cur16 = (stbi__uint16 *)cur; - - for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) { - *cur16 = (cur[0] << 8) | cur[1]; - } - } - - return 1; -} - -static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, - stbi__uint32 image_data_len, int out_n, - int depth, int color, int interlaced) { - int bytes = (depth == 16 ? 2 : 1); - int out_bytes = out_n * bytes; - stbi_uc *final; - int p; - if (!interlaced) - return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, - a->s->img_x, a->s->img_y, depth, color); - - // de-interlacing - final = (stbi_uc *)stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); - for (p = 0; p < 7; ++p) { - int xorig[] = {0, 4, 0, 2, 0, 1, 0}; - int yorig[] = {0, 0, 4, 0, 2, 0, 1}; - int xspc[] = {8, 8, 4, 4, 2, 2, 1}; - int yspc[] = {8, 8, 8, 4, 4, 2, 2}; - int i, j, x, y; - // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 - x = (a->s->img_x - xorig[p] + xspc[p] - 1) / xspc[p]; - y = (a->s->img_y - yorig[p] + yspc[p] - 1) / yspc[p]; - if (x && y) { - stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; - if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, - y, depth, color)) { - STBI_FREE(final); - return 0; - } - for (j = 0; j < y; ++j) { - for (i = 0; i < x; ++i) { - int out_y = j * yspc[p] + yorig[p]; - int out_x = i * xspc[p] + xorig[p]; - memcpy(final + out_y * a->s->img_x * out_bytes + out_x * out_bytes, - a->out + (j * x + i) * out_bytes, out_bytes); - } - } - STBI_FREE(a->out); - image_data += img_len; - image_data_len -= img_len; - } - } - a->out = final; - - return 1; -} - -static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) { - stbi__context *s = z->s; - stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi_uc *p = 
z->out; - - // compute color-based transparency, assuming we've - // already got 255 as the alpha value in the output - STBI_ASSERT(out_n == 2 || out_n == 4); - - if (out_n == 2) { - for (i = 0; i < pixel_count; ++i) { - p[1] = (p[0] == tc[0] ? 0 : 255); - p += 2; - } - } else { - for (i = 0; i < pixel_count; ++i) { - if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) - p[3] = 0; - p += 4; - } - } - return 1; -} - -static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], - int out_n) { - stbi__context *s = z->s; - stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi__uint16 *p = (stbi__uint16 *)z->out; - - // compute color-based transparency, assuming we've - // already got 65535 as the alpha value in the output - STBI_ASSERT(out_n == 2 || out_n == 4); - - if (out_n == 2) { - for (i = 0; i < pixel_count; ++i) { - p[1] = (p[0] == tc[0] ? 0 : 65535); - p += 2; - } - } else { - for (i = 0; i < pixel_count; ++i) { - if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) - p[3] = 0; - p += 4; - } - } - return 1; -} - -static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, - int pal_img_n) { - stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; - stbi_uc *p, *temp_out, *orig = a->out; - - p = (stbi_uc *)stbi__malloc_mad2(pixel_count, pal_img_n, 0); - if (p == NULL) - return stbi__err("outofmem", "Out of memory"); - - // between here and free(out) below, exitting would leak - temp_out = p; - - if (pal_img_n == 3) { - for (i = 0; i < pixel_count; ++i) { - int n = orig[i] * 4; - p[0] = palette[n]; - p[1] = palette[n + 1]; - p[2] = palette[n + 2]; - p += 3; - } - } else { - for (i = 0; i < pixel_count; ++i) { - int n = orig[i] * 4; - p[0] = palette[n]; - p[1] = palette[n + 1]; - p[2] = palette[n + 2]; - p[3] = palette[n + 3]; - p += 4; - } - } - STBI_FREE(a->out); - a->out = temp_out; - - STBI_NOTUSED(len); - - return 1; -} - -static int stbi__unpremultiply_on_load = 0; -static int stbi__de_iphone_flag = 0; - -STBIDEF void 
-stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) { - stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply; -} - -STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) { - stbi__de_iphone_flag = flag_true_if_should_convert; -} - -static void stbi__de_iphone(stbi__png *z) { - stbi__context *s = z->s; - stbi__uint32 i, pixel_count = s->img_x * s->img_y; - stbi_uc *p = z->out; - - if (s->img_out_n == 3) { // convert bgr to rgb - for (i = 0; i < pixel_count; ++i) { - stbi_uc t = p[0]; - p[0] = p[2]; - p[2] = t; - p += 3; - } - } else { - STBI_ASSERT(s->img_out_n == 4); - if (stbi__unpremultiply_on_load) { - // convert bgr to rgb and unpremultiply - for (i = 0; i < pixel_count; ++i) { - stbi_uc a = p[3]; - stbi_uc t = p[0]; - if (a) { - stbi_uc half = a / 2; - p[0] = (p[2] * 255 + half) / a; - p[1] = (p[1] * 255 + half) / a; - p[2] = (t * 255 + half) / a; - } else { - p[0] = p[2]; - p[2] = t; - } - p += 4; - } - } else { - // convert bgr to rgb - for (i = 0; i < pixel_count; ++i) { - stbi_uc t = p[0]; - p[0] = p[2]; - p[2] = t; - p += 4; - } - } - } -} - -#define STBI__PNG_TYPE(a, b, c, d) \ - (((unsigned)(a) << 24) + ((unsigned)(b) << 16) + ((unsigned)(c) << 8) + \ - (unsigned)(d)) - -static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) { - stbi_uc palette[1024], pal_img_n = 0; - stbi_uc has_trans = 0, tc[3] = {0}; - stbi__uint16 tc16[3]; - stbi__uint32 ioff = 0, idata_limit = 0, i, pal_len = 0; - int first = 1, k, interlace = 0, color = 0, is_iphone = 0; - stbi__context *s = z->s; - - z->expanded = NULL; - z->idata = NULL; - z->out = NULL; - - if (!stbi__check_png_header(s)) - return 0; - - if (scan == STBI__SCAN_type) - return 1; - - for (;;) { - stbi__pngchunk c = stbi__get_chunk_header(s); - switch (c.type) { - case STBI__PNG_TYPE('C', 'g', 'B', 'I'): - is_iphone = 1; - stbi__skip(s, c.length); - break; - case STBI__PNG_TYPE('I', 'H', 'D', 'R'): { - int comp, filter; - if (!first) - return 
stbi__err("multiple IHDR", "Corrupt PNG"); - first = 0; - if (c.length != 13) - return stbi__err("bad IHDR len", "Corrupt PNG"); - s->img_x = stbi__get32be(s); - if (s->img_x > (1 << 24)) - return stbi__err("too large", "Very large image (corrupt?)"); - s->img_y = stbi__get32be(s); - if (s->img_y > (1 << 24)) - return stbi__err("too large", "Very large image (corrupt?)"); - z->depth = stbi__get8(s); - if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && - z->depth != 16) - return stbi__err("1/2/4/8/16-bit only", - "PNG not supported: 1/2/4/8/16-bit only"); - color = stbi__get8(s); - if (color > 6) - return stbi__err("bad ctype", "Corrupt PNG"); - if (color == 3 && z->depth == 16) - return stbi__err("bad ctype", "Corrupt PNG"); - if (color == 3) - pal_img_n = 3; - else if (color & 1) - return stbi__err("bad ctype", "Corrupt PNG"); - comp = stbi__get8(s); - if (comp) - return stbi__err("bad comp method", "Corrupt PNG"); - filter = stbi__get8(s); - if (filter) - return stbi__err("bad filter method", "Corrupt PNG"); - interlace = stbi__get8(s); - if (interlace > 1) - return stbi__err("bad interlace method", "Corrupt PNG"); - if (!s->img_x || !s->img_y) - return stbi__err("0-pixel image", "Corrupt PNG"); - if (!pal_img_n) { - s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); - if ((1 << 30) / s->img_x / s->img_n < s->img_y) - return stbi__err("too large", "Image too large to decode"); - if (scan == STBI__SCAN_header) - return 1; - } else { - // if paletted, then pal_n is our final components, and - // img_n is # components to decompress/filter. 
- s->img_n = 1; - if ((1 << 30) / s->img_x / 4 < s->img_y) - return stbi__err("too large", "Corrupt PNG"); - // if SCAN_header, have to scan to see if we have a tRNS - } - break; - } - - case STBI__PNG_TYPE('P', 'L', 'T', 'E'): { - if (first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if (c.length > 256 * 3) - return stbi__err("invalid PLTE", "Corrupt PNG"); - pal_len = c.length / 3; - if (pal_len * 3 != c.length) - return stbi__err("invalid PLTE", "Corrupt PNG"); - for (i = 0; i < pal_len; ++i) { - palette[i * 4 + 0] = stbi__get8(s); - palette[i * 4 + 1] = stbi__get8(s); - palette[i * 4 + 2] = stbi__get8(s); - palette[i * 4 + 3] = 255; - } - break; - } - - case STBI__PNG_TYPE('t', 'R', 'N', 'S'): { - if (first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if (z->idata) - return stbi__err("tRNS after IDAT", "Corrupt PNG"); - if (pal_img_n) { - if (scan == STBI__SCAN_header) { - s->img_n = 4; - return 1; - } - if (pal_len == 0) - return stbi__err("tRNS before PLTE", "Corrupt PNG"); - if (c.length > pal_len) - return stbi__err("bad tRNS len", "Corrupt PNG"); - pal_img_n = 4; - for (i = 0; i < c.length; ++i) - palette[i * 4 + 3] = stbi__get8(s); - } else { - if (!(s->img_n & 1)) - return stbi__err("tRNS with alpha", "Corrupt PNG"); - if (c.length != (stbi__uint32)s->img_n * 2) - return stbi__err("bad tRNS len", "Corrupt PNG"); - has_trans = 1; - if (z->depth == 16) { - for (k = 0; k < s->img_n; ++k) - tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is - } else { - for (k = 0; k < s->img_n; ++k) - tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * - stbi__depth_scale_table[z->depth]; // non 8-bit images will - // be larger - } - } - break; - } - - case STBI__PNG_TYPE('I', 'D', 'A', 'T'): { - if (first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if (pal_img_n && !pal_len) - return stbi__err("no PLTE", "Corrupt PNG"); - if (scan == STBI__SCAN_header) { - s->img_n = pal_img_n; - return 1; - } - if ((int)(ioff + c.length) < 
(int)ioff) - return 0; - if (ioff + c.length > idata_limit) { - stbi__uint32 idata_limit_old __attribute__((unused)) = idata_limit; - stbi_uc *p; - if (idata_limit == 0) - idata_limit = c.length > 4096 ? c.length : 4096; - while (ioff + c.length > idata_limit) - idata_limit *= 2; - STBI_NOTUSED(idata_limit_old); - p = (stbi_uc *)STBI_REALLOC_SIZED(z->idata, idata_limit_old, - idata_limit); - if (p == NULL) - return stbi__err("outofmem", "Out of memory"); - z->idata = p; - } - if (!stbi__getn(s, z->idata + ioff, c.length)) - return stbi__err("outofdata", "Corrupt PNG"); - ioff += c.length; - break; - } - - case STBI__PNG_TYPE('I', 'E', 'N', 'D'): { - stbi__uint32 raw_len, bpl; - if (first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if (scan != STBI__SCAN_load) - return 1; - if (z->idata == NULL) - return stbi__err("no IDAT", "Corrupt PNG"); - // initial guess for decoded data size to avoid unnecessary reallocs - bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component - raw_len = bpl * s->img_y * s->img_n /* pixels */ + - s->img_y /* filter mode per row */; - z->expanded = (stbi_uc *)stbi_zlib_decode_malloc_guesssize_headerflag( - (char *)z->idata, ioff, raw_len, (int *)&raw_len, !is_iphone); - if (z->expanded == NULL) - return 0; // zlib should set error - STBI_FREE(z->idata); - z->idata = NULL; - if ((req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n) || - has_trans) - s->img_out_n = s->img_n + 1; - else - s->img_out_n = s->img_n; - if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, - z->depth, color, interlace)) - return 0; - if (has_trans) { - if (z->depth == 16) { - if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) - return 0; - } else { - if (!stbi__compute_transparency(z, tc, s->img_out_n)) - return 0; - } - } - if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) - stbi__de_iphone(z); - if (pal_img_n) { - // pal_img_n == 3 or 4 - s->img_n = pal_img_n; // record the actual colors we had - 
s->img_out_n = pal_img_n; - if (req_comp >= 3) - s->img_out_n = req_comp; - if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) - return 0; - } else if (has_trans) { - // non-paletted image with tRNS -> source image has (constant) alpha - ++s->img_n; - } - STBI_FREE(z->expanded); - z->expanded = NULL; - return 1; - } - - default: - // if critical, fail - if (first) - return stbi__err("first not IHDR", "Corrupt PNG"); - if ((c.type & (1 << 29)) == 0) { -#ifndef STBI_NO_FAILURE_STRINGS - // not threadsafe - static char invalid_chunk[] = "XXXX PNG chunk not known"; - invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); - invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); - invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); - invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); -#endif - return stbi__err(invalid_chunk, - "PNG not supported: unknown PNG chunk type"); - } - stbi__skip(s, c.length); - break; - } - // end of PNG chunk, read and skip CRC - stbi__get32be(s); - } -} - -static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, - stbi__result_info *ri) { - void *result = NULL; - if (req_comp < 0 || req_comp > 4) - return stbi__errpuc("bad req_comp", "Internal error"); - if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { - if (p->depth < 8) - ri->bits_per_channel = 8; - else - ri->bits_per_channel = p->depth; - result = p->out; - p->out = NULL; - if (req_comp && req_comp != p->s->img_out_n) { - if (ri->bits_per_channel == 8) - result = stbi__convert_format((unsigned char *)result, p->s->img_out_n, - req_comp, p->s->img_x, p->s->img_y); - else - result = stbi__convert_format16((stbi__uint16 *)result, p->s->img_out_n, - req_comp, p->s->img_x, p->s->img_y); - p->s->img_out_n = req_comp; - if (result == NULL) - return result; - } - *x = p->s->img_x; - *y = p->s->img_y; - if (n) - *n = p->s->img_n; - } - STBI_FREE(p->out); - p->out = NULL; - STBI_FREE(p->expanded); - p->expanded = NULL; - STBI_FREE(p->idata); - p->idata = NULL; - - return 
result; -} - -static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - stbi__png p; - p.s = s; - return stbi__do_png(&p, x, y, comp, req_comp, ri); -} - -static int stbi__png_test(stbi__context *s) { - int r; - r = stbi__check_png_header(s); - stbi__rewind(s); - return r; -} - -static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) { - if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { - stbi__rewind(p->s); - return 0; - } - if (x) - *x = p->s->img_x; - if (y) - *y = p->s->img_y; - if (comp) - *comp = p->s->img_n; - return 1; -} - -static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) { - stbi__png p; - p.s = s; - return stbi__png_info_raw(&p, x, y, comp); -} - -static int stbi__png_is16(stbi__context *s) { - stbi__png p; - p.s = s; - if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) - return 0; - if (p.depth != 16) { - stbi__rewind(p.s); - return 0; - } - return 1; -} -#endif - -// Microsoft/Windows BMP image - -#ifndef STBI_NO_BMP -static int stbi__bmp_test_raw(stbi__context *s) { - int r; - int sz; - if (stbi__get8(s) != 'B') - return 0; - if (stbi__get8(s) != 'M') - return 0; - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved - stbi__get32le(s); // discard data offset - sz = stbi__get32le(s); - r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); - return r; -} - -static int stbi__bmp_test(stbi__context *s) { - int r = stbi__bmp_test_raw(s); - stbi__rewind(s); - return r; -} - -// returns 0..31 for the highest set bit -static int stbi__high_bit(unsigned int z) { - int n = 0; - if (z == 0) - return -1; - if (z >= 0x10000) { - n += 16; - z >>= 16; - } - if (z >= 0x00100) { - n += 8; - z >>= 8; - } - if (z >= 0x00010) { - n += 4; - z >>= 4; - } - if (z >= 0x00004) { - n += 2; - z >>= 2; - } - if (z >= 0x00002) { - n += 1; /* >>= 1;*/ - } - return n; -} - -static int stbi__bitcount(unsigned int 
a) { - a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 - a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 - a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits - a = (a + (a >> 8)); // max 16 per 8 bits - a = (a + (a >> 16)); // max 32 per 8 bits - return a & 0xff; -} - -// extract an arbitrarily-aligned N-bit value (N=bits) -// from v, and then make it 8-bits long and fractionally -// extend it to full full range. -static int stbi__shiftsigned(unsigned int v, int shift, int bits) { - static unsigned int mul_table[9] = { - 0, - 0xff /*0b11111111*/, - 0x55 /*0b01010101*/, - 0x49 /*0b01001001*/, - 0x11 /*0b00010001*/, - 0x21 /*0b00100001*/, - 0x41 /*0b01000001*/, - 0x81 /*0b10000001*/, - 0x01 /*0b00000001*/, - }; - static unsigned int shift_table[9] = { - 0, 0, 0, 1, 0, 2, 4, 6, 0, - }; - if (shift < 0) - v <<= -shift; - else - v >>= shift; - STBI_ASSERT(v < 256); - v >>= (8 - bits); - STBI_ASSERT(bits >= 0 && bits <= 8); - return (int)((unsigned)v * mul_table[bits]) >> shift_table[bits]; -} - -typedef struct { - int bpp, offset, hsz; - unsigned int mr, mg, mb, ma, all_a; -} stbi__bmp_data; - -static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) { - int hsz; - if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') - return stbi__errpuc("not BMP", "Corrupt BMP"); - stbi__get32le(s); // discard filesize - stbi__get16le(s); // discard reserved - stbi__get16le(s); // discard reserved - info->offset = stbi__get32le(s); - info->hsz = hsz = stbi__get32le(s); - info->mr = info->mg = info->mb = info->ma = 0; - - if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) - return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); - if (hsz == 12) { - s->img_x = stbi__get16le(s); - s->img_y = stbi__get16le(s); - } else { - s->img_x = stbi__get32le(s); - s->img_y = stbi__get32le(s); - } - if (stbi__get16le(s) != 1) - return stbi__errpuc("bad BMP", "bad BMP"); - info->bpp = stbi__get16le(s); - if (hsz != 12) { - 
int compress = stbi__get32le(s); - if (compress == 1 || compress == 2) - return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); - stbi__get32le(s); // discard sizeof - stbi__get32le(s); // discard hres - stbi__get32le(s); // discard vres - stbi__get32le(s); // discard colorsused - stbi__get32le(s); // discard max important - if (hsz == 40 || hsz == 56) { - if (hsz == 56) { - stbi__get32le(s); - stbi__get32le(s); - stbi__get32le(s); - stbi__get32le(s); - } - if (info->bpp == 16 || info->bpp == 32) { - if (compress == 0) { - if (info->bpp == 32) { - info->mr = 0xffu << 16; - info->mg = 0xffu << 8; - info->mb = 0xffu << 0; - info->ma = 0xffu << 24; - info->all_a = 0; // if all_a is 0 at end, then we loaded alpha - // channel but it was all 0 - } else { - info->mr = 31u << 10; - info->mg = 31u << 5; - info->mb = 31u << 0; - } - } else if (compress == 3) { - info->mr = stbi__get32le(s); - info->mg = stbi__get32le(s); - info->mb = stbi__get32le(s); - // not documented, but generated by photoshop and handled by mspaint - if (info->mr == info->mg && info->mg == info->mb) { - // ?!?!? 
- return stbi__errpuc("bad BMP", "bad BMP"); - } - } else - return stbi__errpuc("bad BMP", "bad BMP"); - } - } else { - int i; - if (hsz != 108 && hsz != 124) - return stbi__errpuc("bad BMP", "bad BMP"); - info->mr = stbi__get32le(s); - info->mg = stbi__get32le(s); - info->mb = stbi__get32le(s); - info->ma = stbi__get32le(s); - stbi__get32le(s); // discard color space - for (i = 0; i < 12; ++i) - stbi__get32le(s); // discard color space parameters - if (hsz == 124) { - stbi__get32le(s); // discard rendering intent - stbi__get32le(s); // discard offset of profile data - stbi__get32le(s); // discard size of profile data - stbi__get32le(s); // discard reserved - } - } - } - return (void *)1; -} - -static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - stbi_uc *out; - unsigned int mr = 0, mg = 0, mb = 0, ma = 0, all_a; - stbi_uc pal[256][4]; - int psize = 0, i, j, width; - int flip_vertically, pad, target; - stbi__bmp_data info; - STBI_NOTUSED(ri); - - info.all_a = 255; - if (stbi__bmp_parse_header(s, &info) == NULL) - return NULL; // error code already set - - flip_vertically = ((int)s->img_y) > 0; - s->img_y = abs((int)s->img_y); - - mr = info.mr; - mg = info.mg; - mb = info.mb; - ma = info.ma; - all_a = info.all_a; - - if (info.hsz == 12) { - if (info.bpp < 24) - psize = (info.offset - 14 - 24) / 3; - } else { - if (info.bpp < 16) - psize = (info.offset - 14 - info.hsz) >> 2; - } - - if (info.bpp == 24 && ma == 0xff000000) - s->img_n = 3; - else - s->img_n = ma ? 
4 : 3; - if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 - target = req_comp; - else - target = s->img_n; // if they want monochrome, we'll post-convert - - // sanity-check size - if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) - return stbi__errpuc("too large", "Corrupt BMP"); - - out = (stbi_uc *)stbi__malloc_mad3(target, s->img_x, s->img_y, 0); - if (!out) - return stbi__errpuc("outofmem", "Out of memory"); - if (info.bpp < 16) { - int z = 0; - if (psize == 0 || psize > 256) { - STBI_FREE(out); - return stbi__errpuc("invalid", "Corrupt BMP"); - } - for (i = 0; i < psize; ++i) { - pal[i][2] = stbi__get8(s); - pal[i][1] = stbi__get8(s); - pal[i][0] = stbi__get8(s); - if (info.hsz != 12) - stbi__get8(s); - pal[i][3] = 255; - } - stbi__skip(s, - info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); - if (info.bpp == 1) - width = (s->img_x + 7) >> 3; - else if (info.bpp == 4) - width = (s->img_x + 1) >> 1; - else if (info.bpp == 8) - width = s->img_x; - else { - STBI_FREE(out); - return stbi__errpuc("bad bpp", "Corrupt BMP"); - } - pad = (-width) & 3; - if (info.bpp == 1) { - for (j = 0; j < (int)s->img_y; ++j) { - int bit_offset = 7, v = stbi__get8(s); - for (i = 0; i < (int)s->img_x; ++i) { - int color = (v >> bit_offset) & 0x1; - out[z++] = pal[color][0]; - out[z++] = pal[color][1]; - out[z++] = pal[color][2]; - if (target == 4) - out[z++] = 255; - if (i + 1 == (int)s->img_x) - break; - if ((--bit_offset) < 0) { - bit_offset = 7; - v = stbi__get8(s); - } - } - stbi__skip(s, pad); - } - } else { - for (j = 0; j < (int)s->img_y; ++j) { - for (i = 0; i < (int)s->img_x; i += 2) { - int v = stbi__get8(s), v2 = 0; - if (info.bpp == 4) { - v2 = v & 15; - v >>= 4; - } - out[z++] = pal[v][0]; - out[z++] = pal[v][1]; - out[z++] = pal[v][2]; - if (target == 4) - out[z++] = 255; - if (i + 1 == (int)s->img_x) - break; - v = (info.bpp == 8) ? 
stbi__get8(s) : v2; - out[z++] = pal[v][0]; - out[z++] = pal[v][1]; - out[z++] = pal[v][2]; - if (target == 4) - out[z++] = 255; - } - stbi__skip(s, pad); - } - } - } else { - int rshift = 0, gshift = 0, bshift = 0, ashift = 0, rcount = 0, gcount = 0, - bcount = 0, acount = 0; - int z = 0; - int easy = 0; - stbi__skip(s, info.offset - 14 - info.hsz); - if (info.bpp == 24) - width = 3 * s->img_x; - else if (info.bpp == 16) - width = 2 * s->img_x; - else /* bpp = 32 and pad = 0 */ - width = 0; - pad = (-width) & 3; - if (info.bpp == 24) { - easy = 1; - } else if (info.bpp == 32) { - if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) - easy = 2; - } - if (!easy) { - if (!mr || !mg || !mb) { - STBI_FREE(out); - return stbi__errpuc("bad masks", "Corrupt BMP"); - } - // right shift amt to put high bit in position #7 - rshift = stbi__high_bit(mr) - 7; - rcount = stbi__bitcount(mr); - gshift = stbi__high_bit(mg) - 7; - gcount = stbi__bitcount(mg); - bshift = stbi__high_bit(mb) - 7; - bcount = stbi__bitcount(mb); - ashift = stbi__high_bit(ma) - 7; - acount = stbi__bitcount(ma); - } - for (j = 0; j < (int)s->img_y; ++j) { - if (easy) { - for (i = 0; i < (int)s->img_x; ++i) { - unsigned char a; - out[z + 2] = stbi__get8(s); - out[z + 1] = stbi__get8(s); - out[z + 0] = stbi__get8(s); - z += 3; - a = (easy == 2 ? stbi__get8(s) : 255); - all_a |= a; - if (target == 4) - out[z++] = a; - } - } else { - int bpp = info.bpp; - for (i = 0; i < (int)s->img_x; ++i) { - stbi__uint32 v = - (bpp == 16 ? (stbi__uint32)stbi__get16le(s) : stbi__get32le(s)); - unsigned int a; - out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); - out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); - out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); - a = (ma ? 
stbi__shiftsigned(v & ma, ashift, acount) : 255); - all_a |= a; - if (target == 4) - out[z++] = STBI__BYTECAST(a); - } - } - stbi__skip(s, pad); - } - } - - // if alpha channel is all 0s, replace with all 255s - if (target == 4 && all_a == 0) - for (i = 4 * s->img_x * s->img_y - 1; i >= 0; i -= 4) - out[i] = 255; - - if (flip_vertically) { - stbi_uc t; - for (j = 0; j< ((int)s->img_y >> 1); ++j) { - stbi_uc *p1 = out + j * s->img_x * target; - stbi_uc *p2 = out + (s->img_y - 1 - j) * s->img_x * target; - for (i = 0; i < (int)s->img_x * target; ++i) { - t = p1[i]; - p1[i] = p2[i]; - p2[i] = t; - } - } - } - - if (req_comp && req_comp != target) { - out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); - if (out == NULL) - return out; // stbi__convert_format frees input on failure - } - - *x = s->img_x; - *y = s->img_y; - if (comp) - *comp = s->img_n; - return out; -} -#endif - -// Targa Truevision - TGA -// by Jonathan Dummer -#ifndef STBI_NO_TGA -// returns STBI_rgb or whatever, 0 on error -static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int *is_rgb16) { - // only RGB or RGBA (incl. 
16bit) or grey allowed - if (is_rgb16) - *is_rgb16 = 0; - switch (bits_per_pixel) { - case 8: - return STBI_grey; - case 16: - if (is_grey) - return STBI_grey_alpha; - // fallthrough - case 15: - if (is_rgb16) - *is_rgb16 = 1; - return STBI_rgb; - case 24: // fallthrough - case 32: - return bits_per_pixel / 8; - default: - return 0; - } -} - -static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) { - int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, - tga_colormap_bpp; - int sz, tga_colormap_type; - stbi__get8(s); // discard Offset - tga_colormap_type = stbi__get8(s); // colormap type - if (tga_colormap_type > 1) { - stbi__rewind(s); - return 0; // only RGB or indexed allowed - } - tga_image_type = stbi__get8(s); // image type - if (tga_colormap_type == 1) { // colormapped (paletted) image - if (tga_image_type != 1 && tga_image_type != 9) { - stbi__rewind(s); - return 0; - } - stbi__skip(s, - 4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) { - stbi__rewind(s); - return 0; - } - stbi__skip(s, 4); // skip image x and y origin - tga_colormap_bpp = sz; - } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE - if ((tga_image_type != 2) && (tga_image_type != 3) && - (tga_image_type != 10) && (tga_image_type != 11)) { - stbi__rewind(s); - return 0; // only RGB or grey allowed, +/- RLE - } - stbi__skip(s, 9); // skip colormap specification and image x/y origin - tga_colormap_bpp = 0; - } - tga_w = stbi__get16le(s); - if (tga_w < 1) { - stbi__rewind(s); - return 0; // test width - } - tga_h = stbi__get16le(s); - if (tga_h < 1) { - stbi__rewind(s); - return 0; // test height - } - tga_bits_per_pixel = stbi__get8(s); // bits per pixel - stbi__get8(s); // ignore alpha bits - if (tga_colormap_bpp != 0) { - if ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { - // when 
using a colormap, tga_bits_per_pixel is the size of the indexes - // I don't think anything but 8 or 16bit indexes makes sense - stbi__rewind(s); - return 0; - } - tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); - } else { - tga_comp = stbi__tga_get_comp( - tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), - NULL); - } - if (!tga_comp) { - stbi__rewind(s); - return 0; - } - if (x) - *x = tga_w; - if (y) - *y = tga_h; - if (comp) - *comp = tga_comp; - return 1; // seems to have passed everything -} - -static int stbi__tga_test(stbi__context *s) { - int res = 0; - int sz, tga_color_type; - stbi__get8(s); // discard Offset - tga_color_type = stbi__get8(s); // color type - if (tga_color_type > 1) - goto errorEnd; // only RGB or indexed allowed - sz = stbi__get8(s); // image type - if (tga_color_type == 1) { // colormapped (paletted) image - if (sz != 1 && sz != 9) - goto errorEnd; // colortype 1 demands image type 1 or 9 - stbi__skip(s, - 4); // skip index of first colormap entry and number of entries - sz = stbi__get8(s); // check bits per palette color entry - if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) - goto errorEnd; - stbi__skip(s, 4); // skip image x and y origin - } else { // "normal" image w/o colormap - if ((sz != 2) && (sz != 3) && (sz != 10) && (sz != 11)) - goto errorEnd; // only RGB or grey allowed, +/- RLE - stbi__skip(s, 9); // skip colormap specification and image x/y origin - } - if (stbi__get16le(s) < 1) - goto errorEnd; // test width - if (stbi__get16le(s) < 1) - goto errorEnd; // test height - sz = stbi__get8(s); // bits per pixel - if ((tga_color_type == 1) && (sz != 8) && (sz != 16)) - goto errorEnd; // for colormapped images, bpp is size of an index - if ((sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32)) - goto errorEnd; - - res = 1; // if we got this far, everything's good and we can return 1 instead - // of 0 - -errorEnd: - stbi__rewind(s); - return res; -} - -// read 
16bit value and convert to 24bit RGB -static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc *out) { - stbi__uint16 px = (stbi__uint16)stbi__get16le(s); - stbi__uint16 fiveBitMask = 31; - // we have 3 channels with 5bits each - int r = (px >> 10) & fiveBitMask; - int g = (px >> 5) & fiveBitMask; - int b = px & fiveBitMask; - // Note that this saves the data in RGB(A) order, so it doesn't need to be - // swapped later - out[0] = (stbi_uc)((r * 255) / 31); - out[1] = (stbi_uc)((g * 255) / 31); - out[2] = (stbi_uc)((b * 255) / 31); - - // some people claim that the most significant bit might be used for alpha - // (possibly if an alpha-bit is set in the "image descriptor byte") - // but that only made 16bit test images completely translucent.. - // so let's treat all 15 and 16bit TGAs as RGB with no alpha. -} - -static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - // read in the TGA header stuff - int tga_offset = stbi__get8(s); - int tga_indexed = stbi__get8(s); - int tga_image_type = stbi__get8(s); - int tga_is_RLE = 0; - int tga_palette_start = stbi__get16le(s); - int tga_palette_len = stbi__get16le(s); - int tga_palette_bits = stbi__get8(s); - int tga_x_origin = stbi__get16le(s); - int tga_y_origin = stbi__get16le(s); - int tga_width = stbi__get16le(s); - int tga_height = stbi__get16le(s); - int tga_bits_per_pixel = stbi__get8(s); - int tga_comp, tga_rgb16 = 0; - int tga_inverted = stbi__get8(s); - // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused - // (useless?) 
- // image data - unsigned char *tga_data; - unsigned char *tga_palette = NULL; - int i, j; - unsigned char raw_data[4] = {0}; - int RLE_count = 0; - int RLE_repeating = 0; - int read_next_pixel = 1; - STBI_NOTUSED(ri); - STBI_NOTUSED(tga_x_origin); // @TODO - STBI_NOTUSED(tga_y_origin); // @TODO - - // do a tiny bit of precessing - if (tga_image_type >= 8) { - tga_image_type -= 8; - tga_is_RLE = 1; - } - tga_inverted = 1 - ((tga_inverted >> 5) & 1); - - // If I'm paletted, then I'll use the number of bits from the palette - if (tga_indexed) - tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); - else - tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), - &tga_rgb16); - - if (!tga_comp) // shouldn't really happen, stbi__tga_test() should have - // ensured basic consistency - return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); - - // tga info - *x = tga_width; - *y = tga_height; - if (comp) - *comp = tga_comp; - - if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) - return stbi__errpuc("too large", "Corrupt TGA"); - - tga_data = - (unsigned char *)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); - if (!tga_data) - return stbi__errpuc("outofmem", "Out of memory"); - - // skip to the data's starting position (offset usually = 0) - stbi__skip(s, tga_offset); - - if (!tga_indexed && !tga_is_RLE && !tga_rgb16) { - for (i = 0; i < tga_height; ++i) { - int row = tga_inverted ? tga_height - i - 1 : i; - stbi_uc *tga_row = tga_data + row * tga_width * tga_comp; - stbi__getn(s, tga_row, tga_width * tga_comp); - } - } else { - // do I need to load a palette? - if (tga_indexed) { - // any data to skip? 
(offset usually = 0) - stbi__skip(s, tga_palette_start); - // load the palette - tga_palette = - (unsigned char *)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); - if (!tga_palette) { - STBI_FREE(tga_data); - return stbi__errpuc("outofmem", "Out of memory"); - } - if (tga_rgb16) { - stbi_uc *pal_entry = tga_palette; - STBI_ASSERT(tga_comp == STBI_rgb); - for (i = 0; i < tga_palette_len; ++i) { - stbi__tga_read_rgb16(s, pal_entry); - pal_entry += tga_comp; - } - } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { - STBI_FREE(tga_data); - STBI_FREE(tga_palette); - return stbi__errpuc("bad palette", "Corrupt TGA"); - } - } - // load the data - for (i = 0; i < tga_width * tga_height; ++i) { - // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? - if (tga_is_RLE) { - if (RLE_count == 0) { - // yep, get the next byte as a RLE command - int RLE_cmd = stbi__get8(s); - RLE_count = 1 + (RLE_cmd & 127); - RLE_repeating = RLE_cmd >> 7; - read_next_pixel = 1; - } else if (!RLE_repeating) { - read_next_pixel = 1; - } - } else { - read_next_pixel = 1; - } - // OK, if I need to read a pixel, do it now - if (read_next_pixel) { - // load however much data we did have - if (tga_indexed) { - // read in index, then perform the lookup - int pal_idx = - (tga_bits_per_pixel == 8) ? 
stbi__get8(s) : stbi__get16le(s); - if (pal_idx >= tga_palette_len) { - // invalid index - pal_idx = 0; - } - pal_idx *= tga_comp; - for (j = 0; j < tga_comp; ++j) { - raw_data[j] = tga_palette[pal_idx + j]; - } - } else if (tga_rgb16) { - STBI_ASSERT(tga_comp == STBI_rgb); - stbi__tga_read_rgb16(s, raw_data); - } else { - // read in the data raw - for (j = 0; j < tga_comp; ++j) { - raw_data[j] = stbi__get8(s); - } - } - // clear the reading flag for the next pixel - read_next_pixel = 0; - } // end of reading a pixel - - // copy data - for (j = 0; j < tga_comp; ++j) - tga_data[i * tga_comp + j] = raw_data[j]; - - // in case we're in RLE mode, keep counting down - --RLE_count; - } - // do I need to invert the image? - if (tga_inverted) { - for (j = 0; j * 2 < tga_height; ++j) { - int index1 = j * tga_width * tga_comp; - int index2 = (tga_height - 1 - j) * tga_width * tga_comp; - for (i = tga_width * tga_comp; i > 0; --i) { - unsigned char temp = tga_data[index1]; - tga_data[index1] = tga_data[index2]; - tga_data[index2] = temp; - ++index1; - ++index2; - } - } - } - // clear my palette, if I had one - if (tga_palette != NULL) { - STBI_FREE(tga_palette); - } - } - - // swap RGB - if the source data was RGB16, it already is in the right order - if (tga_comp >= 3 && !tga_rgb16) { - unsigned char *tga_pixel = tga_data; - for (i = 0; i < tga_width * tga_height; ++i) { - unsigned char temp = tga_pixel[0]; - tga_pixel[0] = tga_pixel[2]; - tga_pixel[2] = temp; - tga_pixel += tga_comp; - } - } - - // convert to target component count - if (req_comp && req_comp != tga_comp) - tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, - tga_height); - - // the things I do to get rid of an error message, and yet keep - // Microsoft's C compilers happy... 
[8^( - tga_palette_start = tga_palette_len = tga_palette_bits = tga_x_origin = - tga_y_origin = 0; - STBI_NOTUSED(tga_palette_start); - // OK, done - return tga_data; -} -#endif - -// ************************************************************************************************* -// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, -// tweaked by STB - -#ifndef STBI_NO_PSD -static int stbi__psd_test(stbi__context *s) { - int r = (stbi__get32be(s) == 0x38425053); - stbi__rewind(s); - return r; -} - -static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) { - int count, nleft, len; - - count = 0; - while ((nleft = pixelCount - count) > 0) { - len = stbi__get8(s); - if (len == 128) { - // No-op. - } else if (len < 128) { - // Copy next len+1 bytes literally. - len++; - if (len > nleft) - return 0; // corrupt data - count += len; - while (len) { - *p = stbi__get8(s); - p += 4; - len--; - } - } else if (len > 128) { - stbi_uc val; - // Next -len+1 bytes in the dest are replicated from next source byte. - // (Interpret len as a negative 8-bit int.) - len = 257 - len; - if (len > nleft) - return 0; // corrupt data - val = stbi__get8(s); - count += len; - while (len) { - *p = val; - p += 4; - len--; - } - } - } - - return 1; -} - -static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri, int bpc) { - int pixelCount; - int channelCount, compression; - int channel, i; - int bitdepth; - int w, h; - stbi_uc *out; - STBI_NOTUSED(ri); - - // Check identifier - if (stbi__get32be(s) != 0x38425053) // "8BPS" - return stbi__errpuc("not PSD", "Corrupt PSD image"); - - // Check file type version. - if (stbi__get16be(s) != 1) - return stbi__errpuc("wrong version", "Unsupported version of PSD image"); - - // Skip 6 reserved bytes. - stbi__skip(s, 6); - - // Read the number of channels (R, G, B, A, etc). 
- channelCount = stbi__get16be(s); - if (channelCount < 0 || channelCount > 16) - return stbi__errpuc("wrong channel count", - "Unsupported number of channels in PSD image"); - - // Read the rows and columns of the image. - h = stbi__get32be(s); - w = stbi__get32be(s); - - // Make sure the depth is 8 bits. - bitdepth = stbi__get16be(s); - if (bitdepth != 8 && bitdepth != 16) - return stbi__errpuc("unsupported bit depth", - "PSD bit depth is not 8 or 16 bit"); - - // Make sure the color mode is RGB. - // Valid options are: - // 0: Bitmap - // 1: Grayscale - // 2: Indexed color - // 3: RGB color - // 4: CMYK color - // 7: Multichannel - // 8: Duotone - // 9: Lab color - if (stbi__get16be(s) != 3) - return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); - - // Skip the Mode Data. (It's the palette for indexed color; other info for - // other modes.) - stbi__skip(s, stbi__get32be(s)); - - // Skip the image resources. (resolution, pen tool paths, etc) - stbi__skip(s, stbi__get32be(s)); - - // Skip the reserved data. - stbi__skip(s, stbi__get32be(s)); - - // Find out if the data is compressed. - // Known values: - // 0: no compression - // 1: RLE compressed - compression = stbi__get16be(s); - if (compression > 1) - return stbi__errpuc("bad compression", - "PSD has an unknown compression format"); - - // Check size - if (!stbi__mad3sizes_valid(4, w, h, 0)) - return stbi__errpuc("too large", "Corrupt PSD"); - - // Create the destination image. - - if (!compression && bitdepth == 16 && bpc == 16) { - out = (stbi_uc *)stbi__malloc_mad3(8, w, h, 0); - ri->bits_per_channel = 16; - } else - out = (stbi_uc *)stbi__malloc(4 * w * h); - - if (!out) - return stbi__errpuc("outofmem", "Out of memory"); - pixelCount = w * h; - - // Initialize the data to zero. - // memset( out, 0, pixelCount * 4 ); - - // Finally, the image data. 
- if (compression) { - // RLE as used by .PSD and .TIFF - // Loop until you get the number of unpacked bytes you are expecting: - // Read the next source byte into n. - // If n is between 0 and 127 inclusive, copy the next n+1 bytes - // literally. Else if n is between -127 and -1 inclusive, copy the next - // byte -n+1 times. Else if n is 128, noop. - // Endloop - - // The RLE-compressed data is preceded by a 2-byte data count for each row - // in the data, which we're going to just skip. - stbi__skip(s, h * channelCount * 2); - - // Read the RLE data by channel. - for (channel = 0; channel < 4; channel++) { - stbi_uc *p; - - p = out + channel; - if (channel >= channelCount) { - // Fill this channel with default data. - for (i = 0; i < pixelCount; i++, p += 4) - *p = (channel == 3 ? 255 : 0); - } else { - // Read the RLE data. - if (!stbi__psd_decode_rle(s, p, pixelCount)) { - STBI_FREE(out); - return stbi__errpuc("corrupt", "bad RLE data"); - } - } - } - - } else { - // We're at the raw image data. It's each channel in order (Red, Green, - // Blue, Alpha, ...) where each channel consists of an 8-bit (or 16-bit) - // value for each pixel in the image. - - // Read the data by channel. - for (channel = 0; channel < 4; channel++) { - if (channel >= channelCount) { - // Fill this channel with default data. - if (bitdepth == 16 && bpc == 16) { - stbi__uint16 *q = ((stbi__uint16 *)out) + channel; - stbi__uint16 val = channel == 3 ? 65535 : 0; - for (i = 0; i < pixelCount; i++, q += 4) - *q = val; - } else { - stbi_uc *p = out + channel; - stbi_uc val = channel == 3 ? 
255 : 0; - for (i = 0; i < pixelCount; i++, p += 4) - *p = val; - } - } else { - if (ri->bits_per_channel == 16) { // output bpc - stbi__uint16 *q = ((stbi__uint16 *)out) + channel; - for (i = 0; i < pixelCount; i++, q += 4) - *q = (stbi__uint16)stbi__get16be(s); - } else { - stbi_uc *p = out + channel; - if (bitdepth == 16) { // input bpc - for (i = 0; i < pixelCount; i++, p += 4) - *p = (stbi_uc)(stbi__get16be(s) >> 8); - } else { - for (i = 0; i < pixelCount; i++, p += 4) - *p = stbi__get8(s); - } - } - } - } - } - - // remove weird white matte from PSD - if (channelCount >= 4) { - if (ri->bits_per_channel == 16) { - for (i = 0; i < w * h; ++i) { - stbi__uint16 *pixel = (stbi__uint16 *)out + 4 * i; - if (pixel[3] != 0 && pixel[3] != 65535) { - float a = pixel[3] / 65535.0f; - float ra = 1.0f / a; - float inv_a = 65535.0f * (1 - ra); - pixel[0] = (stbi__uint16)(pixel[0] * ra + inv_a); - pixel[1] = (stbi__uint16)(pixel[1] * ra + inv_a); - pixel[2] = (stbi__uint16)(pixel[2] * ra + inv_a); - } - } - } else { - for (i = 0; i < w * h; ++i) { - unsigned char *pixel = out + 4 * i; - if (pixel[3] != 0 && pixel[3] != 255) { - float a = pixel[3] / 255.0f; - float ra = 1.0f / a; - float inv_a = 255.0f * (1 - ra); - pixel[0] = (unsigned char)(pixel[0] * ra + inv_a); - pixel[1] = (unsigned char)(pixel[1] * ra + inv_a); - pixel[2] = (unsigned char)(pixel[2] * ra + inv_a); - } - } - } - } - - // convert to desired output format - if (req_comp && req_comp != 4) { - if (ri->bits_per_channel == 16) - out = (stbi_uc *)stbi__convert_format16((stbi__uint16 *)out, 4, req_comp, - w, h); - else - out = stbi__convert_format(out, 4, req_comp, w, h); - if (out == NULL) - return out; // stbi__convert_format frees input on failure - } - - if (comp) - *comp = 4; - *y = h; - *x = w; - - return out; -} -#endif - -// ************************************************************************************************* -// Softimage PIC loader -// by Tom Seddon -// -// See 
http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format -// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ - -#ifndef STBI_NO_PIC -static int stbi__pic_is4(stbi__context *s, const char *str) { - int i; - for (i = 0; i < 4; ++i) - if (stbi__get8(s) != (stbi_uc)str[i]) - return 0; - - return 1; -} - -static int stbi__pic_test_core(stbi__context *s) { - int i; - - if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) - return 0; - - for (i = 0; i < 84; ++i) - stbi__get8(s); - - if (!stbi__pic_is4(s, "PICT")) - return 0; - - return 1; -} - -typedef struct { - stbi_uc size, type, channel; -} stbi__pic_packet; - -static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) { - int mask = 0x80, i; - - for (i = 0; i < 4; ++i, mask >>= 1) { - if (channel & mask) { - if (stbi__at_eof(s)) - return stbi__errpuc("bad file", "PIC file too short"); - dest[i] = stbi__get8(s); - } - } - - return dest; -} - -static void stbi__copyval(int channel, stbi_uc *dest, const stbi_uc *src) { - int mask = 0x80, i; - - for (i = 0; i < 4; ++i, mask >>= 1) - if (channel & mask) - dest[i] = src[i]; -} - -static stbi_uc *stbi__pic_load_core(stbi__context *s, int width, int height, - int *comp, stbi_uc *result) { - int act_comp = 0, num_packets = 0, y, chained; - stbi__pic_packet packets[10]; - - // this will (should...) cater for even some bizarre stuff like having data - // for the same channel in multiple packets. 
- do { - stbi__pic_packet *packet; - - if (num_packets == sizeof(packets) / sizeof(packets[0])) - return stbi__errpuc("bad format", "too many packets"); - - packet = &packets[num_packets++]; - - chained = stbi__get8(s); - packet->size = stbi__get8(s); - packet->type = stbi__get8(s); - packet->channel = stbi__get8(s); - - act_comp |= packet->channel; - - if (stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (reading packets)"); - if (packet->size != 8) - return stbi__errpuc("bad format", "packet isn't 8bpp"); - } while (chained); - - *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? - - for (y = 0; y < height; ++y) { - int packet_idx; - - for (packet_idx = 0; packet_idx < num_packets; ++packet_idx) { - stbi__pic_packet *packet = &packets[packet_idx]; - stbi_uc *dest = result + y * width * 4; - - switch (packet->type) { - default: - return stbi__errpuc("bad format", "packet has bad compression type"); - - case 0: { // uncompressed - int x; - - for (x = 0; x < width; ++x, dest += 4) - if (!stbi__readval(s, packet->channel, dest)) - return 0; - break; - } - - case 1: // Pure RLE - { - int left = width, i; - - while (left > 0) { - stbi_uc count, value[4]; - - count = stbi__get8(s); - if (stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (pure read count)"); - - if (count > left) - count = (stbi_uc)left; - - if (!stbi__readval(s, packet->channel, value)) - return 0; - - for (i = 0; i < count; ++i, dest += 4) - stbi__copyval(packet->channel, dest, value); - left -= count; - } - } break; - - case 2: { // Mixed RLE - int left = width; - while (left > 0) { - int count = stbi__get8(s), i; - if (stbi__at_eof(s)) - return stbi__errpuc("bad file", - "file too short (mixed read count)"); - - if (count >= 128) { // Repeated - stbi_uc value[4]; - - if (count == 128) - count = stbi__get16be(s); - else - count -= 127; - if (count > left) - return stbi__errpuc("bad file", "scanline overrun"); - - if (!stbi__readval(s, packet->channel, value)) - 
return 0; - - for (i = 0; i < count; ++i, dest += 4) - stbi__copyval(packet->channel, dest, value); - } else { // Raw - ++count; - if (count > left) - return stbi__errpuc("bad file", "scanline overrun"); - - for (i = 0; i < count; ++i, dest += 4) - if (!stbi__readval(s, packet->channel, dest)) - return 0; - } - left -= count; - } - break; - } - } - } - } - - return result; -} - -static void *stbi__pic_load(stbi__context *s, int *px, int *py, int *comp, - int req_comp, stbi__result_info *ri) { - stbi_uc *result; - int i, x, y, internal_comp; - STBI_NOTUSED(ri); - - if (!comp) - comp = &internal_comp; - - for (i = 0; i < 92; ++i) - stbi__get8(s); - - x = stbi__get16be(s); - y = stbi__get16be(s); - if (stbi__at_eof(s)) - return stbi__errpuc("bad file", "file too short (pic header)"); - if (!stbi__mad3sizes_valid(x, y, 4, 0)) - return stbi__errpuc("too large", "PIC image too large to decode"); - - stbi__get32be(s); // skip `ratio' - stbi__get16be(s); // skip `fields' - stbi__get16be(s); // skip `pad' - - // intermediate buffer is RGBA - result = (stbi_uc *)stbi__malloc_mad3(x, y, 4, 0); - memset(result, 0xff, x * y * 4); - - if (!stbi__pic_load_core(s, x, y, comp, result)) { - STBI_FREE(result); - result = 0; - } - *px = x; - *py = y; - if (req_comp == 0) - req_comp = *comp; - result = stbi__convert_format(result, 4, req_comp, x, y); - - return result; -} - -static int stbi__pic_test(stbi__context *s) { - int r = stbi__pic_test_core(s); - stbi__rewind(s); - return r; -} -#endif - -// ************************************************************************************************* -// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb - -#ifndef STBI_NO_GIF -typedef struct { - stbi__int16 prefix; - stbi_uc first; - stbi_uc suffix; -} stbi__gif_lzw; - -typedef struct { - int w, h; - stbi_uc *out; // output buffer (always 4 components) - stbi_uc *background; // The current "background" as far as a gif is concerned - stbi_uc *history; - int flags, 
bgindex, ratio, transparent, eflags; - stbi_uc pal[256][4]; - stbi_uc lpal[256][4]; - stbi__gif_lzw codes[8192]; - stbi_uc *color_table; - int parse, step; - int lflags; - int start_x, start_y; - int max_x, max_y; - int cur_x, cur_y; - int line_size; - int delay; -} stbi__gif; - -static int stbi__gif_test_raw(stbi__context *s) { - int sz; - if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || - stbi__get8(s) != '8') - return 0; - sz = stbi__get8(s); - if (sz != '9' && sz != '7') - return 0; - if (stbi__get8(s) != 'a') - return 0; - return 1; -} - -static int stbi__gif_test(stbi__context *s) { - int r = stbi__gif_test_raw(s); - stbi__rewind(s); - return r; -} - -static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], - int num_entries, int transp) { - int i; - for (i = 0; i < num_entries; ++i) { - pal[i][2] = stbi__get8(s); - pal[i][1] = stbi__get8(s); - pal[i][0] = stbi__get8(s); - pal[i][3] = transp == i ? 0 : 255; - } -} - -static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, - int is_info) { - stbi_uc version; - if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || - stbi__get8(s) != '8') - return stbi__err("not GIF", "Corrupt GIF"); - - version = stbi__get8(s); - if (version != '7' && version != '9') - return stbi__err("not GIF", "Corrupt GIF"); - if (stbi__get8(s) != 'a') - return stbi__err("not GIF", "Corrupt GIF"); - - stbi__g_failure_reason = ""; - g->w = stbi__get16le(s); - g->h = stbi__get16le(s); - g->flags = stbi__get8(s); - g->bgindex = stbi__get8(s); - g->ratio = stbi__get8(s); - g->transparent = -1; - - if (comp != 0) - *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the - // comments - - if (is_info) - return 1; - - if (g->flags & 0x80) - stbi__gif_parse_colortable(s, g->pal, 2 << (g->flags & 7), -1); - - return 1; -} - -static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) { - stbi__gif *g = (stbi__gif 
*)stbi__malloc(sizeof(stbi__gif)); - if (!stbi__gif_header(s, g, comp, 1)) { - STBI_FREE(g); - stbi__rewind(s); - return 0; - } - if (x) - *x = g->w; - if (y) - *y = g->h; - STBI_FREE(g); - return 1; -} - -static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) { - stbi_uc *p, *c; - int idx; - - // recurse to decode the prefixes, since the linked-list is backwards, - // and working backwards through an interleaved image would be nasty - if (g->codes[code].prefix >= 0) - stbi__out_gif_code(g, g->codes[code].prefix); - - if (g->cur_y >= g->max_y) - return; - - idx = g->cur_x + g->cur_y; - p = &g->out[idx]; - g->history[idx / 4] = 1; - - c = &g->color_table[g->codes[code].suffix * 4]; - if (c[3] > 128) { // don't render transparent pixels; - p[0] = c[2]; - p[1] = c[1]; - p[2] = c[0]; - p[3] = c[3]; - } - g->cur_x += 4; - - if (g->cur_x >= g->max_x) { - g->cur_x = g->start_x; - g->cur_y += g->step; - - while (g->cur_y >= g->max_y && g->parse > 0) { - g->step = (1 << g->parse) * g->line_size; - g->cur_y = g->start_y + (g->step >> 1); - --g->parse; - } - } -} - -static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) { - stbi_uc lzw_cs; - stbi__int32 len, init_code; - stbi__uint32 first; - stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; - stbi__gif_lzw *p; - - lzw_cs = stbi__get8(s); - if (lzw_cs > 12) - return NULL; - clear = 1 << lzw_cs; - first = 1; - codesize = lzw_cs + 1; - codemask = (1 << codesize) - 1; - bits = 0; - valid_bits = 0; - for (init_code = 0; init_code < clear; init_code++) { - g->codes[init_code].prefix = -1; - g->codes[init_code].first = (stbi_uc)init_code; - g->codes[init_code].suffix = (stbi_uc)init_code; - } - - // support no starting clear code - avail = clear + 2; - oldcode = -1; - - len = 0; - for (;;) { - if (valid_bits < codesize) { - if (len == 0) { - len = stbi__get8(s); // start new block - if (len == 0) - return g->out; - } - --len; - bits |= (stbi__int32)stbi__get8(s) << valid_bits; - 
valid_bits += 8; - } else { - stbi__int32 code = bits & codemask; - bits >>= codesize; - valid_bits -= codesize; - // @OPTIMIZE: is there some way we can accelerate the non-clear path? - if (code == clear) { // clear code - codesize = lzw_cs + 1; - codemask = (1 << codesize) - 1; - avail = clear + 2; - oldcode = -1; - first = 0; - } else if (code == clear + 1) { // end of stream code - stbi__skip(s, len); - while ((len = stbi__get8(s)) > 0) - stbi__skip(s, len); - return g->out; - } else if (code <= avail) { - if (first) { - return stbi__errpuc("no clear code", "Corrupt GIF"); - } - - if (oldcode >= 0) { - p = &g->codes[avail++]; - if (avail > 8192) { - return stbi__errpuc("too many codes", "Corrupt GIF"); - } - - p->prefix = (stbi__int16)oldcode; - p->first = g->codes[oldcode].first; - p->suffix = (code == avail) ? p->first : g->codes[code].first; - } else if (code == avail) - return stbi__errpuc("illegal code in raster", "Corrupt GIF"); - - stbi__out_gif_code(g, (stbi__uint16)code); - - if ((avail & codemask) == 0 && avail <= 0x0FFF) { - codesize++; - codemask = (1 << codesize) - 1; - } - - oldcode = code; - } else { - return stbi__errpuc("illegal code in raster", "Corrupt GIF"); - } - } - } -} - -// this function is designed to support animated gifs, although stb_image -// doesn't support it two back is the image from two frames ago, used for a very -// specific disposal format -static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, - int req_comp, stbi_uc *two_back) { - int dispose; - int first_frame; - int pi; - int pcount; - STBI_NOTUSED(req_comp); - - // on first frame, any non-written pixels get the background colour - // (non-transparent) - first_frame = 0; - if (g->out == 0) { - if (!stbi__gif_header(s, g, comp, 0)) - return 0; // stbi__g_failure_reason set by stbi__gif_header - if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) - return stbi__errpuc("too large", "GIF image is too large"); - pcount = g->w * g->h; - g->out = (stbi_uc 
*)stbi__malloc(4 * pcount); - g->background = (stbi_uc *)stbi__malloc(4 * pcount); - g->history = (stbi_uc *)stbi__malloc(pcount); - if (!g->out || !g->background || !g->history) - return stbi__errpuc("outofmem", "Out of memory"); - - // image is treated as "transparent" at the start - ie, nothing overwrites - // the current background; background colour is only used for pixels that - // are not rendered first frame, after that "background" color refers to the - // color that was there the previous frame. - memset(g->out, 0x00, 4 * pcount); - memset(g->background, 0x00, - 4 * pcount); // state of the background (starts transparent) - memset(g->history, 0x00, - pcount); // pixels that were affected previous frame - first_frame = 1; - } else { - // second frame - how do we dispoase of the previous one? - dispose = (g->eflags & 0x1C) >> 2; - pcount = g->w * g->h; - - if ((dispose == 3) && (two_back == 0)) { - dispose = 2; // if I don't have an image to revert back to, default to the - // old background - } - - if (dispose == 3) { // use previous graphic - for (pi = 0; pi < pcount; ++pi) { - if (g->history[pi]) { - memcpy(&g->out[pi * 4], &two_back[pi * 4], 4); - } - } - } else if (dispose == 2) { - // restore what was changed last frame to background before that frame; - for (pi = 0; pi < pcount; ++pi) { - if (g->history[pi]) { - memcpy(&g->out[pi * 4], &g->background[pi * 4], 4); - } - } - } else { - // This is a non-disposal case eithe way, so just - // leave the pixels as is, and they will become the new background - // 1: do not dispose - // 0: not specified. 
- } - - // background is what out is after the undoing of the previou frame; - memcpy(g->background, g->out, 4 * g->w * g->h); - } - - // clear my history; - memset(g->history, 0x00, - g->w * g->h); // pixels that were affected previous frame - - for (;;) { - int tag = stbi__get8(s); - switch (tag) { - case 0x2C: /* Image Descriptor */ - { - stbi__int32 x, y, w, h; - stbi_uc *o; - - x = stbi__get16le(s); - y = stbi__get16le(s); - w = stbi__get16le(s); - h = stbi__get16le(s); - if (((x + w) > (g->w)) || ((y + h) > (g->h))) - return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); - - g->line_size = g->w * 4; - g->start_x = x * 4; - g->start_y = y * g->line_size; - g->max_x = g->start_x + w * 4; - g->max_y = g->start_y + h * g->line_size; - g->cur_x = g->start_x; - g->cur_y = g->start_y; - - // if the width of the specified rectangle is 0, that means - // we may not see *any* pixels or the image is malformed; - // to make sure this is caught, move the current y down to - // max_y (which is what out_gif_code checks). - if (w == 0) - g->cur_y = g->max_y; - - g->lflags = stbi__get8(s); - - if (g->lflags & 0x40) { - g->step = 8 * g->line_size; // first interlaced spacing - g->parse = 3; - } else { - g->step = g->line_size; - g->parse = 0; - } - - if (g->lflags & 0x80) { - stbi__gif_parse_colortable(s, g->lpal, 2 << (g->lflags & 7), - g->eflags & 0x01 ? 
g->transparent : -1); - g->color_table = (stbi_uc *)g->lpal; - } else if (g->flags & 0x80) { - g->color_table = (stbi_uc *)g->pal; - } else - return stbi__errpuc("missing color table", "Corrupt GIF"); - - o = stbi__process_gif_raster(s, g); - if (!o) - return NULL; - - // if this was the first frame, - pcount = g->w * g->h; - if (first_frame && (g->bgindex > 0)) { - // if first frame, any pixel not drawn to gets the background color - for (pi = 0; pi < pcount; ++pi) { - if (g->history[pi] == 0) { - g->pal[g->bgindex][3] = - 255; // just in case it was made transparent, undo that; It will - // be reset next frame if need be; - memcpy(&g->out[pi * 4], &g->pal[g->bgindex], 4); - } - } - } - - return o; - } - - case 0x21: // Comment Extension. - { - int len; - int ext = stbi__get8(s); - if (ext == 0xF9) { // Graphic Control Extension. - len = stbi__get8(s); - if (len == 4) { - g->eflags = stbi__get8(s); - g->delay = - 10 * stbi__get16le( - s); // delay - 1/100th of a second, saving as 1/1000ths. 
- - // unset old transparent - if (g->transparent >= 0) { - g->pal[g->transparent][3] = 255; - } - if (g->eflags & 0x01) { - g->transparent = stbi__get8(s); - if (g->transparent >= 0) { - g->pal[g->transparent][3] = 0; - } - } else { - // don't need transparent - stbi__skip(s, 1); - g->transparent = -1; - } - } else { - stbi__skip(s, len); - break; - } - } - while ((len = stbi__get8(s)) != 0) { - stbi__skip(s, len); - } - break; - } - - case 0x3B: // gif stream termination code - return (stbi_uc *)s; // using '1' causes warning on some compilers - - default: - return stbi__errpuc("unknown code", "Corrupt GIF"); - } - } -} - -static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, - int *z, int *comp, int req_comp) { - if (stbi__gif_test(s)) { - int layers = 0; - stbi_uc *u = 0; - stbi_uc *out = 0; - stbi_uc *two_back = 0; - stbi__gif g; - int stride; - memset(&g, 0, sizeof(g)); - if (delays) { - *delays = 0; - } - - do { - u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); - if (u == (stbi_uc *)s) - u = 0; // end of animated gif marker - - if (u) { - *x = g.w; - *y = g.h; - ++layers; - stride = g.w * g.h * 4; - - if (out) { - out = (stbi_uc *)STBI_REALLOC(out, layers * stride); - if (delays) { - *delays = (int *)STBI_REALLOC(*delays, sizeof(int) * layers); - } - } else { - out = (stbi_uc *)stbi__malloc(layers * stride); - if (delays) { - *delays = (int *)stbi__malloc(layers * sizeof(int)); - } - } - memcpy(out + ((layers - 1) * stride), u, stride); - if (layers >= 2) { - two_back = out - 2 * stride; - } - - if (delays) { - (*delays)[layers - 1U] = g.delay; - } - } - } while (u != 0); - - // free temp buffer; - STBI_FREE(g.out); - STBI_FREE(g.history); - STBI_FREE(g.background); - - // do the final conversion after loading everything; - if (req_comp && req_comp != 4) - out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); - - *z = layers; - return out; - } else { - return stbi__errpuc("not GIF", "Image was not as a gif 
type."); - } -} - -static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - stbi_uc *u = 0; - stbi__gif g; - memset(&g, 0, sizeof(g)); - STBI_NOTUSED(ri); - - u = stbi__gif_load_next(s, &g, comp, req_comp, 0); - if (u == (stbi_uc *)s) - u = 0; // end of animated gif marker - if (u) { - *x = g.w; - *y = g.h; - - // moved conversion to after successful load so that the same - // can be done for multiple frames. - if (req_comp && req_comp != 4) - u = stbi__convert_format(u, 4, req_comp, g.w, g.h); - } else if (g.out) { - // if there was an error and we allocated an image buffer, free it! - STBI_FREE(g.out); - } - - // free buffers needed for multiple frame loading; - STBI_FREE(g.history); - STBI_FREE(g.background); - - return u; -} - -static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) { - return stbi__gif_info_raw(s, x, y, comp); -} -#endif - -// ************************************************************************************************* -// Radiance RGBE HDR loader -// originally by Nicolas Schulz -#ifndef STBI_NO_HDR -static int stbi__hdr_test_core(stbi__context *s, const char *signature) { - int i; - for (i = 0; signature[i]; ++i) - if (stbi__get8(s) != signature[i]) - return 0; - stbi__rewind(s); - return 1; -} - -static int stbi__hdr_test(stbi__context *s) { - int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); - stbi__rewind(s); - if (!r) { - r = stbi__hdr_test_core(s, "#?RGBE\n"); - stbi__rewind(s); - } - return r; -} - -#define STBI__HDR_BUFLEN 1024 -static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) { - int len = 0; - char c = '\0'; - - c = (char)stbi__get8(z); - - while (!stbi__at_eof(z) && c != '\n') { - buffer[len++] = c; - if (len == STBI__HDR_BUFLEN - 1) { - // flush to end of line - while (!stbi__at_eof(z) && stbi__get8(z) != '\n') - ; - break; - } - c = (char)stbi__get8(z); - } - - buffer[len] = 0; - return buffer; -} - -static void stbi__hdr_convert(float 
*output, stbi_uc *input, int req_comp) { - if (input[3] != 0) { - float f1; - // Exponent - f1 = (float)ldexp(1.0f, input[3] - (int)(128 + 8)); - if (req_comp <= 2) - output[0] = (input[0] + input[1] + input[2]) * f1 / 3; - else { - output[0] = input[0] * f1; - output[1] = input[1] * f1; - output[2] = input[2] * f1; - } - if (req_comp == 2) - output[1] = 1; - if (req_comp == 4) - output[3] = 1; - } else { - switch (req_comp) { - case 4: - output[3] = 1; /* fallthrough */ - case 3: - output[0] = output[1] = output[2] = 0; - break; - case 2: - output[1] = 1; /* fallthrough */ - case 1: - output[0] = 0; - break; - } - } -} - -static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - char buffer[STBI__HDR_BUFLEN]; - char *token; - int valid = 0; - int width, height; - stbi_uc *scanline; - float *hdr_data; - int len; - unsigned char count, value; - int i, j, k, c1, c2, z; - const char *headerToken; - STBI_NOTUSED(ri); - - // Check identifier - headerToken = stbi__hdr_gettoken(s, buffer); - if (strcmp(headerToken, "#?RADIANCE") != 0 && - strcmp(headerToken, "#?RGBE") != 0) - return stbi__errpf("not HDR", "Corrupt HDR image"); - - // Parse header - for (;;) { - token = stbi__hdr_gettoken(s, buffer); - if (token[0] == 0) - break; - if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) - valid = 1; - } - - if (!valid) - return stbi__errpf("unsupported format", "Unsupported HDR format"); - - // Parse width and height - // can't use sscanf() if we're not using stdio! 
- token = stbi__hdr_gettoken(s, buffer); - if (strncmp(token, "-Y ", 3)) - return stbi__errpf("unsupported data layout", "Unsupported HDR format"); - token += 3; - height = (int)strtol(token, &token, 10); - while (*token == ' ') - ++token; - if (strncmp(token, "+X ", 3)) - return stbi__errpf("unsupported data layout", "Unsupported HDR format"); - token += 3; - width = (int)strtol(token, NULL, 10); - - *x = width; - *y = height; - - if (comp) - *comp = 3; - if (req_comp == 0) - req_comp = 3; - - if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) - return stbi__errpf("too large", "HDR image is too large"); - - // Read data - hdr_data = - (float *)stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); - if (!hdr_data) - return stbi__errpf("outofmem", "Out of memory"); - - // Load image data - // image data is stored as some number of sca - if (width < 8 || width >= 32768) { - // Read flat data - for (j = 0; j < height; ++j) { - for (i = 0; i < width; ++i) { - stbi_uc rgbe[4]; - main_decode_loop: - stbi__getn(s, rgbe, 4); - stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, - req_comp); - } - } - } else { - // Read RLE-encoded data - scanline = NULL; - - for (j = 0; j < height; ++j) { - c1 = stbi__get8(s); - c2 = stbi__get8(s); - len = stbi__get8(s); - if (c1 != 2 || c2 != 2 || (len & 0x80)) { - // not run-length encoded, so we have to actually use THIS data as a - // decoded pixel (note this can't be a valid pixel--one of RGB must be - // >= 128) - stbi_uc rgbe[4]; - rgbe[0] = (stbi_uc)c1; - rgbe[1] = (stbi_uc)c2; - rgbe[2] = (stbi_uc)len; - rgbe[3] = (stbi_uc)stbi__get8(s); - stbi__hdr_convert(hdr_data, rgbe, req_comp); - i = 1; - j = 0; - STBI_FREE(scanline); - goto main_decode_loop; // yes, this makes no sense - } - len <<= 8; - len |= stbi__get8(s); - if (len != width) { - STBI_FREE(hdr_data); - STBI_FREE(scanline); - return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); - } - if (scanline == NULL) { 
- scanline = (stbi_uc *)stbi__malloc_mad2(width, 4, 0); - if (!scanline) { - STBI_FREE(hdr_data); - return stbi__errpf("outofmem", "Out of memory"); - } - } - - for (k = 0; k < 4; ++k) { - int nleft; - i = 0; - while ((nleft = width - i) > 0) { - count = stbi__get8(s); - if (count > 128) { - // Run - value = stbi__get8(s); - count -= 128; - if (count > nleft) { - STBI_FREE(hdr_data); - STBI_FREE(scanline); - return stbi__errpf("corrupt", "bad RLE data in HDR"); - } - for (z = 0; z < count; ++z) - scanline[i++ * 4 + k] = value; - } else { - // Dump - if (count > nleft) { - STBI_FREE(hdr_data); - STBI_FREE(scanline); - return stbi__errpf("corrupt", "bad RLE data in HDR"); - } - for (z = 0; z < count; ++z) - scanline[i++ * 4 + k] = stbi__get8(s); - } - } - } - for (i = 0; i < width; ++i) - stbi__hdr_convert(hdr_data + (j * width + i) * req_comp, - scanline + i * 4, req_comp); - } - if (scanline) - STBI_FREE(scanline); - } - - return hdr_data; -} - -static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) { - char buffer[STBI__HDR_BUFLEN]; - char *token; - int valid = 0; - int dummy; - - if (!x) - x = &dummy; - if (!y) - y = &dummy; - if (!comp) - comp = &dummy; - - if (stbi__hdr_test(s) == 0) { - stbi__rewind(s); - return 0; - } - - for (;;) { - token = stbi__hdr_gettoken(s, buffer); - if (token[0] == 0) - break; - if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) - valid = 1; - } - - if (!valid) { - stbi__rewind(s); - return 0; - } - token = stbi__hdr_gettoken(s, buffer); - if (strncmp(token, "-Y ", 3)) { - stbi__rewind(s); - return 0; - } - token += 3; - *y = (int)strtol(token, &token, 10); - while (*token == ' ') - ++token; - if (strncmp(token, "+X ", 3)) { - stbi__rewind(s); - return 0; - } - token += 3; - *x = (int)strtol(token, NULL, 10); - *comp = 3; - return 1; -} -#endif // STBI_NO_HDR - -#ifndef STBI_NO_BMP -static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) { - void *p; - stbi__bmp_data info; - - info.all_a = 255; - p = 
stbi__bmp_parse_header(s, &info); - stbi__rewind(s); - if (p == NULL) - return 0; - if (x) - *x = s->img_x; - if (y) - *y = s->img_y; - if (comp) { - if (info.bpp == 24 && info.ma == 0xff000000) - *comp = 3; - else - *comp = info.ma ? 4 : 3; - } - return 1; -} -#endif - -#ifndef STBI_NO_PSD -static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) { - int channelCount, dummy, depth; - if (!x) - x = &dummy; - if (!y) - y = &dummy; - if (!comp) - comp = &dummy; - if (stbi__get32be(s) != 0x38425053) { - stbi__rewind(s); - return 0; - } - if (stbi__get16be(s) != 1) { - stbi__rewind(s); - return 0; - } - stbi__skip(s, 6); - channelCount = stbi__get16be(s); - if (channelCount < 0 || channelCount > 16) { - stbi__rewind(s); - return 0; - } - *y = stbi__get32be(s); - *x = stbi__get32be(s); - depth = stbi__get16be(s); - if (depth != 8 && depth != 16) { - stbi__rewind(s); - return 0; - } - if (stbi__get16be(s) != 3) { - stbi__rewind(s); - return 0; - } - *comp = 4; - return 1; -} - -static int stbi__psd_is16(stbi__context *s) { - int channelCount, depth; - if (stbi__get32be(s) != 0x38425053) { - stbi__rewind(s); - return 0; - } - if (stbi__get16be(s) != 1) { - stbi__rewind(s); - return 0; - } - stbi__skip(s, 6); - channelCount = stbi__get16be(s); - if (channelCount < 0 || channelCount > 16) { - stbi__rewind(s); - return 0; - } - (void)stbi__get32be(s); - (void)stbi__get32be(s); - depth = stbi__get16be(s); - if (depth != 16) { - stbi__rewind(s); - return 0; - } - return 1; -} -#endif - -#ifndef STBI_NO_PIC -static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) { - int act_comp = 0, num_packets = 0, chained, dummy; - stbi__pic_packet packets[10]; - - if (!x) - x = &dummy; - if (!y) - y = &dummy; - if (!comp) - comp = &dummy; - - if (!stbi__pic_is4(s, "\x53\x80\xF6\x34")) { - stbi__rewind(s); - return 0; - } - - stbi__skip(s, 88); - - *x = stbi__get16be(s); - *y = stbi__get16be(s); - if (stbi__at_eof(s)) { - stbi__rewind(s); - return 0; - } - if 
((*x) != 0 && (1 << 28) / (*x) < (*y)) { - stbi__rewind(s); - return 0; - } - - stbi__skip(s, 8); - - do { - stbi__pic_packet *packet; - - if (num_packets == sizeof(packets) / sizeof(packets[0])) - return 0; - - packet = &packets[num_packets++]; - chained = stbi__get8(s); - packet->size = stbi__get8(s); - packet->type = stbi__get8(s); - packet->channel = stbi__get8(s); - act_comp |= packet->channel; - - if (stbi__at_eof(s)) { - stbi__rewind(s); - return 0; - } - if (packet->size != 8) { - stbi__rewind(s); - return 0; - } - } while (chained); - - *comp = (act_comp & 0x10 ? 4 : 3); - - return 1; -} -#endif - -// ************************************************************************************************* -// Portable Gray Map and Portable Pixel Map loader -// by Ken Miller -// -// PGM: http://netpbm.sourceforge.net/doc/pgm.html -// PPM: http://netpbm.sourceforge.net/doc/ppm.html -// -// Known limitations: -// Does not support comments in the header section -// Does not support ASCII image data (formats P2 and P3) -// Does not support 16-bit-per-channel - -#ifndef STBI_NO_PNM - -static int stbi__pnm_test(stbi__context *s) { - char p, t; - p = (char)stbi__get8(s); - t = (char)stbi__get8(s); - if (p != 'P' || (t != '5' && t != '6')) { - stbi__rewind(s); - return 0; - } - return 1; -} - -static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, - int req_comp, stbi__result_info *ri) { - stbi_uc *out; - STBI_NOTUSED(ri); - - if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n)) - return 0; - - *x = s->img_x; - *y = s->img_y; - if (comp) - *comp = s->img_n; - - if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) - return stbi__errpuc("too large", "PNM too large"); - - out = (stbi_uc *)stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); - if (!out) - return stbi__errpuc("outofmem", "Out of memory"); - stbi__getn(s, out, s->img_n * s->img_x * s->img_y); - - if (req_comp && req_comp != s->img_n) { - out = 
stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); - if (out == NULL) - return out; // stbi__convert_format frees input on failure - } - return out; -} - -static int stbi__pnm_isspace(char c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || - c == '\r'; -} - -static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) { - for (;;) { - while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) - *c = (char)stbi__get8(s); - - if (stbi__at_eof(s) || *c != '#') - break; - - while (!stbi__at_eof(s) && *c != '\n' && *c != '\r') - *c = (char)stbi__get8(s); - } -} - -static int stbi__pnm_isdigit(char c) { return c >= '0' && c <= '9'; } - -static int stbi__pnm_getinteger(stbi__context *s, char *c) { - int value = 0; - - while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { - value = value * 10 + (*c - '0'); - *c = (char)stbi__get8(s); - } - - return value; -} - -static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) { - int maxv, dummy; - char c, p, t; - - if (!x) - x = &dummy; - if (!y) - y = &dummy; - if (!comp) - comp = &dummy; - - stbi__rewind(s); - - // Get identifier - p = (char)stbi__get8(s); - t = (char)stbi__get8(s); - if (p != 'P' || (t != '5' && t != '6')) { - stbi__rewind(s); - return 0; - } - - *comp = - (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm - - c = (char)stbi__get8(s); - stbi__pnm_skip_whitespace(s, &c); - - *x = stbi__pnm_getinteger(s, &c); // read width - stbi__pnm_skip_whitespace(s, &c); - - *y = stbi__pnm_getinteger(s, &c); // read height - stbi__pnm_skip_whitespace(s, &c); - - maxv = stbi__pnm_getinteger(s, &c); // read max value - - if (maxv > 255) - return stbi__err("max value > 255", "PPM image not 8-bit"); - else - return 1; -} -#endif - -static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) { -#ifndef STBI_NO_JPEG - if (stbi__jpeg_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_PNG - if (stbi__png_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_GIF - if (stbi__gif_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_BMP - if (stbi__bmp_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_PSD - if (stbi__psd_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_PIC - if (stbi__pic_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_PNM - if (stbi__pnm_info(s, x, y, comp)) - return 1; -#endif - -#ifndef STBI_NO_HDR - if (stbi__hdr_info(s, x, y, comp)) - return 1; -#endif - -// test tga last because it's a crappy test! 
-#ifndef STBI_NO_TGA - if (stbi__tga_info(s, x, y, comp)) - return 1; -#endif - return stbi__err("unknown image type", - "Image not of any known type, or corrupt"); -} - -static int stbi__is_16_main(stbi__context *s) { -#ifndef STBI_NO_PNG - if (stbi__png_is16(s)) - return 1; -#endif - -#ifndef STBI_NO_PSD - if (stbi__psd_is16(s)) - return 1; -#endif - - return 0; -} - -#ifndef STBI_NO_STDIO -STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) { - FILE *f = stbi__fopen(filename, "rb"); - int result; - if (!f) - return stbi__err("can't fopen", "Unable to open file"); - result = stbi_info_from_file(f, x, y, comp); - fclose(f); - return result; -} - -STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) { - int r; - stbi__context s; - long pos = ftell(f); - stbi__start_file(&s, f); - r = stbi__info_main(&s, x, y, comp); - fseek(f, pos, SEEK_SET); - return r; -} - -STBIDEF int stbi_is_16_bit(char const *filename) { - FILE *f = stbi__fopen(filename, "rb"); - int result; - if (!f) - return stbi__err("can't fopen", "Unable to open file"); - result = stbi_is_16_bit_from_file(f); - fclose(f); - return result; -} - -STBIDEF int stbi_is_16_bit_from_file(FILE *f) { - int r; - stbi__context s; - long pos = ftell(f); - stbi__start_file(&s, f); - r = stbi__is_16_main(&s); - fseek(f, pos, SEEK_SET); - return r; -} -#endif // !STBI_NO_STDIO - -STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, - int *y, int *comp) { - stbi__context s; - stbi__start_mem(&s, buffer, len); - return stbi__info_main(&s, x, y, comp); -} - -STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, - int *x, int *y, int *comp) { - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user); - return stbi__info_main(&s, x, y, comp); -} - -STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) { - stbi__context s; - stbi__start_mem(&s, buffer, len); - return stbi__is_16_main(&s); -} - -STBIDEF int 
stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, - void *user) { - stbi__context s; - stbi__start_callbacks(&s, (stbi_io_callbacks *)c, user); - return stbi__is_16_main(&s); -} - -#endif // STB_IMAGE_IMPLEMENTATION - -/* - revision history: - 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and - platform ifdefs 2.19 (2018-02-11) fix warning 2.18 (2018-01-30) fix - warnings 2.17 (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug - 1-bit BMP - *_is_16_bit api - avoid warnings - 2.16 (2017-07-23) all functions have 16-bit variants; - STBI_NO_STDIO works again; - compilation fixes; - fix rounding in unpremultiply; - optimize vertical flip; - disable raw_len validation; - documentation fixes - 2.15 (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode; - warning fixes; disable run-time SSE detection on gcc; - uniform handling of optional "return" values; - thread-safe initialization of zlib tables - 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet - JPGs 2.13 (2016-11-29) add 16-bit API, only supported for PNG right now 2.12 - (2016-04-02) fix typo in 2.11 PSD fix that caused crashes 2.11 (2016-04-02) - allocate large structures on the stack remove white matting for transparent - PSD fix reported channel count for PNG & BMP re-enable SSE2 in non-gcc 64-bit - support RGB-formatted JPEG - read 16-bit PNGs (only as 8-bit) - 2.10 (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED - 2.09 (2016-01-16) allow comments in PNM files - 16-bit-per-pixel TGA (not bit-per-component) - info() for TGA could break due to .hdr handling - info() for BMP to shares code instead of sloppy parse - can use STBI_REALLOC_SIZED if allocator doesn't support - realloc code cleanup 2.08 (2015-09-13) fix to 2.07 cleanup, reading RGB PSD - as RGBA 2.07 (2015-09-13) fix compiler warnings partial animated GIF support - limited 16-bpc PSD support - #ifdef unused functions - bug with < 92 byte PIC,PNM,HDR,TGA - 2.06 
(2015-04-19) fix bug where PSD returns wrong '*comp' value - 2.05 (2015-04-19) fix bug in progressive JPEG handling, fix warning - 2.04 (2015-04-15) try to re-enable SIMD on MinGW 64-bit - 2.03 (2015-04-12) extra corruption checking (mmozeiko) - stbi_set_flip_vertically_on_load (nguillemot) - fix NEON support; fix mingw support - 2.02 (2015-01-19) fix incorrect assert, fix warning - 2.01 (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit - without -msse2 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG 2.00 - (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) progressive - JPEG (stb) PGM/PPM support (Ken Miller) STBI_MALLOC,STBI_REALLOC,STBI_FREE - GIF bugfix -- seemingly never worked - STBI_NO_*, STBI_ONLY_* - 1.48 (2014-12-14) fix incorrectly-named assert() - 1.47 (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar - Cornut & stb) optimize PNG (ryg) fix bug in interlaced PNG with - user-specified channel count (stb) 1.46 (2014-08-26) fix broken tRNS chunk - (colorkey-style transparency) in non-paletted PNG 1.45 (2014-08-16) fix - MSVC-ARM internal compiler error by wrapping malloc 1.44 (2014-08-07) - various warning fixes from Ronny Chevalier - 1.43 (2014-07-15) - fix MSVC-only compiler problem in code changed in 1.42 - 1.42 (2014-07-09) - don't define _CRT_SECURE_NO_WARNINGS (affects user code) - fixes to stbi__cleanup_jpeg path - added STBI_ASSERT to avoid requiring assert.h - 1.41 (2014-06-25) - fix search&replace from 1.36 that messed up comments/error - messages 1.40 (2014-06-22) fix gcc struct-initialization warning 1.39 - (2014-06-15) fix to TGA optimization when req_comp != number of components in - TGA; fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my - test suite) add support for BMP version 5 (more ignored fields) 1.38 - (2014-06-06) suppress MSVC warnings on integer casts truncating values fix - accidental rename of 'skip' field of I/O 1.37 (2014-06-04) remove duplicate - typedef 1.36 
(2014-06-03) convert to header file single-file library if - de-iphone isn't set, load iphone images color-swapped instead of returning - NULL 1.35 (2014-05-27) various warnings fix broken STBI_SIMD path fix bug - where stbi_load_from_file no longer left file pointer in correct place fix - broken non-easy path for 32-bit BMP (possibly never used) TGA optimization by - Arseny Kapoulkine 1.34 (unknown) use STBI_NOTUSED in - stbi__resample_row_generic(), fix one more leak in tga failure case 1.33 - (2011-07-14) make stbi_is_hdr work in STBI_NO_HDR (as specified), minor - compiler-friendly improvements 1.32 (2011-07-13) support for "info" function - for all supported filetypes (SpartanJ) 1.31 (2011-06-20) a few more leak - fixes, bug in PNG handling (SpartanJ) 1.30 (2011-06-11) added ability to - load files via callbacks to accomidate custom input streams (Ben Wenger) - removed deprecated format-specific test/load functions - removed support for installable file formats (stbi_loader) -- - would have been broken for IO callbacks anyway error cases in bmp and tga - give messages and don't leak (Raymond Barbiero, grisha) fix inefficiency in - decoding 32-bit BMP (David Woo) 1.29 (2010-08-16) various warning fixes from - Aurelien Pocheville 1.28 (2010-08-01) fix bug in GIF palette transparency - (SpartanJ) 1.27 (2010-08-01) cast-to-stbi_uc to fix warnings 1.26 - (2010-07-24) fix bug in file buffering for PNG reported by SpartanJ 1.25 - (2010-07-17) refix trans_data warning (Won Chun) 1.24 (2010-07-12) perf - improvements reading from files on platforms with lock-heavy fgetc() minor - perf improvements for jpeg deprecated type-specific functions so we'll get - feedback if they're needed attempt to fix trans_data warning (Won Chun) 1.23 - fixed bug in iPhone support 1.22 (2010-07-10) removed image *writing* - support stbi_info support from Jetro Lauha GIF support from Jean-Marc Lienher - iPhone PNG-extensions from James Brown - warning-fixes from Nicolas Schulz and Janez 
Zemva (i.stbi__err. - Janez (U+017D)emva) 1.21 fix use of 'stbi_uc' in header (reported by jon - blow) 1.20 added support for Softimage PIC, by Tom Seddon 1.19 bug in - interlaced PNG corruption check (found by ryg) 1.18 (2008-08-02) fix a - threading bug (local mutable static) 1.17 support interlaced PNG 1.16 - major bugfix - stbi__convert_format converted one too many pixels 1.15 - initialize some fields for thread safety 1.14 fix threadsafe conversion - bug header-file-only version (#define STBI_HEADER_FILE_ONLY before including) - 1.13 threadsafe - 1.12 const qualifiers in the API - 1.11 Support installable IDCT, colorspace conversion routines - 1.10 Fixes for 64-bit (don't use "unsigned long") - optimized upsampling by Fabian "ryg" Giesen - 1.09 Fix format-conversion for PSD code (bad global variables!) - 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz - 1.07 attempt to fix C++ warning/errors again - 1.06 attempt to fix C++ warning/errors again - 1.05 fix TGA loading to return correct *comp and use good luminance - calc 1.04 default float alpha is 1, not 255; use 'void *' for - stbi_image_free 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR 1.02 support - for (subset of) HDR files, float interface for preferred access to them 1.01 - fix bug: possible bug in handling right-side up bmps... 
not sure fix bug: the - stbi__bmp_load() and stbi__tga_load() functions didn't work at all 1.00 - interface to zlib that skips zlib header 0.99 correct handling of alpha in - palette 0.98 TGA loader by lonesock; dynamically add loaders (untested) - 0.97 jpeg errors on too large a file; also catch another malloc failure - 0.96 fix detection of invalid v value - particleman@mollyrocket forum - 0.95 during header scan, seek to markers in case of padding - 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same - 0.93 handle jpegtran output; verbose errors - 0.92 read 4,8,16,24,32-bit BMP files of several formats - 0.91 output 24-bit Windows 3.0 BMP files - 0.90 fix a few more warnings; bump version number to approach 1.0 - 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd - 0.60 fix compiling as c++ - 0.59 fix warnings: merge Dave Moore's -Wall fixes - 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian - 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but - less than 16 available 0.56 fix bug: zlib uncompressed mode len vs. nlen - 0.55 fix bug: restart_interval not initialized to 0 - 0.54 allow NULL for 'int *comp' - 0.53 fix bug in png 3->4; speedup png decoding - 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments - 0.51 obey req_comp requests, 1-component jpegs return as 1-component, - on 'test' only check type, not whether we support this variant - 0.50 (2006-11-19) - first released version -*/ - -/* ------------------------------------------------------------------------------- -This software is available under 2 licenses -- choose whichever you prefer. 
------------------------------------------------------------------------------- -ALTERNATIVE A - MIT License -Copyright (c) 2017 Sean Barrett -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. ------------------------------------------------------------------------------- -ALTERNATIVE B - Public Domain (www.unlicense.org) -This is free and unencumbered software released into the public domain. -Anyone is free to copy, modify, publish, use, compile, sell, or distribute this -software, either in source code form or as a compiled binary, for any purpose, -commercial or non-commercial, and by any means. -In jurisdictions that recognize copyright laws, the author or authors of this -software dedicate any and all copyright interest in the software to the public -domain. We make this dedication for the benefit of the public at large and to -the detriment of our heirs and successors. 
We intend this dedication to be an -overt act of relinquishment in perpetuity of all present and future rights to -this software under copyright law. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------- -*/ diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h deleted file mode 100644 index 84b84981b44876c35c9bb6cce1af402ec302c3eb..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/image/stb_image_write.h +++ /dev/null @@ -1,1933 +0,0 @@ -/* stb_image_write - v1.13 - public domain - http://nothings.org/stb - writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015 - no warranty implied; use at your own risk - - Before #including, - - #define STB_IMAGE_WRITE_IMPLEMENTATION - - in the file that you want to have the implementation. - - Will probably not work correctly with strict-aliasing optimizations. - -ABOUT: - - This header file is a library for writing images to C stdio or a callback. - - The PNG output is not optimal; it is 20-50% larger than the file - written by a decent optimizing implementation; though providing a custom - zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that. - This library is designed for source code compactness and simplicity, - not optimal image file size or run-time performance. - -BUILDING: - - You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h. 
- You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace - malloc,realloc,free. - You can #define STBIW_MEMMOVE() to replace memmove() - You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress -function for PNG compression (instead of the builtin one), it must have the -following signature: unsigned char * my_compress(unsigned char *data, int -data_len, int *out_len, int quality); The returned data will be freed with -STBIW_FREE() (free() by default), so it must be heap allocated with -STBIW_MALLOC() (malloc() by default), - -UNICODE: - - If compiling for Windows and you wish to use Unicode filenames, compile - with - #define STBIW_WINDOWS_UTF8 - and pass utf8-encoded filenames. Call stbiw_convert_wchar_to_utf8 to convert - Windows wchar_t filenames to utf8. - -USAGE: - - There are five functions, one for each image file format: - - int stbi_write_png(char const *filename, int w, int h, int comp, const void -*data, int stride_in_bytes); int stbi_write_bmp(char const *filename, int w, int -h, int comp, const void *data); int stbi_write_tga(char const *filename, int w, -int h, int comp, const void *data); int stbi_write_jpg(char const *filename, int -w, int h, int comp, const void *data, int quality); int stbi_write_hdr(char -const *filename, int w, int h, int comp, const float *data); - - void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip -data vertically - - There are also five equivalent functions that use an arbitrary write -function. 
You are expected to open/close your file-equivalent before and after -calling these: - - int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int -h, int comp, const void *data, int stride_in_bytes); int -stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int -comp, const void *data); int stbi_write_tga_to_func(stbi_write_func *func, void -*context, int w, int h, int comp, const void *data); int -stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int -comp, const float *data); int stbi_write_jpg_to_func(stbi_write_func *func, void -*context, int x, int y, int comp, const void *data, int quality); - - where the callback is: - void stbi_write_func(void *context, void *data, int size); - - You can configure it with these global variables: - int stbi_write_tga_with_rle; // defaults to true; set to 0 to -disable RLE int stbi_write_png_compression_level; // defaults to 8; set to -higher for more compression int stbi_write_force_png_filter; // defaults -to -1; set to 0..5 to force a filter mode - - - You can define STBI_WRITE_NO_STDIO to disable the file variant of these - functions, so the library will not use stdio.h at all. However, this will - also disable HDR writing, because it requires stdio for formatted output. - - Each function returns 0 on failure and non-0 on success. - - The functions create an image file defined by the parameters. The image - is a rectangle of pixels stored from left-to-right, top-to-bottom. - Each pixel contains 'comp' channels of data stored interleaved with 8-bits - per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is - monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall. - The *data pointer points to the first byte of the top-left-most pixel. - For PNG, "stride_in_bytes" is the distance in bytes from the first byte of - a row of pixels to the first byte of the next row of pixels. 
- - PNG creates output files with the same number of components as the input. - The BMP format expands Y to RGB in the file format and does not - output alpha. - - PNG supports writing rectangles of data even when the bytes storing rows of - data are not consecutive in memory (e.g. sub-rectangles of a larger image), - by supplying the stride between the beginning of adjacent rows. The other - formats do not. (Thus you cannot write a native-format BMP through the BMP - writer, both because it is in BGR order and because it may have padding - at the end of the line.) - - PNG allows you to set the deflate compression level by setting the global - variable 'stbi_write_png_compression_level' (it defaults to 8). - - HDR expects linear float data. Since the format is always 32-bit rgb(e) - data, alpha (if provided) is discarded, and for monochrome data it is - replicated across all three channels. - - TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed - data, set the global variable 'stbi_write_tga_with_rle' to 0. - - JPEG does ignore alpha channels in input data; quality is between 1 and 100. - Higher quality looks better but results in a bigger image. - JPEG baseline (no JPEG progressive). - -CREDITS: - - - Sean Barrett - PNG/BMP/TGA - Baldur Karlsson - HDR - Jean-Sebastien Guay - TGA monochrome - Tim Kelsey - misc enhancements - Alan Hickman - TGA RLE - Emmanuel Julien - initial file IO callback implementation - Jon Olick - original jo_jpeg.cpp code - Daniel Gibson - integrate JPEG, allow external zlib - Aarni Koskela - allow choosing PNG filter - - bugfixes: - github:Chribba - Guillaume Chereau - github:jry2 - github:romigrou - Sergio Gonzalez - Jonas Karlsson - Filip Wasil - Thatcher Ulrich - github:poppolopoppo - Patrick Boettcher - github:xeekworx - Cap Petschulat - Simon Rodriguez - Ivan Tikhonov - github:ignotion - Adam Schackart - -LICENSE - - See end of file for license information. 
- -*/ - -#ifndef INCLUDE_STB_IMAGE_WRITE_H -#define INCLUDE_STB_IMAGE_WRITE_H - -#include <stdlib.h> - -// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' -// or 'static inline' -#ifndef STBIWDEF -#ifdef STB_IMAGE_WRITE_STATIC -#define STBIWDEF static -#else -#ifdef __cplusplus -#define STBIWDEF extern "C" -#else -#define STBIWDEF extern -#endif -#endif -#endif - -#ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations -extern int stbi_write_tga_with_rle; -extern int stbi_write_png_compression_level; -extern int stbi_write_force_png_filter; -#endif - -#ifndef STBI_WRITE_NO_STDIO -STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, - const void *data, int stride_in_bytes); -STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, - const void *data); -STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, - const void *data); -STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, - const float *data); -STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, - const void *data, int quality); - -#ifdef STBI_WINDOWS_UTF8 -STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, - const wchar_t *input); -#endif -#endif - -typedef void stbi_write_func(void *context, void *data, int size); - -STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, - int h, int comp, const void *data, - int stride_in_bytes); -STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, - int h, int comp, const void *data); -STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, - int h, int comp, const void *data); -STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, - int h, int comp, const float *data); -STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, - int y, int comp, const void *data, - int 
quality); - -STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean); - -#endif // INCLUDE_STB_IMAGE_WRITE_H - -#ifdef STB_IMAGE_WRITE_IMPLEMENTATION - -#ifdef _WIN32 -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif -#ifndef _CRT_NONSTDC_NO_DEPRECATE -#define _CRT_NONSTDC_NO_DEPRECATE -#endif -#endif - -#ifndef STBI_WRITE_NO_STDIO -#include <stdio.h> -#endif // STBI_WRITE_NO_STDIO - -#include <math.h> -#include <stdarg.h> -#include <stdlib.h> -#include <string.h> - -#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && \ - (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED)) -// ok -#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && \ - !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED) -// ok -#else -#error \ - "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)." -#endif - -#ifndef STBIW_MALLOC -#define STBIW_MALLOC(sz) malloc(sz) -#define STBIW_REALLOC(p, newsz) realloc(p, newsz) -#define STBIW_FREE(p) free(p) -#endif - -#ifndef STBIW_REALLOC_SIZED -#define STBIW_REALLOC_SIZED(p, oldsz, newsz) STBIW_REALLOC(p, newsz) -#endif - -#ifndef STBIW_MEMMOVE -#define STBIW_MEMMOVE(a, b, sz) memmove(a, b, sz) -#endif - -#ifndef STBIW_ASSERT -#include <assert.h> -#define STBIW_ASSERT(x) assert(x) -#endif - -#define STBIW_UCHAR(x) (unsigned char)((x)&0xff) - -#ifdef STB_IMAGE_WRITE_STATIC -static int stbi__flip_vertically_on_write = 0; -static int stbi_write_png_compression_level = 8; -static int stbi_write_tga_with_rle = 1; -static int stbi_write_force_png_filter = -1; -#else -int stbi_write_png_compression_level = 8; -int stbi__flip_vertically_on_write = 0; -int stbi_write_tga_with_rle = 1; -int stbi_write_force_png_filter = -1; -#endif - -STBIWDEF void stbi_flip_vertically_on_write(int flag) { - stbi__flip_vertically_on_write = flag; -} - -typedef struct { - stbi_write_func *func; - void *context; -} stbi__write_context; - -// initialize a callback-based context -static void 
stbi__start_write_callbacks(stbi__write_context *s, - stbi_write_func *c, void *context) { - s->func = c; - s->context = context; -} - -#ifndef STBI_WRITE_NO_STDIO - -static void stbi__stdio_write(void *context, void *data, int size) { - fwrite(data, 1, size, (FILE *)context); -} - -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) -#ifdef __cplusplus -#define STBIW_EXTERN extern "C" -#else -#define STBIW_EXTERN extern -#endif -STBIW_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar( - unsigned int cp, unsigned long flags, const char *str, int cbmb, - wchar_t *widestr, int cchwide); -STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte( - unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, - char *str, int cbmb, const char *defchar, int *used_default); - -STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, - const wchar_t *input) { - return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, - (int)bufferlen, NULL, NULL); -} -#endif - -static FILE *stbiw__fopen(char const *filename, char const *mode) { - FILE *f; -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) - wchar_t wMode[64]; - wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, - sizeof(wFilename))) - return 0; - - if (0 == - MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode))) - return 0; - -#if _MSC_VER >= 1400 - if (0 != _wfopen_s(&f, wFilename, wMode)) - f = 0; -#else - f = _wfopen(wFilename, wMode); -#endif - -#elif defined(_MSC_VER) && _MSC_VER >= 1400 - if (0 != fopen_s(&f, filename, mode)) - f = 0; -#else - f = fopen(filename, mode); -#endif - return f; -} - -static int stbi__start_write_file(stbi__write_context *s, - const char *filename) { - FILE *f = stbiw__fopen(filename, "wb"); - stbi__start_write_callbacks(s, stbi__stdio_write, (void *)f); - return f != NULL; -} - -static void stbi__end_write_file(stbi__write_context *s) { - fclose((FILE 
*)s->context); -} - -#endif // !STBI_WRITE_NO_STDIO - -typedef unsigned int stbiw_uint32; -typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1]; - -static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) { - while (*fmt) { - switch (*fmt++) { - case ' ': - break; - case '1': { - unsigned char x = STBIW_UCHAR(va_arg(v, int)); - s->func(s->context, &x, 1); - break; - } - case '2': { - int x = va_arg(v, int); - unsigned char b[2]; - b[0] = STBIW_UCHAR(x); - b[1] = STBIW_UCHAR(x >> 8); - s->func(s->context, b, 2); - break; - } - case '4': { - stbiw_uint32 x = va_arg(v, int); - unsigned char b[4]; - b[0] = STBIW_UCHAR(x); - b[1] = STBIW_UCHAR(x >> 8); - b[2] = STBIW_UCHAR(x >> 16); - b[3] = STBIW_UCHAR(x >> 24); - s->func(s->context, b, 4); - break; - } - default: - STBIW_ASSERT(0); - return; - } - } -} - -static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) { - va_list v; - va_start(v, fmt); - stbiw__writefv(s, fmt, v); - va_end(v); -} - -static void stbiw__putc(stbi__write_context *s, unsigned char c) { - s->func(s->context, &c, 1); -} - -static void stbiw__write3(stbi__write_context *s, unsigned char a, - unsigned char b, unsigned char c) { - unsigned char arr[3]; - arr[0] = a; - arr[1] = b; - arr[2] = c; - s->func(s->context, arr, 3); -} - -static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, - int write_alpha, int expand_mono, - unsigned char *d) { - unsigned char bg[3] = {255, 0, 255}, px[3]; - int k; - - if (write_alpha < 0) - s->func(s->context, &d[comp - 1], 1); - - switch (comp) { - case 2: // 2 pixels = mono + alpha, alpha is written separately, so same as - // 1-channel case - case 1: - if (expand_mono) - stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp - else - s->func(s->context, d, 1); // monochrome TGA - break; - case 4: - if (!write_alpha) { - // composite against pink background - for (k = 0; k < 3; ++k) - px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; - 
stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]); - break; - } - /* FALLTHROUGH */ - case 3: - stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); - break; - } - if (write_alpha > 0) - s->func(s->context, &d[comp - 1], 1); -} - -static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, - int x, int y, int comp, void *data, - int write_alpha, int scanline_pad, - int expand_mono) { - stbiw_uint32 zero = 0; - int i, j, j_end; - - if (y <= 0) - return; - - if (stbi__flip_vertically_on_write) - vdir *= -1; - - if (vdir < 0) { - j_end = -1; - j = y - 1; - } else { - j_end = y; - j = 0; - } - - for (; j != j_end; j += vdir) { - for (i = 0; i < x; ++i) { - unsigned char *d = (unsigned char *)data + (j * x + i) * comp; - stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d); - } - s->func(s->context, &zero, scanline_pad); - } -} - -static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, - int y, int comp, int expand_mono, void *data, - int alpha, int pad, const char *fmt, ...) 
{ - if (y < 0 || x < 0) { - return 0; - } else { - va_list v; - va_start(v, fmt); - stbiw__writefv(s, fmt, v); - va_end(v); - stbiw__write_pixels(s, rgb_dir, vdir, x, y, comp, data, alpha, pad, - expand_mono); - return 1; - } -} - -static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, - const void *data) { - int pad = (-x * 3) & 3; - return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 0, pad, - "11 4 22 4" - "4 44 22 444444", - 'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0, - 14 + 40, // file header - 40, x, y, 1, 24, 0, 0, 0, 0, 0, 0); // bitmap header -} - -STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, - int y, int comp, const void *data) { - stbi__write_context s; - stbi__start_write_callbacks(&s, func, context); - return stbi_write_bmp_core(&s, x, y, comp, data); -} - -#ifndef STBI_WRITE_NO_STDIO -STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, - const void *data) { - stbi__write_context s; - if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_bmp_core(&s, x, y, comp, data); - stbi__end_write_file(&s); - return r; - } else - return 0; -} -#endif //! STBI_WRITE_NO_STDIO - -static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, - void *data) { - int has_alpha = (comp == 2 || comp == 4); - int colorbytes = has_alpha ? comp - 1 : comp; - int format = - colorbytes < 2 - ? 
3 - : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3 - - if (y < 0 || x < 0) - return 0; - - if (!stbi_write_tga_with_rle) { - return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *)data, has_alpha, 0, - "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, - (colorbytes + has_alpha) * 8, has_alpha * 8); - } else { - int i, j, k; - int jend, jdir; - - stbiw__writef(s, "111 221 2222 11", 0, 0, format + 8, 0, 0, 0, 0, 0, x, y, - (colorbytes + has_alpha) * 8, has_alpha * 8); - - if (stbi__flip_vertically_on_write) { - j = 0; - jend = y; - jdir = 1; - } else { - j = y - 1; - jend = -1; - jdir = -1; - } - for (; j != jend; j += jdir) { - unsigned char *row = (unsigned char *)data + j * x * comp; - int len; - - for (i = 0; i < x; i += len) { - unsigned char *begin = row + i * comp; - int diff = 1; - len = 1; - - if (i < x - 1) { - ++len; - diff = memcmp(begin, row + (i + 1) * comp, comp); - if (diff) { - const unsigned char *prev = begin; - for (k = i + 2; k < x && len < 128; ++k) { - if (memcmp(prev, row + k * comp, comp)) { - prev += comp; - ++len; - } else { - --len; - break; - } - } - } else { - for (k = i + 2; k < x && len < 128; ++k) { - if (!memcmp(begin, row + k * comp, comp)) { - ++len; - } else { - break; - } - } - } - } - - if (diff) { - unsigned char header = STBIW_UCHAR(len - 1); - s->func(s->context, &header, 1); - for (k = 0; k < len; ++k) { - stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp); - } - } else { - unsigned char header = STBIW_UCHAR(len - 129); - s->func(s->context, &header, 1); - stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); - } - } - } - } - return 1; -} - -STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, - int y, int comp, const void *data) { - stbi__write_context s; - stbi__start_write_callbacks(&s, func, context); - return stbi_write_tga_core(&s, x, y, comp, (void *)data); -} - -#ifndef STBI_WRITE_NO_STDIO -STBIWDEF int stbi_write_tga(char const *filename, int x, int 
y, int comp, - const void *data) { - stbi__write_context s; - if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_tga_core(&s, x, y, comp, (void *)data); - stbi__end_write_file(&s); - return r; - } else - return 0; -} -#endif - -// ************************************************************************************************* -// Radiance RGBE HDR writer -// by Baldur Karlsson - -#define stbiw__max(a, b) ((a) > (b) ? (a) : (b)) - -static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) { - int exponent; - float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2])); - - if (maxcomp < 1e-32f) { - rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0; - } else { - float normalize = (float)frexp(maxcomp, &exponent) * 256.0f / maxcomp; - - rgbe[0] = (unsigned char)(linear[0] * normalize); - rgbe[1] = (unsigned char)(linear[1] * normalize); - rgbe[2] = (unsigned char)(linear[2] * normalize); - rgbe[3] = (unsigned char)(exponent + 128); - } -} - -static void stbiw__write_run_data(stbi__write_context *s, int length, - unsigned char databyte) { - unsigned char lengthbyte = STBIW_UCHAR(length + 128); - STBIW_ASSERT(length + 128 <= 255); - s->func(s->context, &lengthbyte, 1); - s->func(s->context, &databyte, 1); -} - -static void stbiw__write_dump_data(stbi__write_context *s, int length, - unsigned char *data) { - unsigned char lengthbyte = STBIW_UCHAR(length); - STBIW_ASSERT(length <= - 128); // inconsistent with spec but consistent with official code - s->func(s->context, &lengthbyte, 1); - s->func(s->context, data, length); -} - -static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, - int ncomp, unsigned char *scratch, - float *scanline) { - unsigned char scanlineheader[4] = {2, 2, 0, 0}; - unsigned char rgbe[4]; - float linear[3]; - int x; - - scanlineheader[2] = (width & 0xff00) >> 8; - scanlineheader[3] = (width & 0x00ff); - - /* skip RLE for images too small or large */ - if (width < 8 || width >= 32768) { - for (x = 0; x 
< width; x++) { - switch (ncomp) { - case 4: /* fallthrough */ - case 3: - linear[2] = scanline[x * ncomp + 2]; - linear[1] = scanline[x * ncomp + 1]; - linear[0] = scanline[x * ncomp + 0]; - break; - default: - linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; - break; - } - stbiw__linear_to_rgbe(rgbe, linear); - s->func(s->context, rgbe, 4); - } - } else { - int c, r; - /* encode into scratch buffer */ - for (x = 0; x < width; x++) { - switch (ncomp) { - case 4: /* fallthrough */ - case 3: - linear[2] = scanline[x * ncomp + 2]; - linear[1] = scanline[x * ncomp + 1]; - linear[0] = scanline[x * ncomp + 0]; - break; - default: - linear[0] = linear[1] = linear[2] = scanline[x * ncomp + 0]; - break; - } - stbiw__linear_to_rgbe(rgbe, linear); - scratch[x + width * 0] = rgbe[0]; - scratch[x + width * 1] = rgbe[1]; - scratch[x + width * 2] = rgbe[2]; - scratch[x + width * 3] = rgbe[3]; - } - - s->func(s->context, scanlineheader, 4); - - /* RLE each component separately */ - for (c = 0; c < 4; c++) { - unsigned char *comp = &scratch[width * c]; - - x = 0; - while (x < width) { - // find first run - r = x; - while (r + 2 < width) { - if (comp[r] == comp[r + 1] && comp[r] == comp[r + 2]) - break; - ++r; - } - if (r + 2 >= width) - r = width; - // dump up to first run - while (x < r) { - int len = r - x; - if (len > 128) - len = 128; - stbiw__write_dump_data(s, len, &comp[x]); - x += len; - } - // if there's a run, output it - if (r + 2 < width) { // same test as what we break out of in search - // loop, so only true if we break'd - // find next byte after run - while (r < width && comp[r] == comp[x]) - ++r; - // output run up to r - while (x < r) { - int len = r - x; - if (len > 127) - len = 127; - stbiw__write_run_data(s, len, comp[x]); - x += len; - } - } - } - } - } -} - -static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, - float *data) { - if (y <= 0 || x <= 0 || data == NULL) - return 0; - else { - // Each component is stored 
separately. Allocate scratch space for full - // output scanline. - unsigned char *scratch = (unsigned char *)STBIW_MALLOC(x * 4); - int i, len; - char buffer[128]; - char header[] = - "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; - s->func(s->context, header, sizeof(header) - 1); - -#ifdef __STDC_WANT_SECURE_LIB__ - len = - sprintf_s(buffer, sizeof(buffer), - "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); -#else - len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", - y, x); -#endif - s->func(s->context, buffer, len); - - for (i = 0; i < y; i++) - stbiw__write_hdr_scanline( - s, x, comp, scratch, - data + comp * x * (stbi__flip_vertically_on_write ? y - 1 - i : i)); - STBIW_FREE(scratch); - return 1; - } -} - -STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, - int y, int comp, const float *data) { - stbi__write_context s; - stbi__start_write_callbacks(&s, func, context); - return stbi_write_hdr_core(&s, x, y, comp, (float *)data); -} - -#ifndef STBI_WRITE_NO_STDIO -STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, - const float *data) { - stbi__write_context s; - if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_hdr_core(&s, x, y, comp, (float *)data); - stbi__end_write_file(&s); - return r; - } else - return 0; -} -#endif // STBI_WRITE_NO_STDIO - -////////////////////////////////////////////////////////////////////////////// -// -// PNG writer -// - -#ifndef STBIW_ZLIB_COMPRESS -// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() -// == vector<>::size() -#define stbiw__sbraw(a) ((int *)(a)-2) -#define stbiw__sbm(a) stbiw__sbraw(a)[0] -#define stbiw__sbn(a) stbiw__sbraw(a)[1] - -#define stbiw__sbneedgrow(a, n) ((a) == 0 || stbiw__sbn(a) + n >= stbiw__sbm(a)) -#define stbiw__sbmaybegrow(a, n) \ - (stbiw__sbneedgrow(a, (n)) ? 
stbiw__sbgrow(a, n) : 0) -#define stbiw__sbgrow(a, n) stbiw__sbgrowf((void **)&(a), (n), sizeof(*(a))) - -#define stbiw__sbpush(a, v) \ - (stbiw__sbmaybegrow(a, 1), (a)[stbiw__sbn(a)++] = (v)) -#define stbiw__sbcount(a) ((a) ? stbiw__sbn(a) : 0) -#define stbiw__sbfree(a) ((a) ? STBIW_FREE(stbiw__sbraw(a)), 0 : 0) - -static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) { - int m = *arr ? 2 * stbiw__sbm(*arr) + increment : increment + 1; - void *p = STBIW_REALLOC_SIZED( - *arr ? stbiw__sbraw(*arr) : 0, - *arr ? (stbiw__sbm(*arr) * itemsize + sizeof(int) * 2) : 0, - itemsize * m + sizeof(int) * 2); - STBIW_ASSERT(p); - if (p) { - if (!*arr) - ((int *)p)[1] = 0; - *arr = (void *)((int *)p + 2); - stbiw__sbm(*arr) = m; - } - return *arr; -} - -static unsigned char *stbiw__zlib_flushf(unsigned char *data, - unsigned int *bitbuffer, - int *bitcount) { - while (*bitcount >= 8) { - stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); - *bitbuffer >>= 8; - *bitcount -= 8; - } - return data; -} - -static int stbiw__zlib_bitrev(int code, int codebits) { - int res = 0; - while (codebits--) { - res = (res << 1) | (code & 1); - code >>= 1; - } - return res; -} - -static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, - int limit) { - int i; - for (i = 0; i < limit && i < 258; ++i) - if (a[i] != b[i]) - break; - return i; -} - -static unsigned int stbiw__zhash(unsigned char *data) { - stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); - hash ^= hash << 3; - hash += hash >> 5; - hash ^= hash << 4; - hash += hash >> 17; - hash ^= hash << 25; - hash += hash >> 6; - return hash; -} - -#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) -#define stbiw__zlib_add(code, codebits) \ - (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush()) -#define stbiw__zlib_huffa(b, c) stbiw__zlib_add(stbiw__zlib_bitrev(b, c), c) -// default huffman tables -#define stbiw__zlib_huff1(n) stbiw__zlib_huffa(0x30 + (n), 
8) -#define stbiw__zlib_huff2(n) stbiw__zlib_huffa(0x190 + (n)-144, 9) -#define stbiw__zlib_huff3(n) stbiw__zlib_huffa(0 + (n)-256, 7) -#define stbiw__zlib_huff4(n) stbiw__zlib_huffa(0xc0 + (n)-280, 8) -#define stbiw__zlib_huff(n) \ - ((n) <= 143 ? stbiw__zlib_huff1(n) \ - : (n) <= 255 ? stbiw__zlib_huff2(n) \ - : (n) <= 279 ? stbiw__zlib_huff3(n) \ - : stbiw__zlib_huff4(n)) -#define stbiw__zlib_huffb(n) \ - ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n)) - -#define stbiw__ZHASH 16384 - -#endif // STBIW_ZLIB_COMPRESS - -STBIWDEF unsigned char *stbi_zlib_compress(unsigned char *data, int data_len, - int *out_len, int quality) { -#ifdef STBIW_ZLIB_COMPRESS - // user provided a zlib compress implementation, use that - return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality); -#else // use builtin - static unsigned short lengthc[] = { - 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, - 31, 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 259}; - static unsigned char lengtheb[] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, - 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, - 4, 4, 4, 4, 5, 5, 5, 5, 0}; - static unsigned short distc[] = { - 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, - 49, 65, 97, 129, 193, 257, 385, 513, 769, 1025, 1537, - 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 32768}; - static unsigned char disteb[] = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, - 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, - 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; - unsigned int bitbuf = 0; - int i, j, bitcount = 0; - unsigned char *out = NULL; - unsigned char ***hash_table = - (unsigned char ***)STBIW_MALLOC(stbiw__ZHASH * sizeof(unsigned char **)); - if (hash_table == NULL) - return NULL; - if (quality < 5) - quality = 5; - - stbiw__sbpush(out, 0x78); // DEFLATE 32K window - stbiw__sbpush(out, 0x5e); // FLEVEL = 1 - stbiw__zlib_add(1, 1); // BFINAL = 1 - stbiw__zlib_add(1, 2); // BTYPE = 1 -- fixed huffman - - for (i = 0; i < stbiw__ZHASH; ++i) - hash_table[i] = NULL; - - i = 0; - while (i < data_len - 3) { - 
// hash next 3 bytes of data to be compressed - int h = stbiw__zhash(data + i) & (stbiw__ZHASH - 1), best = 3; - unsigned char *bestloc = 0; - unsigned char **hlist = hash_table[h]; - int n = stbiw__sbcount(hlist); - for (j = 0; j < n; ++j) { - if (hlist[j] - data > i - 32768) { // if entry lies within window - int d = stbiw__zlib_countm(hlist[j], data + i, data_len - i); - if (d >= best) { - best = d; - bestloc = hlist[j]; - } - } - } - // when hash table entry is too long, delete half the entries - if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2 * quality) { - STBIW_MEMMOVE(hash_table[h], hash_table[h] + quality, - sizeof(hash_table[h][0]) * quality); - stbiw__sbn(hash_table[h]) = quality; - } - stbiw__sbpush(hash_table[h], data + i); - - if (bestloc) { - // "lazy matching" - check match at *next* byte, and if it's better, do - // cur byte as literal - h = stbiw__zhash(data + i + 1) & (stbiw__ZHASH - 1); - hlist = hash_table[h]; - n = stbiw__sbcount(hlist); - for (j = 0; j < n; ++j) { - if (hlist[j] - data > i - 32767) { - int e = stbiw__zlib_countm(hlist[j], data + i + 1, data_len - i - 1); - if (e > best) { // if next match is better, bail on current match - bestloc = NULL; - break; - } - } - } - } - - if (bestloc) { - int d = (int)(data + i - bestloc); // distance back - STBIW_ASSERT(d <= 32767 && best <= 258); - for (j = 0; best > lengthc[j + 1] - 1; ++j) - ; - stbiw__zlib_huff(j + 257); - if (lengtheb[j]) - stbiw__zlib_add(best - lengthc[j], lengtheb[j]); - for (j = 0; d > distc[j + 1] - 1; ++j) - ; - stbiw__zlib_add(stbiw__zlib_bitrev(j, 5), 5); - if (disteb[j]) - stbiw__zlib_add(d - distc[j], disteb[j]); - i += best; - } else { - stbiw__zlib_huffb(data[i]); - ++i; - } - } - // write out final bytes - for (; i < data_len; ++i) - stbiw__zlib_huffb(data[i]); - stbiw__zlib_huff(256); // end of block - // pad with 0 bits to byte boundary - while (bitcount) - stbiw__zlib_add(0, 1); - - for (i = 0; i < stbiw__ZHASH; ++i) - (void)stbiw__sbfree(hash_table[i]); 
- STBIW_FREE(hash_table); - - { - // compute adler32 on input - unsigned int s1 = 1, s2 = 0; - int blocklen = (int)(data_len % 5552); - j = 0; - while (j < data_len) { - for (i = 0; i < blocklen; ++i) { - s1 += data[j + i]; - s2 += s1; - } - s1 %= 65521; - s2 %= 65521; - j += blocklen; - blocklen = 5552; - } - stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8)); - stbiw__sbpush(out, STBIW_UCHAR(s2)); - stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8)); - stbiw__sbpush(out, STBIW_UCHAR(s1)); - } - *out_len = stbiw__sbn(out); - // make returned pointer freeable - STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len); - return (unsigned char *)stbiw__sbraw(out); -#endif // STBIW_ZLIB_COMPRESS -} - -static unsigned int stbiw__crc32(unsigned char *buffer, int len) { -#ifdef STBIW_CRC32 - return STBIW_CRC32(buffer, len); -#else - static unsigned int crc_table[256] = { - 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, - 0xE963A535, 0x9E6495A3, 0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, - 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, - 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, - 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, - 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, - 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C, - 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, - 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, - 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, - 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106, - 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, - 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, - 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, - 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, - 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 
0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, - 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, - 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, - 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, - 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, - 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, - 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, - 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, - 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, - 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, - 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, - 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E, - 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, - 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, - 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, - 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, - 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, - 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, - 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, - 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242, - 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, - 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, - 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, - 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, - 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, - 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, - 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, - 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 
0x2D02EF8D}; - - unsigned int crc = ~0u; - int i; - for (i = 0; i < len; ++i) - crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; - return ~crc; -#endif -} - -#define stbiw__wpng4(o, a, b, c, d) \ - ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), \ - (o)[3] = STBIW_UCHAR(d), (o) += 4) -#define stbiw__wp32(data, v) \ - stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v)); -#define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3]) - -static void stbiw__wpcrc(unsigned char **data, int len) { - unsigned int crc = stbiw__crc32(*data - len - 4, len + 4); - stbiw__wp32(*data, crc); -} - -static unsigned char stbiw__paeth(int a, int b, int c) { - int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c); - if (pa <= pb && pa <= pc) - return STBIW_UCHAR(a); - if (pb <= pc) - return STBIW_UCHAR(b); - return STBIW_UCHAR(c); -} - -// @OPTIMIZE: provide an option that always forces left-predict or paeth predict -static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, - int width, int height, int y, int n, - int filter_type, signed char *line_buffer) { - static int mapping[] = {0, 1, 2, 3, 4}; - static int firstmap[] = {0, 1, 0, 5, 6}; - int *mymap = (y != 0) ? mapping : firstmap; - int i; - int type = mymap[filter_type]; - unsigned char *z = - pixels + - stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y); - int signed_stride = - stbi__flip_vertically_on_write ? 
-stride_bytes : stride_bytes; - - if (type == 0) { - memcpy(line_buffer, z, width * n); - return; - } - - // first loop isn't optimized since it's just one pixel - for (i = 0; i < n; ++i) { - switch (type) { - case 1: - line_buffer[i] = z[i]; - break; - case 2: - line_buffer[i] = z[i] - z[i - signed_stride]; - break; - case 3: - line_buffer[i] = z[i] - (z[i - signed_stride] >> 1); - break; - case 4: - line_buffer[i] = - (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0)); - break; - case 5: - line_buffer[i] = z[i]; - break; - case 6: - line_buffer[i] = z[i]; - break; - } - } - switch (type) { - case 1: - for (i = n; i < width * n; ++i) - line_buffer[i] = z[i] - z[i - n]; - break; - case 2: - for (i = n; i < width * n; ++i) - line_buffer[i] = z[i] - z[i - signed_stride]; - break; - case 3: - for (i = n; i < width * n; ++i) - line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1); - break; - case 4: - for (i = n; i < width * n; ++i) - line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride], - z[i - signed_stride - n]); - break; - case 5: - for (i = n; i < width * n; ++i) - line_buffer[i] = z[i] - (z[i - n] >> 1); - break; - case 6: - for (i = n; i < width * n; ++i) - line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0); - break; - } -} - -STBIWDEF unsigned char *stbi_write_png_to_mem(const unsigned char *pixels, - int stride_bytes, int x, int y, - int n, int *out_len) { - int force_filter = stbi_write_force_png_filter; - int ctype[5] = {-1, 0, 4, 2, 6}; - unsigned char sig[8] = {137, 80, 78, 71, 13, 10, 26, 10}; - unsigned char *out, *o, *filt, *zlib; - signed char *line_buffer; - int j, zlen; - - if (stride_bytes == 0) - stride_bytes = x * n; - - if (force_filter >= 5) { - force_filter = -1; - } - - filt = (unsigned char *)STBIW_MALLOC((x * n + 1) * y); - if (!filt) - return 0; - line_buffer = (signed char *)STBIW_MALLOC(x * n); - if (!line_buffer) { - STBIW_FREE(filt); - return 0; - } - for (j = 0; j < y; ++j) { - int filter_type; - 
if (force_filter > -1) { - filter_type = force_filter; - stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j, - n, force_filter, line_buffer); - } else { // Estimate the best filter by running through all of them: - int best_filter = 0, best_filter_val = 0x7fffffff, est, i; - for (filter_type = 0; filter_type < 5; filter_type++) { - stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j, - n, filter_type, line_buffer); - - // Estimate the entropy of the line using this filter; the less, the - // better. - est = 0; - for (i = 0; i < x * n; ++i) { - est += abs((signed char)line_buffer[i]); - } - if (est < best_filter_val) { - best_filter_val = est; - best_filter = filter_type; - } - } - if (filter_type != best_filter) { // If the last iteration already got us - // the best filter, don't redo it - stbiw__encode_png_line((unsigned char *)(pixels), stride_bytes, x, y, j, - n, best_filter, line_buffer); - filter_type = best_filter; - } - } - // when we get here, filter_type contains the filter type, and line_buffer - // contains the data - filt[j * (x * n + 1)] = (unsigned char)filter_type; - STBIW_MEMMOVE(filt + j * (x * n + 1) + 1, line_buffer, x * n); - } - STBIW_FREE(line_buffer); - zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen, - stbi_write_png_compression_level); - STBIW_FREE(filt); - if (!zlib) - return 0; - - // each tag requires 12 bytes of overhead - out = (unsigned char *)STBIW_MALLOC(8 + 12 + 13 + 12 + zlen + 12); - if (!out) - return 0; - *out_len = 8 + 12 + 13 + 12 + zlen + 12; - - o = out; - STBIW_MEMMOVE(o, sig, 8); - o += 8; - stbiw__wp32(o, 13); // header length - stbiw__wptag(o, "IHDR"); - stbiw__wp32(o, x); - stbiw__wp32(o, y); - *o++ = 8; - *o++ = STBIW_UCHAR(ctype[n]); - *o++ = 0; - *o++ = 0; - *o++ = 0; - stbiw__wpcrc(&o, 13); - - stbiw__wp32(o, zlen); - stbiw__wptag(o, "IDAT"); - STBIW_MEMMOVE(o, zlib, zlen); - o += zlen; - STBIW_FREE(zlib); - stbiw__wpcrc(&o, zlen); - - stbiw__wp32(o, 0); - 
stbiw__wptag(o, "IEND"); - stbiw__wpcrc(&o, 0); - - STBIW_ASSERT(o == out + *out_len); - - return out; -} - -#ifndef STBI_WRITE_NO_STDIO -STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, - const void *data, int stride_bytes) { - FILE *f; - int len; - unsigned char *png = stbi_write_png_to_mem((const unsigned char *)data, - stride_bytes, x, y, comp, &len); - if (png == NULL) - return 0; - - f = stbiw__fopen(filename, "wb"); - if (!f) { - STBIW_FREE(png); - return 0; - } - fwrite(png, 1, len, f); - fclose(f); - STBIW_FREE(png); - return 1; -} -#endif - -STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, - int y, int comp, const void *data, - int stride_bytes) { - int len; - unsigned char *png = stbi_write_png_to_mem((const unsigned char *)data, - stride_bytes, x, y, comp, &len); - if (png == NULL) - return 0; - func(context, png, len); - STBIW_FREE(png); - return 1; -} - -/* *************************************************************************** - * - * JPEG writer - * - * This is based on Jon Olick's jo_jpeg.cpp: - * public domain Simple, Minimalistic JPEG writer - - * http://www.jonolick.com/code.html - */ - -static const unsigned char stbiw__jpg_ZigZag[] = { - 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, - 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, - 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, - 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}; - -static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, - int *bitCntP, const unsigned short *bs) { - int bitBuf = *bitBufP, bitCnt = *bitCntP; - bitCnt += bs[1]; - bitBuf |= bs[0] << (24 - bitCnt); - while (bitCnt >= 8) { - unsigned char c = (bitBuf >> 16) & 255; - stbiw__putc(s, c); - if (c == 255) { - stbiw__putc(s, 0); - } - bitBuf <<= 8; - bitCnt -= 8; - } - *bitBufP = bitBuf; - *bitCntP = bitCnt; -} - -static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, - 
float *d4p, float *d5p, float *d6p, float *d7p) { - float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, - d6 = *d6p, d7 = *d7p; - float z1, z2, z3, z4, z5, z11, z13; - - float tmp0 = d0 + d7; - float tmp7 = d0 - d7; - float tmp1 = d1 + d6; - float tmp6 = d1 - d6; - float tmp2 = d2 + d5; - float tmp5 = d2 - d5; - float tmp3 = d3 + d4; - float tmp4 = d3 - d4; - - // Even part - float tmp10 = tmp0 + tmp3; // phase 2 - float tmp13 = tmp0 - tmp3; - float tmp11 = tmp1 + tmp2; - float tmp12 = tmp1 - tmp2; - - d0 = tmp10 + tmp11; // phase 3 - d4 = tmp10 - tmp11; - - z1 = (tmp12 + tmp13) * 0.707106781f; // c4 - d2 = tmp13 + z1; // phase 5 - d6 = tmp13 - z1; - - // Odd part - tmp10 = tmp4 + tmp5; // phase 2 - tmp11 = tmp5 + tmp6; - tmp12 = tmp6 + tmp7; - - // The rotator is modified from fig 4-8 to avoid extra negations. - z5 = (tmp10 - tmp12) * 0.382683433f; // c6 - z2 = tmp10 * 0.541196100f + z5; // c2-c6 - z4 = tmp12 * 1.306562965f + z5; // c2+c6 - z3 = tmp11 * 0.707106781f; // c4 - - z11 = tmp7 + z3; // phase 5 - z13 = tmp7 - z3; - - *d5p = z13 + z2; // phase 6 - *d3p = z13 - z2; - *d1p = z11 + z4; - *d7p = z11 - z4; - - *d0p = d0; - *d2p = d2; - *d4p = d4; - *d6p = d6; -} - -static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) { - int tmp1 = val < 0 ? -val : val; - val = val < 0 ? 
val - 1 : val; - bits[1] = 1; - while (tmp1 >>= 1) { - ++bits[1]; - } - bits[0] = val & ((1 << bits[1]) - 1); -} - -static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, - int *bitCnt, float *CDU, float *fdtbl, int DC, - const unsigned short HTDC[256][2], - const unsigned short HTAC[256][2]) { - const unsigned short EOB[2] = {HTAC[0x00][0], HTAC[0x00][1]}; - const unsigned short M16zeroes[2] = {HTAC[0xF0][0], HTAC[0xF0][1]}; - int dataOff, i, diff, end0pos; - int DU[64]; - - // DCT rows - for (dataOff = 0; dataOff < 64; dataOff += 8) { - stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 1], &CDU[dataOff + 2], - &CDU[dataOff + 3], &CDU[dataOff + 4], &CDU[dataOff + 5], - &CDU[dataOff + 6], &CDU[dataOff + 7]); - } - // DCT columns - for (dataOff = 0; dataOff < 8; ++dataOff) { - stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff + 8], &CDU[dataOff + 16], - &CDU[dataOff + 24], &CDU[dataOff + 32], &CDU[dataOff + 40], - &CDU[dataOff + 48], &CDU[dataOff + 56]); - } - // Quantize/descale/zigzag the coefficients - for (i = 0; i < 64; ++i) { - float v = CDU[i] * fdtbl[i]; - // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + - // 0.5f)); ceilf() and floorf() are C99, not C89, but I /think/ they're not - // needed here anyway? - DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? 
v - 0.5f : v + 0.5f); - } - - // Encode DC - diff = DU[0] - DC; - if (diff == 0) { - stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]); - } else { - unsigned short bits[2]; - stbiw__jpg_calcBits(diff, bits); - stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]); - stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits); - } - // Encode ACs - end0pos = 63; - for (; (end0pos > 0) && (DU[end0pos] == 0); --end0pos) { - } - // end0pos = first element in reverse order !=0 - if (end0pos == 0) { - stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); - return DU[0]; - } - for (i = 1; i <= end0pos; ++i) { - int startpos = i; - int nrzeroes; - unsigned short bits[2]; - for (; DU[i] == 0 && i <= end0pos; ++i) { - } - nrzeroes = i - startpos; - if (nrzeroes >= 16) { - int lng = nrzeroes >> 4; - int nrmarker; - for (nrmarker = 1; nrmarker <= lng; ++nrmarker) - stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes); - nrzeroes &= 15; - } - stbiw__jpg_calcBits(DU[i], bits); - stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes << 4) + bits[1]]); - stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits); - } - if (end0pos != 63) { - stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB); - } - return DU[0]; -} - -static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, - int comp, const void *data, int quality) { - // Constants that don't pollute global namespace - static const unsigned char std_dc_luminance_nrcodes[] = { - 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0}; - static const unsigned char std_dc_luminance_values[] = {0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11}; - static const unsigned char std_ac_luminance_nrcodes[] = { - 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d}; - static const unsigned char std_ac_luminance_values[] = { - 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06, - 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08, - 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, - 0x82, 0x09, 0x0a, 0x16, 0x17, 
0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, - 0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45, - 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, - 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75, - 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, - 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, - 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, - 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, - 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2, - 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, - 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa}; - static const unsigned char std_dc_chrominance_nrcodes[] = { - 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0}; - static const unsigned char std_dc_chrominance_values[] = {0, 1, 2, 3, 4, 5, - 6, 7, 8, 9, 10, 11}; - static const unsigned char std_ac_chrominance_nrcodes[] = { - 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77}; - static const unsigned char std_ac_chrominance_values[] = { - 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41, - 0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91, - 0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1, - 0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26, - 0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, - 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, - 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, - 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, - 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, - 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, - 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, - 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 
0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, - 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4, - 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa}; - // Huffman tables - static const unsigned short YDC_HT[256][2] = { - {0, 2}, {2, 3}, {3, 3}, {4, 3}, {5, 3}, {6, 3}, - {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}, {510, 9}}; - static const unsigned short UVDC_HT[256][2] = { - {0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, - {62, 6}, {126, 7}, {254, 8}, {510, 9}, {1022, 10}, {2046, 11}}; - static const unsigned short YAC_HT[256][2] = { - {10, 4}, {0, 2}, {1, 2}, {4, 3}, {11, 4}, - {26, 5}, {120, 7}, {248, 8}, {1014, 10}, {65410, 16}, - {65411, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {12, 4}, {27, 5}, {121, 7}, - {502, 9}, {2038, 11}, {65412, 16}, {65413, 16}, {65414, 16}, - {65415, 16}, {65416, 16}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {28, 5}, {249, 8}, - {1015, 10}, {4084, 12}, {65417, 16}, {65418, 16}, {65419, 16}, - {65420, 16}, {65421, 16}, {65422, 16}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {58, 6}, - {503, 9}, {4085, 12}, {65423, 16}, {65424, 16}, {65425, 16}, - {65426, 16}, {65427, 16}, {65428, 16}, {65429, 16}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {59, 6}, {1016, 10}, {65430, 16}, {65431, 16}, {65432, 16}, - {65433, 16}, {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {122, 7}, {2039, 11}, {65438, 16}, {65439, 16}, - {65440, 16}, {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, - {65445, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {123, 7}, {4086, 12}, {65446, 16}, - {65447, 16}, {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, - {65452, 16}, {65453, 16}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {250, 8}, {4087, 12}, - {65454, 16}, {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, - {65459, 16}, {65460, 16}, {65461, 16}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {504, 9}, - {32704, 15}, {65462, 16}, {65463, 
16}, {65464, 16}, {65465, 16}, - {65466, 16}, {65467, 16}, {65468, 16}, {65469, 16}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {505, 9}, {65470, 16}, {65471, 16}, {65472, 16}, {65473, 16}, - {65474, 16}, {65475, 16}, {65476, 16}, {65477, 16}, {65478, 16}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {506, 9}, {65479, 16}, {65480, 16}, {65481, 16}, - {65482, 16}, {65483, 16}, {65484, 16}, {65485, 16}, {65486, 16}, - {65487, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {1017, 10}, {65488, 16}, {65489, 16}, - {65490, 16}, {65491, 16}, {65492, 16}, {65493, 16}, {65494, 16}, - {65495, 16}, {65496, 16}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {1018, 10}, {65497, 16}, - {65498, 16}, {65499, 16}, {65500, 16}, {65501, 16}, {65502, 16}, - {65503, 16}, {65504, 16}, {65505, 16}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2040, 11}, - {65506, 16}, {65507, 16}, {65508, 16}, {65509, 16}, {65510, 16}, - {65511, 16}, {65512, 16}, {65513, 16}, {65514, 16}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {65515, 16}, {65516, 16}, {65517, 16}, {65518, 16}, {65519, 16}, - {65520, 16}, {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {2041, 11}, {65525, 16}, {65526, 16}, {65527, 16}, {65528, 16}, - {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, - {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}}; - static const unsigned short UVAC_HT[256][2] = { - {0, 2}, {1, 2}, {4, 3}, {10, 4}, {24, 5}, - {25, 5}, {56, 6}, {120, 7}, {500, 9}, {1014, 10}, - {4084, 12}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {11, 4}, {57, 6}, {246, 8}, - {501, 9}, {2038, 11}, {4085, 12}, {65416, 16}, {65417, 16}, - {65418, 16}, {65419, 16}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {26, 5}, {247, 8}, - {1015, 10}, {4086, 12}, {32706, 15}, {65420, 16}, {65421, 16}, - {65422, 16}, {65423, 16}, {65424, 16}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {27, 5}, - {248, 8}, {1016, 
10}, {4087, 12}, {65425, 16}, {65426, 16}, - {65427, 16}, {65428, 16}, {65429, 16}, {65430, 16}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {58, 6}, {502, 9}, {65431, 16}, {65432, 16}, {65433, 16}, - {65434, 16}, {65435, 16}, {65436, 16}, {65437, 16}, {65438, 16}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {59, 6}, {1017, 10}, {65439, 16}, {65440, 16}, - {65441, 16}, {65442, 16}, {65443, 16}, {65444, 16}, {65445, 16}, - {65446, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {121, 7}, {2039, 11}, {65447, 16}, - {65448, 16}, {65449, 16}, {65450, 16}, {65451, 16}, {65452, 16}, - {65453, 16}, {65454, 16}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {122, 7}, {2040, 11}, - {65455, 16}, {65456, 16}, {65457, 16}, {65458, 16}, {65459, 16}, - {65460, 16}, {65461, 16}, {65462, 16}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {249, 8}, - {65463, 16}, {65464, 16}, {65465, 16}, {65466, 16}, {65467, 16}, - {65468, 16}, {65469, 16}, {65470, 16}, {65471, 16}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {503, 9}, {65472, 16}, {65473, 16}, {65474, 16}, {65475, 16}, - {65476, 16}, {65477, 16}, {65478, 16}, {65479, 16}, {65480, 16}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {504, 9}, {65481, 16}, {65482, 16}, {65483, 16}, - {65484, 16}, {65485, 16}, {65486, 16}, {65487, 16}, {65488, 16}, - {65489, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {505, 9}, {65490, 16}, {65491, 16}, - {65492, 16}, {65493, 16}, {65494, 16}, {65495, 16}, {65496, 16}, - {65497, 16}, {65498, 16}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {506, 9}, {65499, 16}, - {65500, 16}, {65501, 16}, {65502, 16}, {65503, 16}, {65504, 16}, - {65505, 16}, {65506, 16}, {65507, 16}, {0, 0}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {2041, 11}, - {65508, 16}, {65509, 16}, {65510, 16}, {65511, 16}, {65512, 16}, - {65513, 16}, {65514, 16}, {65515, 16}, {65516, 16}, {0, 0}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {16352, 14}, {65517, 16}, {65518, 16}, 
{65519, 16}, {65520, 16}, - {65521, 16}, {65522, 16}, {65523, 16}, {65524, 16}, {65525, 16}, - {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {1018, 10}, {32707, 15}, {65526, 16}, {65527, 16}, {65528, 16}, - {65529, 16}, {65530, 16}, {65531, 16}, {65532, 16}, {65533, 16}, - {65534, 16}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, - {0, 0}}; - static const int YQT[] = { - 16, 11, 10, 16, 24, 40, 51, 61, 12, 12, 14, 19, 26, 58, 60, 55, - 14, 13, 16, 24, 40, 57, 69, 56, 14, 17, 22, 29, 51, 87, 80, 62, - 18, 22, 37, 56, 68, 109, 103, 77, 24, 35, 55, 64, 81, 104, 113, 92, - 49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99}; - static const int UVQT[] = {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, - 99, 99, 99, 24, 26, 56, 99, 99, 99, 99, 99, 47, 66, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}; - static const float aasf[] = { - 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, - 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, - 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, - 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f}; - - int row, col, i, k; - float fdtbl_Y[64], fdtbl_UV[64]; - unsigned char YTable[64], UVTable[64]; - - if (!data || !width || !height || comp > 4 || comp < 1) { - return 0; - } - - quality = quality ? quality : 90; - quality = quality < 1 ? 1 : quality > 100 ? 100 : quality; - quality = quality < 50 ? 5000 / quality : 200 - quality * 2; - - for (i = 0; i < 64; ++i) { - int uvti, yti = (YQT[i] * quality + 50) / 100; - YTable[stbiw__jpg_ZigZag[i]] = - (unsigned char)(yti < 1 ? 1 : yti > 255 ? 255 : yti); - uvti = (UVQT[i] * quality + 50) / 100; - UVTable[stbiw__jpg_ZigZag[i]] = - (unsigned char)(uvti < 1 ? 1 : uvti > 255 ? 
255 : uvti); - } - - for (row = 0, k = 0; row < 8; ++row) { - for (col = 0; col < 8; ++col, ++k) { - fdtbl_Y[k] = 1 / (YTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); - fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]); - } - } - - // Write Headers - { - static const unsigned char head0[] = { - 0xFF, 0xD8, 0xFF, 0xE0, 0, 0x10, 'J', 'F', 'I', 'F', 0, 1, 1, - 0, 0, 1, 0, 1, 0, 0, 0xFF, 0xDB, 0, 0x84, 0}; - static const unsigned char head2[] = {0xFF, 0xDA, 0, 0xC, 3, 1, 0, - 2, 0x11, 3, 0x11, 0, 0x3F, 0}; - const unsigned char head1[] = {0xFF, - 0xC0, - 0, - 0x11, - 8, - (unsigned char)(height >> 8), - STBIW_UCHAR(height), - (unsigned char)(width >> 8), - STBIW_UCHAR(width), - 3, - 1, - 0x11, - 0, - 2, - 0x11, - 1, - 3, - 0x11, - 1, - 0xFF, - 0xC4, - 0x01, - 0xA2, - 0}; - s->func(s->context, (void *)head0, sizeof(head0)); - s->func(s->context, (void *)YTable, sizeof(YTable)); - stbiw__putc(s, 1); - s->func(s->context, UVTable, sizeof(UVTable)); - s->func(s->context, (void *)head1, sizeof(head1)); - s->func(s->context, (void *)(std_dc_luminance_nrcodes + 1), - sizeof(std_dc_luminance_nrcodes) - 1); - s->func(s->context, (void *)std_dc_luminance_values, - sizeof(std_dc_luminance_values)); - stbiw__putc(s, 0x10); // HTYACinfo - s->func(s->context, (void *)(std_ac_luminance_nrcodes + 1), - sizeof(std_ac_luminance_nrcodes) - 1); - s->func(s->context, (void *)std_ac_luminance_values, - sizeof(std_ac_luminance_values)); - stbiw__putc(s, 1); // HTUDCinfo - s->func(s->context, (void *)(std_dc_chrominance_nrcodes + 1), - sizeof(std_dc_chrominance_nrcodes) - 1); - s->func(s->context, (void *)std_dc_chrominance_values, - sizeof(std_dc_chrominance_values)); - stbiw__putc(s, 0x11); // HTUACinfo - s->func(s->context, (void *)(std_ac_chrominance_nrcodes + 1), - sizeof(std_ac_chrominance_nrcodes) - 1); - s->func(s->context, (void *)std_ac_chrominance_values, - sizeof(std_ac_chrominance_values)); - s->func(s->context, (void *)head2, sizeof(head2)); - } - 
- // Encode 8x8 macroblocks - { - static const unsigned short fillBits[] = {0x7F, 7}; - const unsigned char *imageData = (const unsigned char *)data; - int DCY = 0, DCU = 0, DCV = 0; - int bitBuf = 0, bitCnt = 0; - // comp == 2 is grey+alpha (alpha is ignored) - int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0; - int x, y, pos; - for (y = 0; y < height; y += 8) { - for (x = 0; x < width; x += 8) { - float YDU[64], UDU[64], VDU[64]; - for (row = y, pos = 0; row < y + 8; ++row) { - // row >= height => use last input row - int clamped_row = (row < height) ? row : height - 1; - int base_p = - (stbi__flip_vertically_on_write ? (height - 1 - clamped_row) - : clamped_row) * - width * comp; - for (col = x; col < x + 8; ++col, ++pos) { - float r, g, b; - // if col >= width => use pixel from last input column - int p = base_p + ((col < width) ? col : (width - 1)) * comp; - - r = imageData[p + 0]; - g = imageData[p + ofsG]; - b = imageData[p + ofsB]; - YDU[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128; - UDU[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b; - VDU[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b; - } - } - - DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, - YDC_HT, YAC_HT); - DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, - UVDC_HT, UVAC_HT); - DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, - UVDC_HT, UVAC_HT); - } - } - - // Do the bit alignment of the EOI marker - stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits); - } - - // EOI - stbiw__putc(s, 0xFF); - stbiw__putc(s, 0xD9); - - return 1; -} - -STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, - int y, int comp, const void *data, - int quality) { - stbi__write_context s; - stbi__start_write_callbacks(&s, func, context); - return stbi_write_jpg_core(&s, x, y, comp, (void *)data, quality); -} - -#ifndef STBI_WRITE_NO_STDIO -STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, - 
const void *data, int quality) { - stbi__write_context s; - if (stbi__start_write_file(&s, filename)) { - int r = stbi_write_jpg_core(&s, x, y, comp, data, quality); - stbi__end_write_file(&s); - return r; - } else - return 0; -} -#endif - -#endif // STB_IMAGE_WRITE_IMPLEMENTATION - -/* Revision history - 1.11 (2019-08-11) - - 1.10 (2019-02-07) - support utf8 filenames in Windows; fix warnings and platform ifdefs - 1.09 (2018-02-11) - fix typo in zlib quality API, improve STB_I_W_STATIC in C++ - 1.08 (2018-01-29) - add stbi__flip_vertically_on_write, external zlib, zlib quality, - choose PNG filter 1.07 (2017-07-24) doc fix 1.06 (2017-07-23) writing JPEG - (using Jon Olick's code) 1.05 ??? 1.04 (2017-03-03) monochrome BMP - expansion 1.03 ??? 1.02 (2016-04-02) avoid allocating large structures on - the stack 1.01 (2016-01-16) STBIW_REALLOC_SIZED: support allocators with no - realloc support avoid race-condition in crc initialization minor compile - issues 1.00 (2015-09-14) installable file IO function 0.99 (2015-09-13) - warning fixes; TGA rle support - 0.98 (2015-04-08) - added STBIW_MALLOC, STBIW_ASSERT etc - 0.97 (2015-01-18) - fixed HDR asserts, rewrote HDR rle logic - 0.96 (2015-01-17) - add HDR output - fix monochrome BMP - 0.95 (2014-08-17) - add monochrome TGA output - 0.94 (2014-05-31) - rename private functions to avoid conflicts with stb_image.h - 0.93 (2014-05-27) - warning fixes - 0.92 (2010-08-01) - casts to unsigned char to fix warnings - 0.91 (2010-07-17) - first public release - 0.90 first internal release -*/ - -/* ------------------------------------------------------------------------------- -This software is available under 2 licenses -- choose whichever you prefer. 
------------------------------------------------------------------------------- -ALTERNATIVE A - MIT License -Copyright (c) 2017 Sean Barrett -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. ------------------------------------------------------------------------------- -ALTERNATIVE B - Public Domain (www.unlicense.org) -This is free and unencumbered software released into the public domain. -Anyone is free to copy, modify, publish, use, compile, sell, or distribute this -software, either in source code form or as a compiled binary, for any purpose, -commercial or non-commercial, and by any means. -In jurisdictions that recognize copyright laws, the author or authors of this -software dedicate any and all copyright interest in the software to the public -domain. We make this dedication for the benefit of the public at large and to -the detriment of our heirs and successors. 
We intend this dedication to be an -overt act of relinquishment in perpetuity of all present and future rights to -this software under copyright law. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ------------------------------------------------------------------------------- -*/ diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h deleted file mode 100644 index 52f08730620945d3559c58d26051e81437996eac..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef IMG_TENSOR_RUNTIME_H -#define IMG_TENSOR_RUNTIME_H - -#include "device_math.h" -#include "img_tensor_utils.h" -#include <cstddef> - -// *** Runtime declaration *** // -void *tensorFft(void *input, bool inverse); -void *tensorFftHalf(void *input, bool inverse); -void *tensorReduce(void *input, size_t axis, MathOp func, - float skip_ratio = 0.0f); -void *tensorReduceHalf(void *input, size_t axis, MathOp func, - float skip_ratio = 0.0f); -void *tensorProjectiveT(void *input, void *transformation); -void *tensorMap1(MathOp f, void *i); -void *tensorMap2(MathOp f2, void *i1, void *i2); -void *tensorMap3(MathOp f3, void *i1, void *i2, void *i3); -void *tensorMap1Half(MathOp f, void *i); -void *tensorMap2Half(MathOp f2, void *i1, void *i2); -void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3); - -// *** Wrapper API declaration *** // -extern "C" { -void *wrapper_tensorFft(const char *hpvm_node_id, void 
*input, bool inverse); -void *wrapper_tensorReduce(const char *hpvm_node_id, void *input, int axis, - int func); -void *wrapper_tensorProjectiveT(const char *hpvm_node_id, void *input, - void *transformation); -void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input); -void *wrapper_tensorMap2(const char *hpvm_node_id, int func, void *input1, - void *input2); -void *wrapper_tensorMap3(const char *hpvm_node_id, int func, void *input1, - void *input2, void *input3); - -// Tentative -void *wrapper_tensorStencil(const char *hpvm_node_id, void *input); -void *wrapper_tensorCosineT(const char *hpvm_node_id, void *input); -} - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h deleted file mode 100644 index 5dc3fe3dbc3cec9ea81fa33bc56471e2d6daaae5..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h +++ /dev/null @@ -1,44 +0,0 @@ -/* -img_tensor_utils.h -Util functions for image load/save, image quality calculation (PSNR), etc. 
-*/ -#ifndef IMG_TENSOR_UTILS -#define IMG_TENSOR_UTILS - -#include <string> -#include <vector> - -#include "tensor.h" - -const size_t N_RGB_CHAN = 3; - -// Loader constructor -void *loadAsImage(const char *filename, size_t n_color = N_RGB_CHAN); - -void saveToImage(const char *filename, Tensor *tensor); - -Tensor *readDataSet(const char *path, size_t start = 0, - size_t count = std::string::npos, - size_t n_color = N_RGB_CHAN); - -void saveDataSet(const char *path, Tensor *batch, size_t start_idx = 0, - size_t write_n = 0); - -// Kernel constructor -void *createFilterFromData(int data_type, void *data, size_t w, size_t h, - size_t n_chan); - -std::vector<float> PSNR(void *gold_ptr, void *approx_ptr); - -float violationRate(const std::vector<float> &values, float threshold, - bool higher_better = true); - -float mean(const std::vector<float> &values); - -std::vector<float> SSIM(void *lhs_ptr, void *rhs_ptr); - -void *sliceTensorInBatch(void *whole, size_t start, size_t end); - -void reshape(void *t, const std::vector<size_t> &shape); - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h index f2c732cb2743daebadec4fddc2ad88d799959dbb..a766c02d6cc724fe91e4ef581871497cfddee788 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h @@ -4,5 +4,4 @@ void llvm_hpvm_initializeRuntimeController(const char *); void llvm_hpvm_clearRuntimeController(); void llvm_hpvm_invokeRtControl(void *result, const char *str, int start, int end); -void llvm_hpvm_imgInvokeRtControl(void *result, void *gold, int start, int end); } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h index 
4bb703bbd2596980fb4d930b36aaa749c7144044..d070d7755c1f5982c2c9fabf1acdca83bd446870 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h @@ -1,122 +1,79 @@ -//===--------------------------- tensor_cpu_runtime.h -----------------------===// +//===--------------------------- tensor_cpu_runtime.h +//-----------------------===// // //===----------------------------------------------------------------------===// -// +// // This header file comprises of the API to the tensor routines for CPU. // This also contains the interfaces to the approximated versions of tensor // operations that are supported on CPU. // //===----------------------------------------------------------------------===// - #include <stdio.h> #include <cstdlib> #include <cmath> #include <memory> #include <string> - #ifndef TENSOR_CPU_HEADER #define TENSOR_CPU_HEADER +extern "C" { +/**** Initialization Routine - Must be inserted at program start (in the + * backend) ****/ +void llvm_hpvm_initTensorRtCPU(); +void llvm_hpvm_cleanupTensorRtCPU(); -extern "C"{ - /**** Initialization Routine - Must be inserted at program start (in the backend) ****/ - void llvm_hpvm_initTensorRtCPU(); - void llvm_hpvm_cleanupTensorRtCPU(); +// Routine to moving tensor data (from and to GPU,CPU) +void hpvm_request_tensorCPU(void *tensor, int destination); - // Routine to moving tensor data (from and to GPU,CPU) - void hpvm_request_tensorCPU(void* tensor, int destination); +// NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for +// cuDNN operations NOTE: The only data format supported as of now is: NCHW +// (batch_dimension, channels, Height, Width) +// void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t +// dim2_size, +/// size_t dim3_size, size_t dim4_size, bool freeMemory = true); +void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes); - // NOTE: 
Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations - // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width) - //void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size, - /// size_t dim3_size, size_t dim4_size, bool freeMemory = true); - - void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes); +/********** Tensor Operation API ******/ - /********** Tensor Operation API ******/ +// NOTE: For conv_mode, only value '1' is supported +void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision, int row, int col, + int skip_every, int start); - // NOTE: For conv_mode, only value '1' is supported -void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int row, int col, int skip_every, int start); +void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision, int row, int col, + int skip_every, int start); -void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int row, int col, int skip_every, int start); +void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups); -void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups); - - void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, 
void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon); +void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon); +void *tensorPoolingCPU(void *input, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride); - void* tensorPoolingCPU(void* input, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride); +void *tensorGemmCPU(void *lhs, void *rhs); - void* tensorGemmCPU(void* lhs, void* rhs); +void *tensorAddCPU(void *x, void *bias); - void* tensorAddCPU(void* x, void* bias); +void *tensorReluCPU(void *input); - void* tensorReluCPU(void* input); - - void* tensorRelu2CPU(void* input, float min, float max); - - void* tensorTanhCPU(void* input); - - void* tensorSoftmaxCPU(void* input); - -} +void *tensorRelu2CPU(void *input, float min, float max); +void *tensorTanhCPU(void *input); -/* -void dummyFunction(){ - - void* initRT = (void*) &llvm_hpvm_initTensorRt; - void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt; - void* request_tensorPtr = (void*) &hpvm_request_tensor; - void* startProf = (void*) &startProfiling; - void* stopProf = (void*) &stopProfiling; - void* create2Dptr = (void*) &create2DTensor; - void* create3Dptr = (void*) &create3DTensor; - void* create4Dptr = (void*) &create4DTensor; - void* initTensorPtr = (void*) &initTensorData; - void* tensorSplitPtr = (void*) &tensorSplit; - void* tensorConcatPtr = (void*) &tensorConcat; - void* tensorConvPtr = (void*) &tensorConvolution; - void* tensorHConvPtr = (void*) &tensorHalfConvolution; - void* tensorPoolPtr = (void*) &tensorPooling; - void* tensorHalfPoolPtr = (void*) &tensorHalfPooling; - void* tensorLRNPtr = (void*) &tensorLRN; - void* tensorGemmPr = (void*) &tensorGemm; - void* tensorGemmCPUPtr = (void*) &tensorGemmCPU; - void* tensorGemmGPUPtr = (void*) 
&tensorGemmGPU; - void* tensorHgemmPtr = (void*) &tensorHalfGemm; - void* tensorGemmBiasPtr = (void*) &tensorGemmBias; - void* tensorAddPtr = (void*) &tensorAdd; - void* tensorHalfAddPtr = (void*) &tensorHalfAdd; - void* tensorReluPtr = (void*) &tensorRelu; - //FIXME: --void* tensorHalfReluPtr = (void*) &tensorHalfRelu; - void* tensorRelu2Ptr = (void*) &tensorRelu2; - void* tensorHalfRelu2Ptr = (void*) &tensorHalfRelu2; - void* tensorTanhPtr = (void*) &tensorTanh; - void* tensorHalfTanhPtr = (void*) &tensorHalfTanh; - void* tensorSoftmaxPtr = (void*) &tensorSoftmax; - void* tensorAddErrorPtr = (void*) &tensorAddError; +void *tensorSoftmaxCPU(void *input); } -*/ - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc index 0de2808221adfb122860a031eea4ed8c89d6e2ba..083d733b14d4f335f4365503e5ecdb5c3de0a2fb 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.cc @@ -69,37 +69,6 @@ void *tensorAdd(void *x, void *bias); void *tensorRelu(void *input); // NOTE: In-place operation void *tensorSoftmax(void *input); - -/* Error injection API - used for accuracy tuning */ -void *tensorAddError(void *x_ptr); -} - -void emptyFunction() { - - void *initRT = (void *)&llvm_hpvm_initTensorRt; - void *cleanRT = (void *)&llvm_hpvm_cleanupTensorRt; - void *request_tensorPtr = (void *)&hpvm_request_tensor; - void *startProf = (void *)&startProfiling; - void *stopProf = (void *)&stopProfiling; - void *create2Dptr = (void *)&create2DTensor; - void *create3Dptr = (void *)&create3DTensor; - void *create4Dptr = (void *)&create4DTensor; - void *initTensorPtr = (void *)&initTensorData; - void *tensorSplitPtr = (void *)&tensorSplit; - void *tensorConcatPtr = (void *)&tensorConcat; - void *tensorConvPtr = (void *)&tensorConvolution; - void *tensorHConvPtr = (void *)&tensorHConvolution; - 
void *tensorPoolPtr = (void *)&tensorPooling; - void *tensorLRNPtr = (void *)&tensorLRN; - void *tensorGemmPr = (void *)&tensorGemm; - void *tensorGemmCPUPtr = (void *)&tensorGemmCPU; - void *tensorGemmGPUPtr = (void *)&tensorGemmGPU; - void *tensorHgemmPtr = (void *)&tensorHgemm; - void *tensorGemmBiasPtr = (void *)&tensorGemmBias; - void *tensorAddPtr = (void *)&tensorAdd; - void *tensorReluPtr = (void *)&tensorRelu; - void *tensorSoftmaxPtr = (void *)&tensorSoftmax; - void *tensorAddErrorPtr = (void *)&tensorAddError; } #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h index b6d7f862fa60973e650c3b4306df61e89d28eb30..1b6e986a47324ab0ab663fc8e1e5171b07c135cf 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h @@ -12,8 +12,6 @@ #include <stdio.h> #include <string> -#include "img_tensor_runtime.h" - extern "C" { /**** Initialization Routine - Must be inserted at program start (in the * backend) ****/ @@ -111,13 +109,6 @@ void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr, void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, void *variance_ptr, double epsilon); -/* Error injection API - used for accuracy tuning */ -void *tensorAddError(void *x_ptr, int error_scale); - -void *tensorGemmModel(void *lhs, void *rhs); - -/*** Error Injection API End **/ - /**** PROMISE API *****/ /************* @@ -168,22 +159,14 @@ void *wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter, int activation_id, // Relu, Tanh, ClipRelu float out_min, float out_max); +void *wrapper_ConvLayer2( + const char *hpvm_node_id, void *input, void *filter, void *bias, + int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v, + int 
pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id, + // NOTE: out_min, out_max are only relevant for ClippedRelu + float out_min, float out_max); -void* wrapper_ConvLayer2(const char* hpvm_node_id, - void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, - int pool_size_v, int pool_size_h, - int pool_pad_v, int pool_pad_h, - int pool_stride_v, int pool_stride_h, - int activation_id, - // NOTE: out_min, out_max are only relevant for ClippedRelu - float out_min, float out_max); - - void *wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights, void *bias, int activation_id, float out_min, float out_max); @@ -213,11 +196,8 @@ void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr, void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr); - void *tensor_set_node_id(unsigned int node_id); - - - + // Utilities // TODO: separate utils in separate header void dumpAccuracyNorms(); diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc index 323adbac8940ed83c51d3729565c1bda3dbf35cc..a3853fda533aa4668963826eb646f009aae02695 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc @@ -1,14 +1,14 @@ -//===--------------------------- tensor_signatures.cc -----------------------===// +//===--------------------------- tensor_signatures.cc +//-----------------------===// // //===----------------------------------------------------------------------===// -// +// // This file contains the declarations of the API to the HPVM tensor runtime. // This is compiled to LLVM bitcode file that is loaded by HPVM passes when // tensor-based application are compiled through HPVM. 
// //===----------------------------------------------------------------------===// - #include "tensor_runtime.h" void dummyFunction() { @@ -51,7 +51,6 @@ void dummyFunction() { void *tensorHalfTanhPtr = (void *)&tensorHalfTanh; void *tensorSoftmaxPtr = (void *)&tensorSoftmax; void *tensorBatchNormPtr = (void *)&tensorBatchNorm; - void *tensorAddErrorPtr = (void *)&tensorAddError; void *ConvLayer = (void *)&ConvLayer_PROMISE; void *FCLayer = (void *)&FCLayer_PROMISE; @@ -67,14 +66,5 @@ void dummyFunction() { void *PoolingWrapper = (void *)&wrapper_tensorPooling; void *softmaxWrapper = (void *)&wrapper_tensorSoftmax; - void *tensorFft = (void *)&wrapper_tensorFft; - void *tensorReduce = (void *)&wrapper_tensorReduce; - void *tensorProjectiveT = (void *)&wrapper_tensorProjectiveT; - void *tensorMap1 = (void *)&wrapper_tensorMap1; - void *tensorMap2 = (void *)&wrapper_tensorMap2; - void *tensorMap3 = (void *)&wrapper_tensorMap3; - void *tensorStencil = (void *)&wrapper_tensorStencil; - void *tensorCosineT = (void *)&wrapper_tensorCosineT; - void *tensorNodeID = (void *)&tensor_set_node_id; } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc index b272bbcab45573f03ac17305f86a99e630db2950..a0ca6f5bb0632b592b6cc6b09c9cd6068319b954 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc @@ -27,17 +27,17 @@ PerfParamSet::PerfParamSet() { printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE); std::ifstream file(GLOBAL_KNOBS_FILE); - if (!file){ + if (!file) { ERROR(" Could NOT find global_knobs.txt \n"); } - + std::string line; std::string partial; std::vector<std::string> tokens; while (std::getline(file, line)) { // Read each line - //printf ("***** line === %s ", line); + // printf ("***** line === %s ", line); std::istringstream iss(line); std::string token; while 
(std::getline(iss, token, '\t')) { // Read each token in the line @@ -64,7 +64,7 @@ PerfParamSet::PerfParamSet() { std::getline(token_stream, tok, ','); int offset = atoi(tok.c_str()); - //printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob, + // printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob, // row, col, offset); PerfParams params(row, col, offset); perf_knob_map[knob] = params; @@ -101,10 +101,10 @@ SampParamSet::SampParamSet() { printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE); std::ifstream file(GLOBAL_KNOBS_FILE); - if (!file){ + if (!file) { ERROR("Could NOT find global_knobs.txt \n"); } - + std::string line; std::string partial; std::vector<std::string> tokens; @@ -124,7 +124,7 @@ SampParamSet::SampParamSet() { int index2 = token.find(","); std::string knob_str = token.substr(index2 + 1); int knob = atoi(knob_str.c_str()); - //printf("knob = %d \n", knob); + // printf("knob = %d \n", knob); std::getline(iss, token, '\t'); std::istringstream token_stream(token); @@ -140,7 +140,7 @@ SampParamSet::SampParamSet() { std::getline(token_stream, tok, ','); float interpolation_id = atof(tok.c_str()); - //printf("skip_every = %d, offset = %d \n", skip_every, offset); + // printf("skip_every = %d, offset = %d \n", skip_every, offset); SampParams params(skip_every, offset, interpolation_id); samp_knob_map[knob] = params; } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu index e9a4e50b000918c328a8b693f39c04505b6e4b79..8a8ff8435db96607917fc627036e72318409ef9b 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu @@ -1,14 +1,13 @@ //===--------------------------- approxs_simulator.cu ---------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the 
emulations of implementation of software -// approximations for tensor convolutions. The approximations implemented are -// feature sampling and perforation for FP32 and FP16 compute precisions. +// +// This file consists of the emulations of implementation of software +// approximations for tensor convolutions. The approximations implemented are +// feature sampling and perforation for FP32 and FP16 compute precisions. // //===----------------------------------------------------------------------===// - #ifndef SIM_HEADER #define SIM_HEADER @@ -27,7 +26,6 @@ #include "global_data.h" #include "approx_knob_utils.h" - #include <unordered_map> #include <sstream> #include <fstream> @@ -36,77 +34,67 @@ #include <map> #include <cassert> - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void postInterpolateRow(int N, int n, int c, int h, int w, - float* data, int int_row){ +// N is new_data's size +// n, c, h, w are the dimensions of new_data +__global__ void postInterpolateRow(int N, int n, int c, int h, int w, + float *data, int int_row) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - for(int i = index; i < N; i += stride){ + for (int i = index; i < N; i += stride) { int col = ((i % (c * h * w)) % (h * w)) % w; int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); int n = i / (c * h * w); - if((row % int_row == 1) && (row != 0) && (row != h-1)) + if ((row % int_row == 1) && (row != 0) && (row != h - 1)) data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] + - data[n * (c * h * w) + ch * (h * w) + (row + 1) * (w) + col]) / 2; - + (data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] + + data[n * (c * h * w) + ch * (h * w) + (row + 1) * (w) + col]) / + 2; } } - - -__global__ -void postInterpolateCol(int N, int n, int c, int h, int w, - float* data, int int_col){ +__global__ void 
postInterpolateCol(int N, int n, int c, int h, int w, + float *data, int int_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - for(int i = index; i < N; i += stride){ + for (int i = index; i < N; i += stride) { int col = ((i % (c * h * w)) % (h * w)) % w; int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); int n = i / (c * h * w); - if((col % int_col == 1) && (col != 0) && (col != w-1)) + if ((col % int_col == 1) && (col != 0) && (col != w - 1)) data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (data[n * (c * h * w) + ch * (h * w) + row * (w) + (col-1) ] + - data[n * (c * h * w) + ch * (h * w) + row * (w) + (col+1) ])/2; - + (data[n * (c * h * w) + ch * (h * w) + row * (w) + (col - 1)] + + data[n * (c * h * w) + ch * (h * w) + row * (w) + (col + 1)]) / + 2; } } - - - // A 'Simulation' of perforated tensor convolution -void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int row, int col){ - +void *tensorConvPerfSim(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col) { INFO("*** TensorConvolution \n"); profileEvent("tensorConv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; float alpha = 1.0f, beta = 0.0f; @@ -114,13 +102,13 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); - INFO("vertical_stride = %lu, 
horizontal_stride = %lu \n", - vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } @@ -130,134 +118,111 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - new_v, new_h, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - if(input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - else if(input->data_format == CUDNN_TENSOR_NHWC){ + Tensor *output; + if (input->data_format == CUDNN_TENSOR_NCHW) + output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*) 
create4DTensor((cudnnDataType_t) input->data_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else + output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } else ERROR("Unsupported Tensor Type"); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! 
\n"); - - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // FIXIT: Algo shouldn't be hardcoded convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, 
filter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); h = (2 * vertical_pad + input->dims.dim_sizes[2] - - filter->dims.dim_sizes[2]) / vertical_stride + 1; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - - filter->dims.dim_sizes[3]) / horizontal_stride + 1; + filter->dims.dim_sizes[2]) / + vertical_stride + + 1; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - + filter->dims.dim_sizes[3]) / + horizontal_stride + + 1; - int numBlocks = (n * c * h * w + 127) / 128; + int numBlocks = (n * c * h * w + 127) / 128; if (row > 0) - postInterpolateRow<<<numBlocks,128>>>(n * c * h * w, n, c, h, w, - (float *) output->gpu_data, row); + postInterpolateRow<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, row); if (col > 0) - postInterpolateCol<<<numBlocks,128>>>(n * c * h * w, n, c, h, w, - (float *) output->gpu_data, col); - + postInterpolateCol<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, col); profileEvent("tensorConv_end", true); return output; } - - - - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void sampleFilterElems(int N, - int n, int c, int h, int w, - float* data, - int skip_elem, int skip_offset, - float mul_factor, - float* newData){ +// N is new_data's size +// n, c, h, w are the dimensions of new_data +__global__ void sampleFilterElems(int N, int n, int c, int h, int w, + float *data, int skip_elem, int skip_offset, + float mul_factor, float *newData) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ + for (int i = index; i < N; i += stride) { int col = ((i % (c * h * w)) % (h * w)) % w; int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); @@ -265,75 +230,60 @@ void sampleFilterElems(int N, int local_index = (ch * (h * w)) + (row * w) + col; - if(skip_elem == 3 && h == 3 && w == 
3){ + if (skip_elem == 3 && h == 3 && w == 3) { skip_offset = (skip_offset + ch) % w; // wrap around skip offsets } - if(local_index % skip_elem == skip_offset) - newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0; + if (local_index % skip_elem == skip_offset) + newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0; else newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor; - + data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor; } } - - - - -void sampleFilter(Tensor* newFilter, Tensor* filter, - int skip_rate, int skip_offset){ +void sampleFilter(Tensor *newFilter, Tensor *filter, int skip_rate, + int skip_offset) { int n = filter->dims.dim_sizes[0]; int c = filter->dims.dim_sizes[1]; int h = filter->dims.dim_sizes[2]; int w = filter->dims.dim_sizes[3]; - - int numBlocks = (n * c * h * w + 127) / 128; - int N = n * c * h * w; - float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); + int numBlocks = (n * c * h * w + 127) / 128; + int N = n * c * h * w; - //float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); - //mul_factor = (mul_factor + 1.0) / 2; + float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); - - DEBUG ("mul_factor = %f \n", mul_factor); + // float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); + // mul_factor = (mul_factor + 1.0) / 2; - - sampleFilterElems<<<numBlocks,128>>>(N, - n, c, h, w, - (float *) filter->gpu_data, - skip_rate, skip_offset, mul_factor, - (float *) newFilter->gpu_data); + DEBUG("mul_factor = %f \n", mul_factor); + sampleFilterElems<<<numBlocks, 128>>>( + N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset, + mul_factor, (float *)newFilter->gpu_data); } - - // A 'Simulation' of perforated tensor convolution -void* tensorConvSampSim(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int skip_rate, 
int skip_offset){ - +void *tensorConvSampSim(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int skip_rate, int skip_offset) { INFO("*** TensorConvolution \n"); profileEvent("tensorConv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; float alpha = 1.0f, beta = 0.0f; @@ -344,24 +294,22 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr, convertToFP32(input); convertToFP32(filter); - Tensor* newFilter; - newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], - filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], - filter->dims.dim_sizes[3]); - + Tensor *newFilter; + newFilter = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], + filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], + filter->dims.dim_sizes[3]); // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling sampleFilter(newFilter, filter, skip_rate, skip_offset); - - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", - vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } @@ -371,147 +319,116 @@ void* tensorConvSampSim(void* 
input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - new_v, new_h, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - + Tensor *output; + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == 
NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! \n"); - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // NOTE: Using GEMM-based Algo convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, 
workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, newFilter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - - - freeTensor(newFilter); profileEvent("tensorConv_end", true); return output; } - - - - - - - - - -void sampleFilter2(Tensor* newFilter, Tensor* filter, - int skip_rate, int skip_offset, float interpolation_rate){ +void sampleFilter2(Tensor *newFilter, Tensor *filter, int skip_rate, + int skip_offset, float interpolation_rate) { int n = filter->dims.dim_sizes[0]; int c = filter->dims.dim_sizes[1]; int h = filter->dims.dim_sizes[2]; int w = filter->dims.dim_sizes[3]; - - int numBlocks = (n * c * h * w + 127) / 128; + + int numBlocks = (n * c * h * w + 127) / 128; int N = n * c * h * w; float mul_factor; mul_factor = (skip_rate * 1.0) / (skip_rate - 1); mul_factor = 1 + (interpolation_rate * (mul_factor - 1.0)); - DEBUG ("mul_factor = %f \n", mul_factor); - - sampleFilterElems<<<numBlocks,128>>>(N, - n, c, h, w, - (float *) filter->gpu_data, - skip_rate, skip_offset, mul_factor, - (float *) newFilter->gpu_data); -} - + DEBUG("mul_factor = %f \n", mul_factor); + sampleFilterElems<<<numBlocks, 128>>>( + N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset, + mul_factor, (float *)newFilter->gpu_data); +} // A 'Simulation' of perforated tensor convolution -void* tensorConvSampSim2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int skip_rate, int skip_offset, float interpolation_rate){ - +void *tensorConvSampSim2(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int skip_rate, int skip_offset, + float interpolation_rate) { INFO("*** 
TensorConvolution \n"); profileEvent("tensorConv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; float alpha = 1.0f, beta = 0.0f; @@ -522,24 +439,22 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr, convertToFP32(input); convertToFP32(filter); - Tensor* newFilter; - newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], - filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], - filter->dims.dim_sizes[3]); - + Tensor *newFilter; + newFilter = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], + filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], + filter->dims.dim_sizes[3]); // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling sampleFilter2(newFilter, filter, skip_rate, skip_offset, interpolation_rate); - - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", - vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } @@ -549,166 +464,135 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, 
horizontal_pad, // conv padding - new_v, new_h, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - + Tensor *output; + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! 
\n"); - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // NOTE: Using GEMM-based Algo convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, 
newFilter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - - - freeTensor(newFilter); profileEvent("tensorConv_end", true); return output; } +/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ +void *PROMISE_Conv(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + Tensor *input_t = (Tensor *)input; + Tensor *filter_t = (Tensor *)filter; + Tensor *bias_t = (Tensor *)bias; - - - - - -/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ - - -void* PROMISE_Conv(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - Tensor* input_t = (Tensor*) input; - Tensor* filter_t = (Tensor*) filter; - Tensor* bias_t = (Tensor*) bias; - int orig_type = input_t->cur_type; DEBUG("FP32 conversions \n"); - + convertToFP32(input_t); convertToFP32(filter_t); convertToFP32(bias_t); DEBUG("DONE FP32 conversions \n"); - - if(swing < 8){ + if (swing < 8) { input = quantizeTensorPromise(input, i_min, i_max); filter = quantizeTensorPromise(filter, w_min, w_max); - if(bias != NULL) + if (bias != NULL) bias = quantizeTensorPromise(bias, b_min, b_max); // aRead error - + input = addPromiseError(input, swing); } - - void* conv_out; - conv_out = tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); - - void* conv_add; - if(bias != NULL){ + void *conv_out; + conv_out = tensorConvolution(input, filter, conv_pad_h, 
conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); + + void *conv_add; + if (bias != NULL) { conv_add = tensorAdd(conv_out, bias); - } - else{ + } else { conv_add = conv_out; } - void* pool_out; + void *pool_out; // NOTE: Skip pooling on negative pool sizes - if(pool_size > 0){ - //FIXME: Currently only using MaxPooling - //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size); - pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_stride, pool_stride); - } - else{ + if (pool_size > 0) { + // FIXME: Currently only using MaxPooling + //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, + // pool_size, pool_size); + pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, + pool_stride, pool_stride); + } else { pool_out = conv_add; } - - void* activation_out; - switch(activation_id){ + + void *activation_out; + switch (activation_id) { case -1: activation_out = pool_out; INFO("NO Activation Function \n"); @@ -727,68 +611,54 @@ void* PROMISE_Conv(void* input, float i_min, float i_max, break; } - - if(swing < 8 && activation_id != -1){ + if (swing < 8 && activation_id != -1) { activation_out = quantizeTensorPromise(activation_out, out_min, out_max); } - - - //NOTE: Convert back to FP16 if original type - if (orig_type == half_type){ - convertToFP16((Tensor*) activation_out); + // NOTE: Convert back to FP16 if original type + if (orig_type == half_type) { + convertToFP16((Tensor *)activation_out); } - return activation_out; } +void *PROMISE_FC(void *input, float i_min, float i_max, void *weights, + float w_min, float w_max, void *bias, float b_min, float b_max, + int activation_id, float out_min, float out_max, int swing) { + Tensor *input_t = (Tensor *)input; + Tensor *weights_t = (Tensor *)weights; + Tensor *bias_t = (Tensor *)bias; -void* PROMISE_FC(void* input, float i_min, float i_max, - void* weights, float w_min, float w_max, - void* bias, float b_min, float b_max, - int 
activation_id, - float out_min, float out_max, int swing){ - - - Tensor* input_t = (Tensor*) input; - Tensor* weights_t = (Tensor*) weights; - Tensor* bias_t = (Tensor*) bias; - int orig_type = input_t->cur_type; - + convertToFP32(input_t); convertToFP32(weights_t); convertToFP32(bias_t); - - - if(swing < 8){ + + if (swing < 8) { input = quantizeTensorPromise(input, i_min, i_max); weights = quantizeTensorPromise(weights, w_min, w_max); - if(bias != NULL) + if (bias != NULL) bias = quantizeTensorPromise(bias, b_min, b_max); // NOTE: Modelling aRead error in PROMISE input = addPromiseError(input, swing); } - - - void* gemm_out; + void *gemm_out; gemm_out = tensorGemmGPU(input, weights); - - void* gemmbias_out; - if(bias != NULL){ + void *gemmbias_out; + if (bias != NULL) { gemmbias_out = tensorAdd(gemm_out, bias); - } - else{ + } else { gemmbias_out = gemm_out; } - - void* activation_out; - switch(activation_id){ + + void *activation_out; + switch (activation_id) { case -1: activation_out = gemmbias_out; @@ -807,86 +677,71 @@ void* PROMISE_FC(void* input, float i_min, float i_max, ERROR("Activation id %d NOT supported \n", activation_out); break; } - - - if(swing < 8 && activation_id != -1){ + + if (swing < 8 && activation_id != -1) { activation_out = quantizeTensorPromise(activation_out, out_min, out_max); } - - //NOTE: Convert back to FP16 if original type - if (orig_type == half_type){ - convertToFP16((Tensor*) activation_out); + // NOTE: Convert back to FP16 if original type + if (orig_type == half_type) { + convertToFP16((Tensor *)activation_out); } - - return activation_out; } - - - - -// NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper +// NOTE: Enabling the macro below is used for testing against the old PROMISE +// wrapper //#define OLD_MODEL #ifndef OLD_MODEL +bool isPromiseLayer(int swing) { - -bool isPromiseLayer(int swing){ - - if(swing < 8) + if (swing < 8) return true; else - return false; + return false; } +bool 
isGPULayer(int swing) { -bool isGPULayer(int swing){ - - if(swing > 10 ) // PROMISE layers are 1-7 + if (swing > 10) // PROMISE layers are 1-7 return true; else - return false; + return false; } +bool isFullPrecision(int swing) { -bool isFullPrecision(int swing){ - - if(swing == 11) + if (swing == 11) return true; else - return false; + return false; } +bool isHalfPrecision(int swing) { - -bool isHalfPrecision(int swing){ - - if(swing == 12) + if (swing == 12) return true; else - return false; + return false; } +bool isPerforation(int swing) { -bool isPerforation(int swing){ - - if(swing >= 100 && swing <= 200) + if (swing >= 100 && swing <= 200) return true; else - return false; + return false; } +bool isSampling(int swing) { -bool isSampling(int swing){ - - if(swing >= 200 && swing <= 300) + if (swing >= 200 && swing <= 300) return true; else - return false; + return false; } bool isReductionSampling(int swing) { @@ -894,300 +749,227 @@ bool isReductionSampling(int swing) { if (swing >= 41 && swing <= 49) return true; else - return false; + return false; } -int getSwing(int swing){ +int getSwing(int swing) { - #ifdef PROMISE_TUNER_ENABLED +#ifdef PROMISE_TUNER_ENABLED // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime - if(!approxhpvm_runtime_mode){ - - if(op_counter >= total_ops){ + if (!approxhpvm_runtime_mode) { + + if (op_counter >= total_ops) { ERROR("No accuracy flag found \n"); } - + swing = op_accuracies[op_counter]; op_counter++; } - #endif +#endif - DEBUG("---- swing_value = %d \n", swing); + DEBUG("---- swing_value = %d \n", swing); - return swing; + return swing; } - - - -//bool FP16_tuning = false; - +// bool FP16_tuning = false; /***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */ - - -void initializeAutotuner(){ +void initializeAutotuner() { DEBUG("initializing tuner .... 
\n"); - + sampParamSet = new SampParamSet; - perfParamSet = new PerfParamSet; + perfParamSet = new PerfParamSet; } +void *Autotuner_SampConv(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { -void* Autotuner_SampConv(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ + SampParams params = sampParamSet->getSampParams(swing); + DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", params.skip_rate, + params.skip_offset); + + void *conv_out; + + if (!FP16_tuning) { - SampParams params = sampParamSet->getSampParams(swing); - - DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", - params.skip_rate, params.skip_offset); - - void* conv_out; - - if (!FP16_tuning){ - /* conv_out = tensorConvSampSim(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - params.skip_rate, params.skip_offset); + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.skip_rate, params.skip_offset); */ - - if (SIMULATION_MODE){ - conv_out = tensorConvSampSim2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - params.skip_rate, params.skip_offset, params.interpolation_id); + if (SIMULATION_MODE) { + conv_out = tensorConvSampSim2( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.skip_rate, params.skip_offset, params.interpolation_id); } - else { - conv_out = tensorConvApprox(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - 1, 1, 
params.skip_rate, params.skip_offset); + conv_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); } - - - } - else{ - - conv_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, - params.skip_rate, params.skip_offset); - + + } else { + + conv_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); } return conv_out; } - - - -void* Autotuner_PerforatedConv(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - +void *Autotuner_PerforatedConv(void *input, float i_min, float i_max, + void *filter, float w_min, float w_max, + void *bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { PerfParams params = perfParamSet->getPerfParams(swing); - + DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n", - params.row, params.col, params.skip_offset); - + params.row, params.col, params.skip_offset); - void* conv_out; - - if (!FP16_tuning){ + void *conv_out; + if (!FP16_tuning) { - if (SIMULATION_MODE){ + if (SIMULATION_MODE) { - conv_out = tensorConvPerfCuda(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - params.row, params.col, params.skip_offset); + conv_out = tensorConvPerfCuda(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.row, params.col, params.skip_offset); + } else { + + conv_out = tensorConvApprox( + input, 
filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.row, params.col, 1, params.skip_offset); } - else{ - - conv_out = tensorConvApprox(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, - 1, params.skip_offset); - } - - - } - else{ - conv_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, - 1, params.skip_offset); + } else { + conv_out = tensorConvApproxHalf2( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, + 1, params.row, params.col, 1, params.skip_offset); } - - return conv_out; -} - - - + return conv_out; +} +void *Autotuner_ConvOp(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { -void* Autotuner_ConvOp(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ + void *conv_out; + if (isPerforation(swing)) { - - void* conv_out; - if(isPerforation(swing)){ + conv_out = Autotuner_PerforatedConv( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, + conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id, + pool_size, activation_id, out_min, out_max, swing); - conv_out = Autotuner_PerforatedConv(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, - activation_id, - out_min, out_max, swing); - } - else if(isSampling(swing)){ + else if (isSampling(swing)) { 
- conv_out = Autotuner_SampConv(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, - activation_id, - out_min, out_max, swing); + conv_out = Autotuner_SampConv( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, + conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id, + pool_size, activation_id, out_min, out_max, swing); } - - else if (isHalfPrecision(swing)){ + else if (isHalfPrecision(swing)) { - if (FP16_tuning){ - - conv_out = tensorHalfConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); - } - else{ - conv_out = tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); + if (FP16_tuning) { + + conv_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); + } else { + conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); } - - } - else if (isFullPrecision(swing)){ - conv_out = tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); } + else if (isFullPrecision(swing)) { + conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); + } - return conv_out; + return conv_out; } +void *Autotuner_Add(void *input, void *bias, int swing) { + void *conv_add; + if (bias != NULL) { -void* Autotuner_Add(void* input, void* bias, int swing){ - - void* conv_add; - if(bias != NULL){ - - if( isFullPrecision(swing) || !(FP16_tuning) ){ + if (isFullPrecision(swing) || !(FP16_tuning)) { conv_add = tensorAdd(input, bias); - } - else { + } else { conv_add = tensorHalfAdd(input, bias); } - } - else{ + } else { conv_add = input; } return conv_add; } +void *Autotuner_Pooling(void *input, int pool_size, int pool_stride, + int swing) { + void *pool_out; -void* Autotuner_Pooling(void* input, - int 
pool_size, int pool_stride, - int swing){ + if (pool_size > 0) { - void* pool_out; - - if(pool_size > 0){ - - //FIXME: Currently only using MaxPooling - if( isFullPrecision(swing) || !(FP16_tuning) ){ - pool_out = tensorPooling(input, 0, pool_size, pool_size, - 0, 0, pool_stride, pool_stride); + // FIXME: Currently only using MaxPooling + if (isFullPrecision(swing) || !(FP16_tuning)) { + pool_out = tensorPooling(input, 0, pool_size, pool_size, 0, 0, + pool_stride, pool_stride); } - + else { - pool_out = tensorHalfPooling(input, 0, pool_size, pool_size, - 0, 0, pool_stride, pool_stride); + pool_out = tensorHalfPooling(input, 0, pool_size, pool_size, 0, 0, + pool_stride, pool_stride); } - - - } - else{ + + } else { pool_out = input; } - - + return pool_out; } +void *Autotuner_Activation(void *input, int activation_id, int out_min, + int out_max, int swing) { + void *activation_out; + if (isFullPrecision(swing) || (!FP16_tuning)) { -void* Autotuner_Activation(void* input, int activation_id, - int out_min, int out_max, int swing){ - - void* activation_out; - - if ( isFullPrecision(swing) || (!FP16_tuning) ){ - - switch(activation_id){ + switch (activation_id) { case -1: activation_out = input; INFO("NO Activation Function \n"); @@ -1206,10 +988,10 @@ void* Autotuner_Activation(void* input, int activation_id, break; } } - - else{ - switch(activation_id){ + else { + + switch (activation_id) { case -1: activation_out = input; INFO("NO Activation Function \n"); @@ -1227,310 +1009,116 @@ void* Autotuner_Activation(void* input, int activation_id, ERROR("Activation id %d NOT supported \n", activation_out); break; } - } - return activation_out; } +void *Autotuner_GPU_ConvLayer(void *input, float i_min, float i_max, + void *filter, float w_min, float w_max, + void *bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, int conv_stride_h, + int conv_stride_w, int pool_id, int pool_size, + int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, 
float out_max, int swing) { -void* autotuner_tensorFft(void *input, bool inverse) { - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled\n"); - abort(); - } - - int swing = 0; - swing = getSwing(swing); - - if (isFullPrecision(swing)) { - return tensorFft(input, inverse); - } - if (isHalfPrecision(swing)) { - return tensorFftHalf(input, inverse); - } + void *conv_out = Autotuner_ConvOp( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, + activation_id, out_min, out_max, swing); - ERROR("Unsupported autotuner flag for operation fft\n"); - abort(); - return NULL; -} + void *conv_add = Autotuner_Add(conv_out, bias, swing); + void *pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing); -void* autotuner_tensorReduce(void *input, size_t axis, MathOp func) { - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled\n"); - abort(); - } - - int swing = 0; - swing = getSwing(swing); - - if (isFullPrecision(swing)) { - return tensorReduce(input, axis, func, 0.0f); - } + void *activation_out = + Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing); - if (isHalfPrecision(swing)) { - return tensorReduceHalf(input, axis, func, 0.0f); - } - - if (isReductionSampling(swing)) { - RedSampParams params = getRedSampParams(swing); - DEBUG("params.skip_ratio = %f, params.is_half = %d\n", params.skip_ratio, (int)params.is_half); - if (params.is_half) - return tensorReduceHalf(input, axis, func, params.skip_ratio); - else - return tensorReduce(input, axis, func, params.skip_ratio); - } - - ERROR("Unsupported autotuner flag for operation reduce\n"); - abort(); - return NULL; -} - -void* autotuner_tensorProjectiveT(void *input, void *transformation) { - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled\n"); - abort(); - } - - int swing = 0; - swing = getSwing(swing); - - if (isFullPrecision(swing)) { - return 
tensorProjectiveT(input, transformation); - } - - ERROR("Unsupported autotuner flag for operation projectiveT\n"); - abort(); - return NULL; -} - - -void* autotuner_tensorMap1(MathOp func, void *input) { - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled\n"); - abort(); - } - - int swing = 0; - swing = getSwing(swing); - - if (isFullPrecision(swing)) { - return tensorMap1(func, input); - } - - if (isHalfPrecision(swing)) { - return tensorMap1Half(func, input); - } - - ERROR("Unsupported autotuner flag for operation map1\n"); - abort(); - return NULL; -} - -void* autotuner_tensorMap2(MathOp func, void *input1, void *input2) { - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled\n"); - abort(); - } - - int swing = 0; - swing = getSwing(swing); - - if (isFullPrecision(swing)) { - return tensorMap2(func, input1, input2); - } - - if (isHalfPrecision(swing)) { - return tensorMap2Half(func, input1, input2); - } - - ERROR("Unsupported autotuner flag for operation map2\n"); - abort(); - return NULL; -} - -void* autotuner_tensorMap3(MathOp func, void *input1, void *input2, - void *input3) { - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled\n"); - abort(); - } - - int swing = 0; - swing = getSwing(swing); - - if (isFullPrecision(swing)) { - return tensorMap3(func, input1, input2, input3); - } - - if (isHalfPrecision(swing)) { - return tensorMap3Half(func, input1, input2, input3); - } - - ERROR("Unsupported autotuner flag for operation map3\n"); - abort(); - return NULL; -} - - - - -void* Autotuner_GPU_ConvLayer(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - void* conv_out = Autotuner_ConvOp(input, i_min, i_max, - filter, w_min, w_max, - bias, 
b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, - activation_id, - out_min, out_max, swing); - - - void* conv_add = Autotuner_Add(conv_out, bias, swing); - - void* pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing); - - void* activation_out = Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing); - - - return activation_out; + return activation_out; } - /**** Top-level API for Handling Convolution Layers The granularity of handling is at a layer-level - not tensor-op level - + ***/ -void* Autotuner_ConvLayer(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - if(FP16_tuning){ - if(ONLINE_PROFILING){ +void *Autotuner_ConvLayer(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + if (FP16_tuning) { + if (ONLINE_PROFILING) { ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n"); } } - swing = getSwing(swing); - - if(isPromiseLayer(swing)){ - - return PROMISE_Conv(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_stride, - activation_id, - out_min, out_max, swing); + swing = getSwing(swing); + + if (isPromiseLayer(swing)) { + + return PROMISE_Conv(input, i_min, i_max, filter, w_min, w_max, bias, b_min, + b_max, conv_pad_h, conv_pad_w, conv_stride_h, + conv_stride_w, pool_id, pool_size, pool_stride, + activation_id, 
out_min, out_max, swing); } assert(isGPULayer(swing)); - return Autotuner_GPU_ConvLayer(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_stride, - activation_id, - out_min, out_max, swing); - + return Autotuner_GPU_ConvLayer( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride, + activation_id, out_min, out_max, swing); } - - - - /**** Top-level API Unchanged for backwards compatibility ***/ -void* ConvLayer_PROMISE(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - return Autotuner_ConvLayer(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_size, // FIXIT: Assumption pool_size == pool_strides - activation_id, - out_min, out_max, swing); - - +void *ConvLayer_PROMISE(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + return Autotuner_ConvLayer( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, + pool_size, // FIXIT: Assumption pool_size == pool_strides + activation_id, out_min, out_max, swing); } - - - -void* ConvLayer_PROMISE2(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int 
conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - return Autotuner_ConvLayer(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_stride, - activation_id, - out_min, out_max, swing); - - +void *ConvLayer_PROMISE2(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + return Autotuner_ConvLayer( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride, + activation_id, out_min, out_max, swing); } +void * +FCLayer_PROMISE(void *input, float i_min, float i_max, void *weights, + float w_min, float w_max, void *bias, float b_min, float b_max, + int activation_id, float out_min, float out_max, + int swing) { // NOTE: min_val, max_val apply to 'ClippedRelu' + swing = getSwing(swing); + if (isPromiseLayer(swing)) { - - - -void* FCLayer_PROMISE(void* input, float i_min, float i_max, - void* weights, float w_min, float w_max, - void* bias, float b_min, float b_max, - int activation_id, - float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu' - - - swing = getSwing(swing); - - if(isPromiseLayer(swing)){ - - return PROMISE_FC(input, i_min, i_max, - weights, w_min, w_max, - bias, b_min, b_max, - activation_id, - out_min, out_max, swing); + return PROMISE_FC(input, i_min, i_max, weights, w_min, w_max, bias, b_min, + b_max, activation_id, out_min, out_max, swing); } assert(isGPULayer(swing)); @@ -1576,18 +1164,12 @@ void* 
FCLayer_PROMISE(void* input, float i_min, float i_max, } return activation_out; - } #endif - - #ifdef OLD_MODEL #endif -#endif - - - +#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu index c1848f126750808a9438a4d2cf7729d1bf420fd1..b97e5beadb7822cce12bdf2ee4d16407cd0483c4 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu @@ -1,13 +1,12 @@ //===--------------------------- approxtechniques.cu ---------------------===// // //===----------------------------------------------------------------------===// -// +// // This file consists of the custom implementation of software approximations // for tensor convolutions. The approximations implemented are feature sampling -// and perforation for FP32 and FP16 compute precisions. +// and perforation for FP32 and FP16 compute precisions. 
// //===----------------------------------------------------------------------===// - #include "tensor_utils.h" #include "approx_utils.h" @@ -17,406 +16,465 @@ #include "fp16_conversion.h" #include "profiling.h" -extern "C"{ - -__global__ void convToGemm(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +extern "C" { + +__global__ void convToGemm(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW 
+ j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmFullInput(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_ - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen - if(filter_elem_num % skip_every != skip_every-1-skip_offset) { - int output_col = filter_elem_num - - ((filter_elem_num + skip_every)/skip_every); - if(skip_every == 1) output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void convToGemmFullInput( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number)_ + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter elemen + if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) { + int output_col = + filter_elem_num - ((filter_elem_num + skip_every) / skip_every); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; } + } + } + } } -__global__ void convToGemmHalfInputNew(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_offset) { - int output_col = filter_elem_num - - (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - if(skip_every == 1) output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void +convToGemmHalfInputNew(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (filter_elem_num % skip_every != skip_offset) { + int output_col = + filter_elem_num - (filter_elem_num / skip_every + + (filter_elem_num % skip_every > skip_offset)); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; + } } + } + } } - -__global__ -void convToGemmHalf(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, - const int V_pad, const int H_pad, - const int H_out, const int W_out, - const int V_stride, const int H_stride){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i - const int n = tx / (C * H_out * W_out); //output image numbe - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe - const int h = tx % (H_out * W_out) / W_out; //output height index (row number - const int w = tx % W_out; //output width index (col number - const int inH = h * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - } else { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; - } - } +__global__ void convToGemmHalf(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread i + const int n = tx / (C * H_out * W_out); // output image numbe + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan numbe + const int h = tx % (H_out * W_out) / W_out; // output height index (row number + const int w = tx % W_out; // output width index (col number + const int inH = h * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + } else { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + const int out_index = + ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; + //((output_col*N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - - const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + + const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } - - -__global__ void convToGemmHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmHalf2(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride, + const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); 
// output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmPerfRow(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void +convToGemmPerfRow(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int 
V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; + const int inW = w * H_stride - H_pad; // input width index (col number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } } } } -__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w, - float *old_data, float *new_data, int x, int start){ - 
- const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - w]) / 2; - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + 
col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - w]) / 2; + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfCol(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void +convToGemmPerfCol(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int W_eff) { + + const 
int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = 0; } } } } -__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w, - float *old_data, float *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * 
h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - 1]) / 2; - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h 
* w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - 1]) / 2; + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmPerfRowHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // 
thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inW = w * H_stride - H_pad; // input width index (col number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else output[out_index] = 0; } @@ -424,844 +482,903 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, } } -__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - 
const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image numbe - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - int h_index; - if(h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w; - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - - } - } - +__global__ void convToGemmPerfRowHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image numbe + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + int h_index; + if (h < start) { + h_index = h; + } else { + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start 
+ 1) * x) % (x - 1) > 0) + start - 1; + } + const int inH = h_index * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((filter_elem_num * N + n) * H_eff + h) * W_out + w; + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } } + } } -__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * 
(old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - - const int ch = tx % 
(c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - const int row_index = row - ((row + 1 - start) / x); - const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * 
old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + const int row_index = row - ((row + 1 - start) / x); + const int output_index = + ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + const int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + const int output_index = + ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } - -__global__ void convToGemmPerfColHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void convToGemmPerfColHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const 
int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inH = h * V_stride - V_pad; // input height index (row number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else output[out_index] = 0; - } } } } -__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; 
//thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) - int w_index; - if(w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen - const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmPerfColHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) + int w_index; + if (w < start) { + w_index = w; + } else { + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + } + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; 
// input height index (row number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter elemen + const int out_index = + ((filter_elem_num * N + n) * H_out + h) * W_eff + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } } + } } - -__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
old_data[output_index]; - } - } +__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; + } + } } -__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; 
//output height index (row number) - const int col = tx % w; //output width index (col number) - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col]; - - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; - - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; - - } else if((col - start) % x == 0) { - const int col_index = col - ((col + 1 - start) / x); - const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col]; + + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + + 
old_w - 1]; + + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; + + } else if ((col - start) % x == 0) { + const int col_index = col - ((col + 1 - start) / x); + const int output_index = + ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + const int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + const int output_index = + ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } +__global__ void +convToGemmFullInputRegular(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int ch = (fi * C) / reduced_filter_elem; + const int offset = (skip_offset + ch) % skip_every; + int in_index; + if (fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + 
offset - 1; + } -__global__ void convToGemmFullInputRegular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - int in_index; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmFullInputIrregular(float * const 
__restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - int in_index; - if(fi < skip_offset) { - in_index = fi; - } else { - in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmFullInputIrregular( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < 
N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + int in_index; + if (fi < skip_offset) { + in_index = fi; + } else { + in_index = + ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1; + } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } - - + } } -__global__ void createReducedFiltersFullRegular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter +__global__ void createReducedFiltersFullRegular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter const int ch = (offset 
* channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; - int in_index; - if(offset < channel_offset) { - in_index = offset; - } - else { - in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; - } - - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; + int in_index; + if (offset < channel_offset) { + in_index = offset; + } else { + in_index = + ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1; + } + + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; } } -__global__ void createReducedFiltersFullIrregular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter - int in_index; - if(offset < skip_offset) { - in_index = offset; - } else { - in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; +__global__ void createReducedFiltersFullIrregular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter 
index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter + int in_index; + if (offset < skip_offset) { + in_index = offset; + } else { + in_index = + ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1; } + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; + } } -__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - - const bool condition = (fi < offset); - const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j 
< W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } +__global__ void +convToGemmHalfInputRegular(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + const int fi = ch * (reduced_filter_elem / C) + ki; + const int offset = (skip_offset + ch) % skip_every; + + const bool condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int 
W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - const int condition = (fi < offset); - const int in_index = condition * fi + (! 
condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } - else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputRegular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + + const int fi = ch * (reduced_filter_elem / C) + ki; + const int offset = (skip_offset + ch) % skip_every; + const int condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && 
inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (! 
condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } - else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] 
= + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, 
+ const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } +__global__ void createReducedFiltersHalfRegular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { -__global__ void createReducedFiltersHalfRegular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter + const 
int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; const int condition = (offset < channel_offset); - const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1); - - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); - } - + const int in_index = + condition * offset + + (!condition) * + (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1); + + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + } } -__global__ void createReducedFiltersHalfIrregular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { +__global__ void createReducedFiltersHalfIrregular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - - if(fIdx < NF) { + if (fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter + const int offset = tx % reduced_filter_elem; // offset within filter const int condition = (offset < 
skip_offset); - - int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); - } - -} + int in_index = + condition * offset + + (!condition) * + (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + skip_offset - 1); + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + } +} -//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols -__global__ void convToGemmApprox(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element? 
- const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0; - } +// produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem +// cols +__global__ void +convToGemmApprox(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (filter_elem_num % skip_every != + skip_every - 1) { // are we including this filter element? 
+ const int output_col = + filter_elem_num - + (filter_elem_num / + skip_every); // cal output column, taking skipping into account + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * reduced_filter_elem + output_col) * H_out + h) * + W_out + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * reduced_filter_elem + output_col) * H_out + h) * + W_out + + w] = 0; + } } } } } - /// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is FP32. -/// This routine is invoked by the tuner for tuning approximations for convolutions. +/// with the perforation and filter sampling support. The compute precison is +/// FP32. This routine is invoked by the tuner for tuning approximations for +/// convolutions. /// -void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int start){ - - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty +void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int start) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } - - Tensor* output; + + Tensor *output; // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); convertToFP32(input); convertToFP32(filter); - + long int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; - c = filter->dims.dim_sizes[0]; //number of filters + c = filter->dims.dim_sizes[0]; // number of filters const int KH = 
filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; int rem_row = (h - start) % row > 0; int h_eff = h - ((h - start) / row) - rem_row; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; int rem_col = (w - start) % col > 0; int w_eff = w - ((w - start) / col) - rem_col; - Tensor* new_output; - if(row > 1){ - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *new_output; + if (row > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float* convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, start, h_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, 
vertical_stride, horizontal_stride, + row, start, h_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, - num_filter_elem * h_eff * w, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h_eff * w, c * h_eff * w, - n)); - - new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateRow<<<numBlocks, 128>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } - else if(col > 1){ - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + } else if (col > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); const 
long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, start, w_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, start, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, - h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h * w_eff, c * h * w_eff, - n)); - - new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device 
changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateCol<<<numBlocks, 128>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else { - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + } else { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - convToGemmApprox<<<gridSize, blockSize>>>(convData, - (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - num_filter_elem, c * h * w); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + convToGemmApprox<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, 
horizontal_stride, + num_filter_elem, c * h * w); checkCudaErrors(cudaDeviceSynchronize()); - //Do the matrix multiplication - //Want to multiply convData by filter->gpu_data[f * chan * KH * KW] - + // Do the matrix multiplication + // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem, + &alpha, convData, h * w, num_filter_elem * h * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w, c * h * w, n)); new_output = output; cudaFree(convData); } - //Event("Conv_end"); //, true); + // Event("Conv_end"); //, true); return new_output; } -__global__ -void switchMatrixFull(int N, int n, int c, int h, int w, - float *old_data, float *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixFull(int N, int n, int c, int h, int w, + float *old_data, float *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); + + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} /// This function serves as an API with the custom implementation of 
convolution -/// with the perforation and filter sampling support. The compute precison is FP32. +/// with the perforation and filter sampling support. The compute precison is +/// FP32. /// -void* tensorConvApprox(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset){ +void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int skip_every, int offset) { //////INFO("*** TensorConvolution approximation \n"); - //Event("Conv"); + // Event("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } @@ -1275,15 +1392,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////Event("H2F_end"); const int n = input->dims.dim_sizes[0]; - const int c = filter->dims.dim_sizes[0]; //number of filters + const int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor 
*)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); ////INFO("batch: %d\n", n); @@ -1296,619 +1416,572 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////INFO("horizontal_stride: %d\n", horizontal_stride); ////INFO("output height: %d\n", h); ////INFO("output width: %d\n", w); - if(row > 1) { + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w)); - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, offset, h_eff); + ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h_eff * w)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, 
horizontal_pad, h, w, vertical_stride, horizontal_stride, + row, offset, h_eff); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, - n)); - //interpolate + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateRow<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; 
checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff)); - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, offset, w_eff); + ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w_eff)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, offset, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w_eff, c * h * w_eff, - n)); - - //interpolate + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, offset); + int 
numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateCol<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(skip_every > 1) { - //reduced number after skipping + } else if (skip_every > 1) { + // reduced number after skipping const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - float* convData; + float *convData; size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - float* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); - + float *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); + const int filtBlockSize = 128; ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); //////INFO("fac: %f\n", fac); const int blockSize = 128; - //////INFO("n * h * w : %d\n", (n * h * w )); - const int gridSize = (n * h * w + blockSize - 1) / blockSize; - if(!(KH * KW % skip_every)) { - // ////INFO("REGULAR FILTERING\n"); - createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - 
checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + //////INFO("n * h * w : %d\n", (n * h * w )); + const int gridSize = (n * h * w + blockSize - 1) / blockSize; + if (!(KH * KW % skip_every)) { + // ////INFO("REGULAR FILTERING\n"); + createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputRegular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // ////INFO("IRREGULAR FILTERING\n"); - createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + // ////INFO("IRREGULAR FILTERING\n"); + createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputIrregular<<<gridSize, blockSize>>>( + 
convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } checkCudaErrors(cudaDeviceSynchronize()); - + const float alpha = 1.0; const float beta = 0.0; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - &alpha, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter, + reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w, + c * h * w, n)); cudaFree(convData); cudaFree(reducedFilter); } else { - //INFO("FP32 BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + // INFO("FP32 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w)); - convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - skip_every, offset);//num_filter_elem); + const int gridSize = + (n * 
input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w)); + convToGemmFullInput<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + skip_every, offset); // num_filter_elem); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - /* - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); - */ - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - &alpha, - convData, - CUDA_R_32F, n * h * w, - (float *) filter->gpu_data, CUDA_R_32F, - num_filter_elem, - &beta, - (float *) output->gpu_data, - CUDA_R_32F, n * h * w, - CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data); - + + float alpha = 1.0f, beta = 0.0f; + /* + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h * w, c, num_filter_elem, + &alpha, + convData, h * w, num_filter_elem * h + * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float + *)new_output->gpu_data, h * w, c * h * w, n)); + */ + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data, + CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data, + CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int 
numBlocks = (n * c * h * w + 255) / 256; + switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, + (float *)new_output->gpu_data); + checkCudaErrors(cudaDeviceSynchronize()); cudaFree(convData); } - //Event("Conv_end"); + // Event("Conv_end"); return new_output; } -__global__ -void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixHalf(int N, int n, int c, int h, int w, + __half *old_data, __half *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); + + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} -/// This function serves as an API to custom implementation of the +/// This function serves as an API to custom implementation of the /// half-precision convolution with the perforation and filter sampling -/// support. +/// support. 
/// -void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset) { - - //INFO("*** TensorConvolution half approximation \n"); - // profileEvent("#Conv"); - - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty +void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups, int row, int col, int skip_every, + int offset) { + + // INFO("*** TensorConvolution half approximation \n"); + // profileEvent("#Conv"); + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } hostToDeviceCopy(input); hostToDeviceCopy(filter); - + profileEvent("F2H_start"); - convertToFP16(input); - convertToFP16(filter); + convertToFP16(input); + convertToFP16(filter); profileEvent("F2H_end"); - + const long int n = input->dims.dim_sizes[0]; - const long int c = filter->dims.dim_sizes[0]; //number of filters + const long int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const long int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const long int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - 
CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(new_output, DEVICE); - //INFO("batch: %d\n", n); + // INFO("batch: %d\n", n); // INFO("channels: %d\n", input->dims.dim_sizes[1]); // INFO("num_filters: %d\n", c); // INFO("kernel height: %d\n", KH); - // INFO("kernel width: %d\n", KW); + // INFO("kernel width: %d\n", KW); // INFO("num_filter_elem: %d\n", num_filter_elem); - //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); - //INFO("vertical_stride: %d\n", vertical_stride); - //INFO("horizontal_stride: %d\n", horizontal_stride); + // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); + // INFO("vertical_stride: %d\n", vertical_stride); + // INFO("horizontal_stride: %d\n", horizontal_stride); // INFO("output height: %d\n", h); // INFO("output width: %d\n", w); - //INFO("skip_every: %d\n", skip_every); + // INFO("skip_every: %d\n", skip_every); const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(row > 1){ + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, - n, c, h_eff, w); + + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w); changeTensorPlacement(output_half, DEVICE); - __half * convData; + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - + const int patchBlockSize = 256; - const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / patchBlockSize; + const int numPatchBlocks = + (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / + 
patchBlockSize; const int interpolationBlocksize = 256; - const int numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; - if(h * w <= 64) { - //INFO("H *W <= 64\n"); - convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h_eff * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h_eff * w, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - approxInterpolateRowHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); - checkCudaErrors(cudaDeviceSynchronize()); - - } else { - //INFO("H *W > 64\n"); - convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - alpha_half, - convData, h_eff * w, num_filter_elem * h_eff * w, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, - n)); - - approxInterpolateRowHalf<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half 
*)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); - checkCudaErrors(cudaDeviceSynchronize()); + const int numInterpolationBlocks = + (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; + if (h * w <= 64) { + // INFO("H *W <= 64\n"); + convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + approxInterpolateRowHalf2<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); + checkCudaErrors(cudaDeviceSynchronize()); + } else { + // INFO("H *W > 64\n"); + convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n)); + + approxInterpolateRowHalf<<<numInterpolationBlocks, + 
interpolationBlocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); + checkCudaErrors(cudaDeviceSynchronize()); } freeTensor(output_half); cudaFree(convData); -} else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); changeTensorPlacement(output_half, DEVICE); - - __half * convData; + + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - + const int patchBlockSize = 256; - const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / patchBlockSize; + const int numPatchBlocks = + (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / + patchBlockSize; const int interpolationBlocksize = 256; - const int numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; - if(h * w <= 64) { - //INFO("H *W <= 64\n"); - convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w_eff, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w_eff, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - 
approxInterpolateColHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); - checkCudaErrors(cudaDeviceSynchronize()); + const int numInterpolationBlocks = + (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; + if (h * w <= 64) { + // INFO("H *W <= 64\n"); + convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + approxInterpolateColHalf2<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); + checkCudaErrors(cudaDeviceSynchronize()); } else { - //INFO("H *W > 64\n"); - convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - alpha_half, - convData, h * w_eff, num_filter_elem * h * w_eff, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half 
*)output_half->gpu_half_data, h * w_eff, c * h * w_eff, - n)); - - approxInterpolateColHalf<<<numInterpolationBlocks,interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); - checkCudaErrors(cudaDeviceSynchronize()); + // INFO("H *W > 64\n"); + convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n)); + + approxInterpolateColHalf<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); + checkCudaErrors(cudaDeviceSynchronize()); } freeTensor(output_half); cudaFree(convData); - } else if(skip_every > 1) { + } else if (skip_every > 1) { const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - __half* convData; + __half *convData; size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - __half* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); + __half *reducedFilter; + checkCudaErrors( + 
cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); const int filtBlockSize = 256; - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); const int blockSize = 256; - //const int gridSize = (n * h * w + blockSize - 1) / blockSize; - // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); - // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); + // const int gridSize = (n * h * w + blockSize - 1) / blockSize; + // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); + // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { - if(!(KH * KW % skip_every)) { - //INFO("---REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); + if (c * num_filter_elem < + 500000) { // 250) {//c * reduced_filter_elem < 150000) { + if (!(KH * KW % skip_every)) { + // INFO("---REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - 
input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); } else { - //INFO("---IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); + // INFO("---IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - //convToGemmHalfInputIrregular - convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - alpha_half, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - beta_half, - (__half *)new_output->gpu_half_data, h * w, c * h * w, - n)); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / 
blockSize; + // convToGemmHalfInputIrregular + convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + alpha_half, convData, h * w, reduced_filter_elem * h * w, + reducedFilter, reduced_filter_elem, 0, beta_half, + (__half *)new_output->gpu_half_data, h * w, c * h * w, n)); } else { - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - changeTensorPlacement(output_half, DEVICE); - - if(!(KH * KW % skip_every)) { - //INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } else { - // INFO("IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - 
convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, reduced_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - reducedFilter, CUDA_R_16F, reduced_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output_half); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); + changeTensorPlacement(output_half, DEVICE); + + if (!(KH * KW % skip_every)) { + // INFO("REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } else { + // INFO("IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half 
*)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, + reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w, + reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half, + (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output_half); } - + cudaFree(convData); cudaFree(reducedFilter); } else { - //INFO("FP16 BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - changeTensorPlacement(output, DEVICE); - __half * convData; - long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //convToGemmHalf - convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, 
num_filter_elem, - skip_every, offset); - checkCudaErrors(cudaDeviceSynchronize()); - - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha_half = &alf; - const __half *beta_half = &bet; - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output); - cudaFree(convData); + // INFO("FP16 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + changeTensorPlacement(output, DEVICE); + __half *convData; + long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + // convToGemmHalf + convToGemmHalfInputNew<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, skip_every, offset); + checkCudaErrors(cudaDeviceSynchronize()); + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + alpha_half, convData, CUDA_R_16F, n * h 
* w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half, + (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output); + cudaFree(convData); } profileEvent("H2F_start"); diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu index 6e9f88bb54e5655b18d72fc88e5a08a2478ea9fc..bdcfb2c5684d1584e1a520194066fc20e3724632 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu @@ -7,429 +7,489 @@ #include "fp16_conversion.h" #include "profiling.h" -extern "C"{ - -__global__ void convToGemm(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +extern "C" { + +__global__ void convToGemm(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int 
V_stride, + const int H_stride, const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmFullInput(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_ 
- const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen - if(filter_elem_num % skip_every != skip_every-1-skip_offset) { - int output_col = filter_elem_num - - ((filter_elem_num + skip_every)/skip_every); - if(skip_every == 1) - output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void convToGemmFullInput( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number)_ + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter elemen + if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) { + int output_col = + filter_elem_num - ((filter_elem_num + skip_every) / skip_every); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; } + } + } + } } -__global__ void convToGemmHalfInputNew(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_offset) { - int output_col = filter_elem_num - - (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - if(skip_every == 1) - output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void +convToGemmHalfInputNew(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (filter_elem_num % skip_every != skip_offset) { + int output_col = + filter_elem_num - (filter_elem_num / skip_every + + (filter_elem_num % skip_every > skip_offset)); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; + } } + } + } } - -__global__ -void convToGemmHalf(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, - const int V_pad, const int H_pad, - const int H_out, const int W_out, - const int V_stride, const int H_stride){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i - const int n = tx / (C * H_out * W_out); //output image numbe - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe - const int h = tx % (H_out * W_out) / W_out; //output height index (row number - const int w = tx % W_out; //output width index (col number - const int inH = h * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - } else { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; - } - } +__global__ void convToGemmHalf(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread i + const int n = tx / (C * H_out * W_out); // output image numbe + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan numbe + const int h = tx % (H_out * W_out) / W_out; // output height index (row number + const int w = tx % W_out; // output width index (col number + const int inH = h * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + } else { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - //const int ki = c * KH * KW + i; - //const int kj = c * KH * KW + j; - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - //if(filter_elem_num % skip_every != skip_offset) { - // int output_col = filter_elem_num - - // (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - //if(skip_every == 1) - // output_col = filter_elem_num; - const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index 
(col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + // const int ki = c * KH * KW + i; + // const int kj = c * KH * KW + j; + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + // if(filter_elem_num % skip_every != skip_offset) { + // int output_col = filter_elem_num - + // (filter_elem_num/skip_every + (filter_elem_num % skip_every > + // skip_offset)); + // if(skip_every == 1) + // output_col = filter_elem_num; + const int out_index = + ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; + //((output_col*N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height 
index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - //const int ki = c * KH * KW + i; - //const int kj = c * KH * KW + j; - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - //if(filter_elem_num % skip_every != skip_offset) { - // int output_col = filter_elem_num - - // (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - //if(skip_every == 1) - // output_col = filter_elem_num; - const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; - //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w 
= tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + // const int ki = c * KH * KW + i; + // const int kj = c * KH * KW + j; + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + // if(filter_elem_num % skip_every != skip_offset) { + // int output_col = filter_elem_num - + // (filter_elem_num/skip_every + (filter_elem_num % skip_every > + // skip_offset)); + // if(skip_every == 1) + // output_col = filter_elem_num; + const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; + //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; + //((output_col*N + n) * H_out + h) * W_out + w + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } - - -__global__ void convToGemmHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx 
% (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmHalf2(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride, + const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmPerfRow(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int 
W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void +convToGemmPerfRow(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - //#pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 
0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; + const int inW = w * H_stride - H_pad; // input width index (col number) + //#pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } } } } -__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w, - float *old_data, float *new_data, int x, int start){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - w]) / 2; - } else { - int 
row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - w]) / 2; + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfCol(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - 
const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void +convToGemmPerfCol(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) //#pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of 
this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = 0; } } } } -__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w, - float *old_data, float *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + 
ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - 1]) / 2; - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - 1]) / 2; + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, - const 
__half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmPerfRowHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - // #pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * 
KH * KW + i * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inW = w * H_stride - H_pad; // input width index (col number) + // #pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else output[out_index] = 0; } @@ -437,872 +497,941 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, } } -__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image numbe - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - int h_index; - if(h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - // 
#pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w; - //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmPerfRowHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image numbe + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + int h_index; + if (h < start) { + h_index = h; + } else { + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } -} - -__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - //const int n = tx / (c * h * w); //output image number - const int stride = blockDim.x * gridDim.x; - //if(n < N) { - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * 
h * w)) / (h * w); - const int n = i / (c * h * w); - - //const int ch = tx % (c * h * w) / (h * w); //filter number - //const int row = tx % (h * w) / w; //output height index (row number) - //const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } + const int inH = h_index * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + // #pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((filter_elem_num * N + n) * H_eff + h) * W_out + w; + //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + 
output[out_index] = 0; + } } + } } -__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - //const int n = tx / (c * h * w); //output image numbe - const int stride = blockDim.x * gridDim.x; - //if(n < N) { - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - - //const int ch = tx % (c * h * w) / (h * w); //filter number - //const int row = tx % (h * w) / w; //output height index (row number) - //const int col = tx % w; //output width index (col number - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - const int row_index = row - ((row + 1 - start) / x); - const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; - //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; - //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 
old_data[output_index]; - } +__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + // const int n = tx / (c * h * w); //output image number + const int stride = blockDim.x * gridDim.x; + // if(n < N) { + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); + + // const int ch = tx % (c * h * w) / (h * w); //filter number + // const int row = tx % (h * w) / w; //output height index (row number) + // const int col = tx % w; //output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } +__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, + int w, __half *old_data, + __half 
*new_data, int x, int start) { -__global__ void convToGemmPerfColHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + // const int n = tx / (c * h * w); //output image numbe + const int stride = blockDim.x * gridDim.x; + // if(n < N) { + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); + + // const int ch = tx % (c * h * w) / (h * w); //filter number + // const int row = tx % (h * w) / w; //output height index (row number) + // const int col = tx % w; //output width index (col number + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + const int row_index = row - ((row + 1 - start) / x); + const int output_index = + ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; + // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + const int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + const int 
output_index = + ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; + // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; + } + } +} - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void convToGemmPerfColHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this 
filter element - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inH = h * V_stride - V_pad; // input height index (row number) + //#pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; - + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = 0; } } } } -__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) - int w_index; - if(w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // 
int i = ki / KW; - // int j = ki % KW; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen - const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmPerfColHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) + int w_index; + if (w < start) { + w_index = w; + } else { + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) + //#pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter elemen + const int out_index = + ((filter_elem_num * N + n) * H_out + h) * W_eff + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } + } + 
} } +__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { -__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int stride = blockDim.x * gridDim.x; - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); - //const int n = tx / (c * h * w); //output image number - //if(n < N) { - //const int ch = tx % (c * h * w) / (h * w); //output chan number - //const int row = tx % (h * w) / w; //output height index (row number) + // const int n = tx / (c * h * w); //output image number + // if(n < N) { + // const int ch = tx % (c * h * w) / (h * w); //output chan number + // const int row = tx % (h * w) / w; //output height index (row number) // const int col = tx % w; //output width index (col number) - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 
0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } - } + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; + } + } } -__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const 
int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - //const int n = tx / (c * h * w); //output image number - //if(n < N) { - //const int ch = tx % (c * h * w) / (h * w); //output chan number - //const int row = tx % (h * w) / w; //output height index (row number) - // const int col = tx % w; //output width index (col number) - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col]; - //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; - //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)]; - //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - const int col_index = col - ((col + 1 - start) / x); - const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; - //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; - //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { 
+ + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int stride = blockDim.x * gridDim.x; + + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); + // const int n = tx / (c * h * w); //output image number + // if(n < N) { + // const int ch = tx % (c * h * w) / (h * w); //output chan number + // const int row = tx % (h * w) / w; //output height index (row number) + // const int col = tx % w; //output width index (col number) + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col]; + // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + + old_w - 1]; + // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)]; + // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + const int col_index = col - ((col + 1 - start) / x); + const int output_index = + ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; + // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + const int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + const int output_index = + ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; + // const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * + // old_w + 
col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } - -__global__ void convToGemmFullInputRegular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - int in_index; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } +__global__ void +convToGemmFullInputRegular(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int 
reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int ch = (fi * C) / reduced_filter_elem; + const int offset = (skip_offset + ch) % skip_every; + int in_index; + if (fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmFullInputIrregular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height 
index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - int in_index; - if(fi < skip_offset) { - in_index = fi; - } else { - in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmFullInputIrregular( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + int in_index; + if (fi < skip_offset) { + in_index = fi; + } else { + in_index = + ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1; + } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) 
/ KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void createReducedFiltersFullRegular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter +__global__ void createReducedFiltersFullRegular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; - int in_index; - if(offset < channel_offset) { - in_index = offset; - } else { - in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; - } - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; + int in_index; + if (offset < channel_offset) { + in_index = offset; + } else { + in_index = + ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + 
channel_offset - 1; + } + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; } } -__global__ void createReducedFiltersFullIrregular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter - int in_index; - if(offset < skip_offset) { - in_index = offset; - } else { - in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; +__global__ void createReducedFiltersFullIrregular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter + int in_index; + if (offset < skip_offset) { + in_index = offset; + } else { + in_index = + ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1; } + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; + } } -__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, 
const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - //for(int fi = 0; fi < reduced_filter_elem; fi++) { - //const int ch = (fi * C) / reduced_filter_elem; - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - //int in_index; - const bool condition = (fi < offset); - const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - //if(fi < offset) { - // in_index = fi; - //} else { - // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - // } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } +__global__ void +convToGemmHalfInputRegular(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + 
const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + // for(int fi = 0; fi < reduced_filter_elem; fi++) { + // const int ch = (fi * C) / reduced_filter_elem; + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + const int fi = ch * (reduced_filter_elem / C) + ki; + const int offset = (skip_offset + ch) % skip_every; + // int in_index; + const bool condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + // if(fi < offset) { + // in_index = fi; + //} else { + // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + // + offset - 1; + // } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const 
int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - //for(int fi = 0; fi < reduced_filter_elem; fi++) { - // const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - const int condition = (fi < offset); - const int in_index = condition * fi + (! 
condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - // int in_index; - //if(fi < offset) { - // in_index = fi; - //} else { - // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - // } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputRegular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + const int fi = ch * (reduced_filter_elem / C) + ki; + // for(int fi = 0; fi < reduced_filter_elem; fi++) { + // const int ch = (fi * C) / reduced_filter_elem; + const int offset = (skip_offset + ch) % skip_every; + const int condition = (fi < offset); + const int in_index = + condition * fi + 
+ (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + // int in_index; + // if(fi < offset) { + // in_index = fi; + //} else { + // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > + // 0) + offset - 1; + // } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (! 
condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - //int in_index; - //if(fi < skip_offset) { - // in_index = fi; - //} else { - // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - // } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + // int in_index; + // if(fi < skip_offset) 
{ + // in_index = fi; + //} else { + // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - skip_offset + 1) * skip_every) % (skip_every - + // 1) > 0) + skip_offset - 1; + // } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - // int in_index; - // if(fi < skip_offset) { - // in_index = fi; - // } else { - // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - 
// } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + // int in_index; + // if(fi < skip_offset) { + // in_index = fi; + // } else { + // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) + // > 0) + skip_offset - 1; + // } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const 
int out_index = ((fi * N + n) * H_out + h) * W_out + w; + // const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * + // W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } +__global__ void createReducedFiltersHalfRegular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { -__global__ void createReducedFiltersHalfRegular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id const int stride = blockDim.x * gridDim.x; - + //#pragma unroll for (int i = tx; i < NF; i += stride) { - const int fIdx = i / reduced_filter_elem; //filter index - //if(fIdx < NF) { - const int offset = i % reduced_filter_elem; //offset within filter + const int fIdx = i / reduced_filter_elem; // filter index + // if(fIdx < NF) { + const int offset = i % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; const int condition = (offset < channel_offset); - const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1); - - // int in_index; - // if(offset < channel_offset) { - // in_index = offset; - //} else { - // in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - // + (((offset - 
channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; + const int in_index = + condition * offset + + (!condition) * + (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1); + + // int in_index; + // if(offset < channel_offset) { + // in_index = offset; + //} else { + // in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - + // 1) + // + (((offset - channel_offset + 1) * skip_every) % (skip_every - + // 1) > 0) + channel_offset -1; // } - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); } } -__global__ void createReducedFiltersHalfIrregular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int stride = blockDim.x * gridDim.x; - //#pragma unroll - for (int i = tx; i < NF; i += stride) { - - const int fIdx = i / reduced_filter_elem; //filter index +__global__ void createReducedFiltersHalfIrregular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int stride = blockDim.x * gridDim.x; + //#pragma unroll + for (int i = tx; i < NF; i += stride) { + + const int fIdx = i / reduced_filter_elem; // filter index // if(fIdx < NF) { - const int offset = i % reduced_filter_elem; //offset within filter - const int condition = (offset < skip_offset); - int in_index = condition * offset + 
(!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - //} - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + const int offset = i % reduced_filter_elem; // offset within filter + const int condition = (offset < skip_offset); + int in_index = + condition * offset + + (!condition) * + (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + skip_offset - 1); + //} + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); //} } } -void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int start){ +void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int start) { //////INFO("*** TensorConvolution (output perforation) \n"); - //Event("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + // Event("Conv"); + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } - - Tensor* output; + + Tensor *output; // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); - //Event("H2F_start"); + // Event("H2F_start"); convertToFP32(input); convertToFP32(filter); - //Event("H2F_end"); - + // Event("H2F_end"); + long int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; - c = filter->dims.dim_sizes[0]; //number of filters + c = 
filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; int rem_row = (h - start) % row > 0; int h_eff = h - ((h - start) / row) - rem_row; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; int rem_col = (w - start) % col > 0; int w_eff = w - ((w - start) / col) - rem_col; - Tensor* new_output; - if(row > 1){ - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *new_output; + if (row > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float* convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, start, h_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], 
input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + row, start, h_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, - num_filter_elem * h_eff * w, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h_eff * w, c * h_eff * w, - n)); - - new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateRow<<<numBlocks, 128>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(col > 1){ - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + } else if (col > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor 
placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, start, w_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, start, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, - h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h * w_eff, c * h * w_eff, - n)); - - new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * 
w_eff, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateCol<<<numBlocks, 128>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else { - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + } else { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - convToGemmApprox<<<gridSize, blockSize>>>(convData, - (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - num_filter_elem, c * h * w); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + 
convToGemmApprox<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, c * h * w); checkCudaErrors(cudaDeviceSynchronize()); - //Do the matrix multiplication - //Want to multiply convData by filter->gpu_data[f * chan * KH * KW] - + // Do the matrix multiplication + // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem, + &alpha, convData, h * w, num_filter_elem * h * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w, c * h * w, n)); new_output = output; cudaFree(convData); } - //Event("Conv_end"); //, true); + // Event("Conv_end"); //, true); return new_output; } -__global__ -void switchMatrixFull(int N, int n, int c, int h, int w, - float *old_data, float *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixFull(int N, int n, int c, int h, int w, + float *old_data, float *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / 
(h * w); + int n_new = i / (c * h * w); -void* tensorConvApprox(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset){ + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} + +void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int skip_every, int offset) { //////INFO("*** TensorConvolution approximation \n"); - //Event("Conv"); + // Event("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } @@ -1316,15 +1445,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////Event("H2F_end"); const int n = input->dims.dim_sizes[0]; - const int c = filter->dims.dim_sizes[0]; //number of filters + const int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, 
+ CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); ////INFO("batch: %d\n", n); @@ -1337,327 +1469,299 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////INFO("horizontal_stride: %d\n", horizontal_stride); ////INFO("output height: %d\n", h); ////INFO("output width: %d\n", w); - if(row > 1) { + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w)); - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, offset, h_eff); + ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h_eff * w)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, 
horizontal_stride, + row, offset, h_eff); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, - n)); - //interpolate + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateRow<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); 
const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff)); - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, offset, w_eff); + ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w_eff)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, offset, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w_eff, c * h * w_eff, - n)); - - //interpolate + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / 
blocksize; + approxInterpolateCol<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(skip_every > 1) { - //reduced number after skipping + } else if (skip_every > 1) { + // reduced number after skipping const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - float* convData; + float *convData; size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - float* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); - + float *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); + const int filtBlockSize = 128; ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); //////INFO("fac: %f\n", fac); const int blockSize = 128; - //////INFO("n * h * w : %d\n", (n * h * w )); - const int gridSize = (n * h * w + blockSize - 1) / blockSize; - if(!(KH * KW % skip_every)) { - // ////INFO("REGULAR FILTERING\n"); - createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputRegular<<<gridSize, 
blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + //////INFO("n * h * w : %d\n", (n * h * w )); + const int gridSize = (n * h * w + blockSize - 1) / blockSize; + if (!(KH * KW % skip_every)) { + // ////INFO("REGULAR FILTERING\n"); + createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputRegular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // ////INFO("IRREGULAR FILTERING\n"); - createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + // ////INFO("IRREGULAR FILTERING\n"); + createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputIrregular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + 
input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } checkCudaErrors(cudaDeviceSynchronize()); - + const float alpha = 1.0; const float beta = 0.0; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - &alpha, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter, + reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w, + c * h * w, n)); cudaFree(convData); cudaFree(reducedFilter); } else { - //INFO("FP32 BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + // INFO("FP32 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(new_output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w)); - convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - skip_every, offset);//num_filter_elem); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + //////INFO("n * 
input->dims.dim_sizes[1] * h * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w)); + convToGemmFullInput<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + skip_every, offset); // num_filter_elem); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - /* - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); - */ - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - &alpha, - convData, - CUDA_R_32F, n * h * w, - (float *) filter->gpu_data, CUDA_R_32F, - num_filter_elem, - &beta, - (float *) output->gpu_data, - CUDA_R_32F, n * h * w, - CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data); - + + float alpha = 1.0f, beta = 0.0f; + /* + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h * w, c, num_filter_elem, + &alpha, + convData, h * w, num_filter_elem * h + * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float + *)new_output->gpu_data, h * w, c * h * w, n)); + */ + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data, + CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data, + CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixFull<<<numBlocks, 256>>>(n 
* c * h * w, n, c, h, w, + (float *)output->gpu_data, + (float *)new_output->gpu_data); + checkCudaErrors(cudaDeviceSynchronize()); cudaFree(convData); } - //Event("Conv_end"); + // Event("Conv_end"); return new_output; } -__global__ -void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixHalf(int N, int n, int c, int h, int w, + __half *old_data, __half *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); -void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset) { + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} + +void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups, int row, int col, int skip_every, + int offset) { - //INFO("*** TensorConvolution half approximation \n"); - // profileEvent("#Conv"); + // INFO("*** TensorConvolution half approximation \n"); + // profileEvent("#Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: 
Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } hostToDeviceCopy(input); hostToDeviceCopy(filter); - // INFO("CONVERT\n"); + // INFO("CONVERT\n"); profileEvent("F2H_start"); - convertToFP16(input); - convertToFP16(filter); + convertToFP16(input); + convertToFP16(filter); profileEvent("F2H_end"); -//INFO("CONVERTED\n"); + // INFO("CONVERTED\n"); const long int n = input->dims.dim_sizes[0]; - const long int c = filter->dims.dim_sizes[0]; //number of filters + const long int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const long int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const long int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //INFO("batch: %d\n", n); + // INFO("batch: %d\n", n); // INFO("channels: %d\n", input->dims.dim_sizes[1]); // INFO("num_filters: %d\n", c); // INFO("kernel height: %d\n", KH); - // INFO("kernel width: %d\n", KW); + // INFO("kernel width: %d\n", KW); // INFO("num_filter_elem: %d\n", num_filter_elem); - //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); - //INFO("vertical_stride: %d\n", vertical_stride); - //INFO("horizontal_stride: %d\n", horizontal_stride); + // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); + 
// INFO("vertical_stride: %d\n", vertical_stride); + // INFO("horizontal_stride: %d\n", horizontal_stride); // INFO("output height: %d\n", h); // INFO("output width: %d\n", w); - //INFO("skip_every: %d\n", skip_every); - if(row > 1){ + // INFO("skip_every: %d\n", skip_every); + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, - n, c, h_eff, w); + + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output_half, DEVICE); - __half * convData; + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w)); + ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h_eff * w)); const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - if(h * w <= 64) { - convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + if (h * w <= 64) { + convToGemmPerfRowHalf2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); } else { - 
convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); + convToGemmPerfRowHalf<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); } checkCudaErrors(cudaDeviceSynchronize()); @@ -1665,74 +1769,68 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(h * w <= 64) { - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h_eff * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h_eff * w, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + if (h * w <= 64) { + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - alpha_half, - convData, h_eff * w, num_filter_elem * h_eff * w, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, - n)); + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, 
num_filter_elem, + alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n)); } - //interpolate + // interpolate int blocksize = 256; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - if(h * w <= 64) { - approxInterpolateRowHalf2<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + if (h * w <= 64) { + approxInterpolateRowHalf2<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); } else { - approxInterpolateRowHalf<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); + approxInterpolateRowHalf<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); } checkCudaErrors(cudaDeviceSynchronize()); freeTensor(output_half); cudaFree(convData); -} else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output_half, DEVICE); - - __half * convData; + + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * 
input->dims.dim_sizes[1] * h * w_eff)); + ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w_eff)); const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - if(h * w <= 64) { - convToGemmPerfColHalf2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + if (h * w <= 64) { + convToGemmPerfColHalf2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); } else { - convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); + convToGemmPerfColHalf<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); } checkCudaErrors(cudaDeviceSynchronize()); @@ -1740,229 +1838,211 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(h * w <= 64) { - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w_eff, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w_eff, - (__half*) 
filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + if (h * w <= 64) { + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - alpha_half, - convData, h * w_eff, num_filter_elem * h * w_eff, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, - n)); + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n)); } - //interpolate + // interpolate int blocksize = 256; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - if(h * w <= 64) { - approxInterpolateColHalf2<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + if (h * w <= 64) { + approxInterpolateColHalf2<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); } else { - approxInterpolateColHalf<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); - } - 
checkCudaErrors(cudaDeviceSynchronize()); + approxInterpolateColHalf<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); freeTensor(output_half); cudaFree(convData); - } else if(skip_every > 1) { + } else if (skip_every > 1) { const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - __half* convData; + __half *convData; size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - __half* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); + __half *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); const int filtBlockSize = 256; - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); const int blockSize = 256; - //const int gridSize = (n * h * w + blockSize - 1) / blockSize; - // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); - // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); + // const int gridSize = (n * h * w + blockSize - 1) / blockSize; + // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); + // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - 
if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { - if(!(KH * KW % skip_every)) { - //INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); + if (c * num_filter_elem < + 500000) { // 250) {//c * reduced_filter_elem < 150000) { + if (!(KH * KW % skip_every)) { + // INFO("REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); } else { - //INFO("IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); + // INFO("IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + 
reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - //convToGemmHalfInputIrregular - convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - alpha_half, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - beta_half, - (__half *)new_output->gpu_half_data, h * w, c * h * w, - n)); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + // convToGemmHalfInputIrregular + convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + alpha_half, convData, h * w, reduced_filter_elem * h * w, + reducedFilter, reduced_filter_elem, 0, beta_half, + (__half *)new_output->gpu_half_data, h * w, c * h * w, n)); } else { - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - changeTensorPlacement(output_half, DEVICE); - - if(!(KH * KW % skip_every)) { - // INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half 
*)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } else { - //INFO("IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, reduced_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - reducedFilter, CUDA_R_16F, reduced_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output_half); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, 
CUDNN_TENSOR_NCHW, n, c, h, w); + changeTensorPlacement(output_half, DEVICE); + + if (!(KH * KW % skip_every)) { + // INFO("REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } else { + // INFO("IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, + reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w, + reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half, + (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + 
(__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output_half); } - + cudaFree(convData); cudaFree(reducedFilter); } else { // INFO("BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - changeTensorPlacement(output, DEVICE); - __half * convData; - long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //convToGemmHalf - convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, num_filter_elem, - skip_every, offset); - checkCudaErrors(cudaDeviceSynchronize()); - - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha_half = &alf; - const __half *beta_half = &bet; - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output); - cudaFree(convData); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + changeTensorPlacement(output, DEVICE); + __half *convData; + long int 
convDataSize = sizeof(__half) * n * num_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + // convToGemmHalf + convToGemmHalfInputNew<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, skip_every, offset); + checkCudaErrors(cudaDeviceSynchronize()); + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + alpha_half, convData, CUDA_R_16F, n * h * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half, + (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output); + cudaFree(convData); } -// INFO("CONV DONE\n"); + // INFO("CONV DONE\n"); profileEvent("H2F_start"); convertToFP32_offline(new_output); - //convertToFP32(input); - //convertToFP32(filter); + // convertToFP32(input); + // convertToFP32(filter); profileEvent("H2F_end"); - //profileEvent("#Conv_end"); - //INFO("CONVOLUTION END\n"); + // profileEvent("#Conv_end"); + // INFO("CONVOLUTION END\n"); return new_output; } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp deleted file mode 100644 index 
0fe6c20ca848c1caf8180735db9d5cce2f3b2f82..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp +++ /dev/null @@ -1,136 +0,0 @@ -#include "functional/common.h" -#include "tensor_utils.h" - -#include <algorithm> -#include <functional> -#include <numeric> -#include <stdexcept> - -// TODO: this approach does not scale well. -// The right way is probably implementing some type_traits for the Tensor_type_t -// enum -template <> float *convertAndGetGPUData<float>(Tensor *t) { - if (t->cur_type == float_type) - return static_cast<float *>(t->gpu_data); - if (t->cur_type == half_type) { - convertToFP32(t); - t->data_type = float_type; - return static_cast<float *>(t->gpu_data); - } - ERROR("Type %s is incompatible with target type float\n", - std::to_string(t->cur_type)); -} - -template <> half *convertAndGetGPUData<half>(Tensor *t) { - if (t->cur_type == half_type) - return static_cast<half *>(t->gpu_half_data); - if (t->cur_type == float_type) { - convertToFP16(t); - t->data_type = half_type; - return static_cast<half *>(t->gpu_half_data); - } - ERROR("Type %s is incompatible with target type half\n", - std::to_string(t->cur_type)); -} - -template <> float2 *convertAndGetGPUData<float2>(Tensor *t) { - if (t->cur_type == float2_type) - return static_cast<float2 *>(t->gpu_data); - if (t->cur_type == half2_type) { - // FIXME: hacking to make convertToFP16 realize these are "2 floats" - t->num_elems *= 2; - convertToFP32(t); - t->num_elems /= 2; - t->cur_type = t->data_type = float2_type; - return static_cast<float2 *>(t->gpu_data); - } - ERROR("Type %s is incompatible with target type float2\n", - std::to_string(t->cur_type)); -} - -template <> half2 *convertAndGetGPUData<half2>(Tensor *t) { - if (t->cur_type == half2_type) - return static_cast<half2 *>(t->gpu_half_data); - if (t->cur_type == float2_type) { - // FIXME: hacking to make convertToFP16 realize these are "2 floats" - t->num_elems *= 2; - convertToFP16(t); - 
t->num_elems /= 2; - t->cur_type = t->data_type = half2_type; - return static_cast<half2 *>(t->gpu_half_data); - } - ERROR("Type %s is incompatible with target type half2\n", - std::to_string(t->cur_type)); -} - -void convertToFloat2Offline(Tensor *t) { - if (t->cur_type == float2_type) - return; - else if (t->cur_type == half2_type) { - t->cur_type = t->data_type = half_type; - t->num_elems *= 2; - convertToFP32_offline(t); - t->num_elems /= 2; - t->cur_type = t->data_type = float2_type; - } else { - ERROR("Type %s is incompatible with target type half2\n", - std::to_string(t->cur_type)); - } -} - -std::vector<size_t> sizes(const Dimension &dim) { - return std::vector<size_t>(dim.dim_sizes, dim.dim_sizes + dim.num_dims); -} - -std::vector<size_t> sizes(Tensor *t) { return sizes(t->dims); } - -size_t num_elems(const std::vector<size_t> &dim_sizes) { - return std::accumulate(dim_sizes.begin(), dim_sizes.end(), 1, - std::multiplies<>()); -} - -size_t num_elems(const Dimension &dim) { return num_elems(sizes(dim)); } - -size_t num_elems(Tensor *t) { return num_elems(sizes(t)); } - -static Tensor_type_t toHalfType(Tensor_type_t float_ty) { - switch (float_ty) { - case float_type: - return half_type; - case float2_type: - return half2_type; - case half_type: - case half2_type: - return float_ty; - default: - ERROR("Types not acceptable\n"); - } -} - -static Tensor_type_t toFloatType(Tensor_type_t half_ty) { - switch (half_ty) { - case half_type: - return float_type; - case half2_type: - return float2_type; - case float_type: - case float2_type: - return half_ty; - default: - ERROR("Types not acceptable\n"); - } -} - -Tensor_type_t getCompatibleType(int t1, int t2, bool get_half) { - auto type1 = (Tensor_type_t)t1, type2 = (Tensor_type_t)t2; - if (getTypeSize(type1) > getTypeSize(type2)) - std::swap(type1, type2); - if (type1 == type2) - return get_half ? 
toHalfType(type1) - : toFloatType(type1); // Or type2, whatever - if (type1 == half_type && type2 == float_type) - return get_half ? half_type : float_type; - if (type1 == half2_type && type2 == float2_type) - return get_half ? half2_type : float2_type; - ERROR("Types not acceptable\n"); -} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp index fd1492fe68e8833ea4cdca4d5df6518b6ec3b37c..c18ffcea26f93fe752500983f4d4a3fcfe59ded2 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp @@ -1,13 +1,13 @@ -//===--------------------------- configuration.cpp -------------------------===// +//===--------------------------- configuration.cpp +//-------------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the definitions of API to get information about +// +// This file consists of the definitions of API to get information about // configurations for rest of the tensor runtime to use. 
// //===----------------------------------------------------------------------===// - #include "configuration.h" using G_APPROX = GPUNodeConfiguration::APPROX; @@ -31,9 +31,8 @@ void GPUNodeConfiguration::pushNewTensorOperation(G_TENSOR_OP top) { void GPUNodeConfiguration::pushNewApproximationChoiceForOperation( G_APPROX approx, int u) { unsigned size = ApproxChoices.size(); - CUSTOM_ASSERT( - size >= 1 && - "Cannot apply approximation choice to non existent operation."); + CUSTOM_ASSERT(size >= 1 && + "Cannot apply approximation choice to non existent operation."); ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); } @@ -55,9 +54,8 @@ void CPUNodeConfiguration::pushNewTensorOperation(C_TENSOR_OP top) { void CPUNodeConfiguration::pushNewApproximationChoiceForOperation( C_APPROX approx, int u) { unsigned size = ApproxChoices.size(); - CUSTOM_ASSERT( - size >= 1 && - "Cannot apply approximation choice to non existent operation."); + CUSTOM_ASSERT(size >= 1 && + "Cannot apply approximation choice to non existent operation."); ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); } @@ -71,8 +69,8 @@ CPUNodeConfiguration::CPUNodeConfiguration() { } CPUNodeConfiguration::~CPUNodeConfiguration() {} -Configuration::Configuration( - std::string &n, float f, float e, float a, float al) +Configuration::Configuration(std::string &n, float f, float e, float a, + float al) : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {} float Configuration::getSpeedup() { return speedup; } @@ -82,20 +80,20 @@ float Configuration::getEnergy() { return energy; } float Configuration::getAccuracy() { return accuracy; } float Configuration::getAccuracyLoss() { return accuracyLoss; } -bool ConfigurationLessThan:: -operator()(const struct Configuration &a, const struct Configuration &b) const { +bool ConfigurationLessThan::operator()(const struct Configuration &a, + const struct Configuration &b) const { return (a.accuracyLoss < b.accuracyLoss); } 
-bool ConfigurationLessThan_AL:: -operator()(const struct Configuration *a, const float &b) const { +bool ConfigurationLessThan_AL::operator()(const struct Configuration *a, + const float &b) const { return (a->accuracyLoss < b); } -bool ConfigurationLessThan_SP:: -operator()(const struct Configuration *a, const float &b) const { +bool ConfigurationLessThan_SP::operator()(const struct Configuration *a, + const float &b) const { return (a->speedup < b); } -bool ConfigurationLessThan_E:: -operator()(const struct Configuration *a, const float &b) const { +bool ConfigurationLessThan_E::operator()(const struct Configuration *a, + const float &b) const { return (a->energy < b); } @@ -286,9 +284,8 @@ void CPUNodeConfiguration::print() { void Configuration::print() { printf("+++++\n"); - printf( - "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, - accuracyLoss); + printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, + accuracyLoss); for (std::map<std::string, NodeConfiguration *>::const_iterator it = setup.begin(); it != setup.end(); ++it) { diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc deleted file mode 100644 index 3e4aecb824a93b932ef2146380b86496f71b0f28..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc +++ /dev/null @@ -1,76 +0,0 @@ - - -#ifndef RUNTIME_DEBUG -#define RUNTIME_DEBUG - -#define LOG_DEBUG 0 // Sets the debug logging to true -#define LOG_INFO 1 // Sets the info logging to true -#define LOG_ERROR 1 // Print Errors -#define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro) - -#include "debug.h" -#include "tensor.h" -#include <sstream> -#include <stdarg.h> -#include <stdio.h> -#include <stdlib.h> - -void INFO(const char *format, ...) 
{ - if (!LOG_INFO) // Don't print if logging info is disabled - return; - va_list args; - va_start(args, format); - printf("INFO: "); - vprintf(format, args); - va_end(args); -} - -void DEBUG(const char *format, ...) { - if (!LOG_DEBUG) // Don't print if logging info is disabled - return; - va_list args; - va_start(args, format); - printf("DEBUG: "); - vprintf(format, args); - va_end(args); -} - -void ERROR(const char *format, ...) { - if (!LOG_ERROR) // Don't print if logging info is disabled - return; - va_list args; - va_start(args, format); - printf("ERROR!: "); - vprintf(format, args); - va_end(args); - - abort(); -} - -void fillOnes(struct Tensor *tensor) { - // initialization is specific to the floating point type - if (tensor->data_type == CUDNN_DATA_FLOAT) { - float *data_arr = (float *)tensor->host_data; - for (unsigned int i = 0; i < tensor->num_elems; i++) { - data_arr[i] = 1.0; - } - } -} - -void printTensorDescInfo(struct Tensor *tensor) { - - cudnnDataType_t dType; - int nStride, cStride, hStride, wStride; - int size1, size2, size3, size4; - cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2, - &size3, &size4, &nStride, &cStride, &hStride, - &wStride); - - DEBUG("dType = %d, size1 = %d, size2 = %d, size3 = %d, size4 = %d \n", dType, - size1, size2, size3, size4); - - DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride, - cStride, hStride, wStride); -} - -#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp index 9bec84de77fc279547eaaba8410c0e25ba3f3cd0..8e5e1fe9689853ee3ff547b62c5d44660db27b04 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cpp @@ -1,8 +1,69 @@ #include "debug.h" +#include "tensor.h" #include <cstdarg> #include <cstdio> #include <cuda_runtime_api.h> #include <stdexcept> +#include <sstream> +#include <cstdlib> + +void 
INFO(const char *format, ...) { + if (!LOG_INFO) // Don't print if logging info is disabled + return; + va_list args; + va_start(args, format); + printf("INFO: "); + vprintf(format, args); + va_end(args); +} + +void DEBUG(const char *format, ...) { + if (!LOG_DEBUG) // Don't print if logging info is disabled + return; + va_list args; + va_start(args, format); + printf("DEBUG: "); + vprintf(format, args); + va_end(args); +} + +void ERROR(const char *format, ...) { + if (!LOG_ERROR) // Don't print if logging info is disabled + return; + va_list args; + va_start(args, format); + printf("ERROR!: "); + vprintf(format, args); + va_end(args); + + abort(); +} + +void fillOnes(struct Tensor *tensor) { + // initialization is specific to the floating point type + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = 1.0; + } + } +} + +void printTensorDescInfo(struct Tensor *tensor) { + + cudnnDataType_t dType; + int nStride, cStride, hStride, wStride; + int size1, size2, size3, size4; + cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2, + &size3, &size4, &nStride, &cStride, &hStride, + &wStride); + + DEBUG("dType = %d, size1 = %d, size2 = %d, size3 = %d, size4 = %d \n", dType, + size1, size2, size3, size4); + + DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride, + cStride, hStride, wStride); +} void throwError(const char *file, int line, const char *fmt, ...) 
{ char msg[2048]; diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu index 0e05813bb6eb5de86057bf3b2066c8fd98642e8d..032443bd7a63a1640e463c0457dd362e09733be3 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu @@ -12,8 +12,8 @@ #define CASE_FUNC(ename, fname) \ case MathOp::ename: { \ void *v_func_ptr = nullptr; \ - checkCudaErrors(cudaMemcpyFromSymbol( \ - &v_func_ptr, _internal::fname##_ptr, sizeof(void *))); \ + checkCudaErrors(cudaMemcpyFromSymbol(&v_func_ptr, _internal::fname##_ptr, \ + sizeof(void *))); \ return v_func_ptr; \ } @@ -120,7 +120,7 @@ template <> void *mathOpToFunc<float2>(MathOp op) { CASE_FUNC(Mul, f2mul) default: ERROR("Float2 function not found\n"); - return nullptr; // For some compilers + return nullptr; // For some compilers } } @@ -129,7 +129,7 @@ template <> void *mathOpToFunc<half2>(MathOp op) { CASE_FUNC(Mul, h2mul) default: ERROR("Half2 function not found\n"); - return nullptr; // For some compilers + return nullptr; // For some compilers } } @@ -151,7 +151,7 @@ template <> void *mathOpToFunc<float>(MathOp op) { default: ERROR("Float function not found\n"); } - return nullptr; // For some compilers + return nullptr; // For some compilers } template <> void *mathOpToFunc<half>(MathOp op) { @@ -169,7 +169,7 @@ template <> void *mathOpToFunc<half>(MathOp op) { default: ERROR("Half function not found\n"); } - return nullptr; // For some compilers + return nullptr; // For some compilers } template <> half reduceOpToIdentity<half>(MathOp op) { @@ -185,5 +185,5 @@ template <> half reduceOpToIdentity<half>(MathOp op) { default: ERROR("Operator does not have id value\n"); } - return 0.0f; // For some compilers + return 0.0f; // For some compilers } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu 
b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu index 7a700b435efe464153fbba7997662c7dfa970385..638e06e786a8d8e4c587d4bda5d0223fa386f39a 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu @@ -2,7 +2,6 @@ #ifndef ERROR_HEADER #define ERROR_HEADER - #include <stdio.h> #include <stdarg.h> #include <cstdio> @@ -23,7 +22,6 @@ #include <math.h> #include <assert.h> - #include "debug.h" #include "tensor.h" #include "profiling.h" @@ -31,39 +29,33 @@ #include "global_data.h" #include "error.h" +extern "C" { +void readSkipTensors(int *skip_tensor_ids, int op_count) { -extern "C"{ - - -void readSkipTensors(int* skip_tensor_ids, int op_count){ - - for(int i = 0; i < op_count; i++){ + for (int i = 0; i < op_count; i++) { int tensor_id = skip_tensor_ids[i]; skip_tensors[tensor_id] = 1; } - } - - -void readOpenTunerFlags(const char* file_name){ +void readOpenTunerFlags(const char *file_name) { total_ops = 0; op_counter = 0; op_accuracies.clear(); - - FILE* fp = fopen(file_name, "r"); - if(fp == NULL){ + + FILE *fp = fopen(file_name, "r"); + if (fp == NULL) { DEBUG("\n WARNING: File 'opentuner_flags' not found \n\n\n"); return; } - + int retVal = 200; - while(retVal != EOF){ + while (retVal != EOF) { int op_acc; - if(fp != NULL) + if (fp != NULL) retVal = fscanf(fp, "%d", &op_acc); else op_acc = 0; @@ -75,24 +67,23 @@ void readOpenTunerFlags(const char* file_name){ fclose(fp); } - -void readQuantRanges(char* file_name){ +void readQuantRanges(char *file_name) { total_ops = 0; op_counter = 0; quant_ranges.clear(); - - FILE* fp = fopen(file_name, "r"); - if(fp == NULL){ + + FILE *fp = fopen(file_name, "r"); + if (fp == NULL) { ERROR("File %s not found \n", file_name); } - + int retVal = 200; - while(retVal != EOF && retVal != -1){ + while (retVal != EOF && retVal != -1) { int min; int max; - if(fp != NULL){ + if (fp != NULL) { retVal = fscanf(fp, "%d", &min); printf("min =% d \n", min); @@ 
-100,22 +91,18 @@ void readQuantRanges(char* file_name){ printf("max =% d \n", max); } - if(retVal != -1){ - struct Range* range = (struct Range*) malloc(sizeof(struct Range)); + if (retVal != -1) { + struct Range *range = (struct Range *)malloc(sizeof(struct Range)); range->min = min; range->max = max; quant_ranges.push_back(range); total_ops++; } } - + fclose(fp); } - - - - /*__device__ inline void atomicAdd(float* address, float value) { @@ -133,11 +120,7 @@ void readQuantRanges(char* file_name){ }; */ - - - - -Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNorms(Tensor *x, Tensor *x_orig) { deviceToHostCopy(x); deviceToHostCopy(x_orig); @@ -148,18 +131,18 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ float inf_norm = -1.0; double total = 0.0; - float* arr1 = (float*) x->host_data; - float* arr2 = (float*) x_orig->host_data; - - for(unsigned int i = 0; i < x->num_elems; i++){ + float *arr1 = (float *)x->host_data; + float *arr2 = (float *)x_orig->host_data; + + for (unsigned int i = 0; i < x->num_elems; i++) { total = total + arr2[i]; - + float diff = abs(arr1[i] - arr2[i]); l1_norm += diff; - l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]); + l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]); - if(inf_norm < diff) + if (inf_norm < diff) inf_norm = diff; } @@ -170,12 +153,11 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ l1_norm = l1_norm / distribution_mean; l2_norm = l2_norm / distribution_mean; - - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); norms->l1_norm = l1_norm; norms->l2_norm = l2_norm; - norms->inf_norm = inf_norm; - + norms->inf_norm = inf_norm; + INFO("l1_norm = %f \n", l1_norm); INFO("l2_norm = %f \n", l2_norm); INFO("inf_norm = %f \n", inf_norm); @@ -183,9 +165,7 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ return norms; } - - -Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNorms2(Tensor *x, Tensor *x_orig) { 
deviceToHostCopy(x); deviceToHostCopy(x_orig); @@ -196,50 +176,49 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ double l1_norm_A = 0.0; double l1_norm_B = 0.0; - + double l2_norm_A = 0.0; double l2_norm_B = 0.0; float inf_norm = -1.0; float orig_inf_norm = -1.0; double total_diff = 0.0; double total_diff_squared = 0.0; - - float* arr1 = (float*) x->host_data; - float* arr2 = (float*) x_orig->host_data; - - for(unsigned int i = 0; i < x->num_elems; i++){ - if(arr2[i] != 0.0) + float *arr1 = (float *)x->host_data; + float *arr2 = (float *)x_orig->host_data; + + for (unsigned int i = 0; i < x->num_elems; i++) { + + if (arr2[i] != 0.0) l0_norm_A = l0_norm_A + 1.0; - if(arr1[i] != 0.0) + if (arr1[i] != 0.0) l0_norm_B = l0_norm_B + 1.0; - + l1_norm_A = l1_norm_A + abs(arr2[i]); l1_norm_B = l1_norm_B + abs(arr1[i]); l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]); l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]); - + float diff = abs(arr1[i] - arr2[i]); total_diff = total_diff + diff; float diff_squared = diff * diff; - total_diff_squared = total_diff_squared + diff_squared; - + total_diff_squared = total_diff_squared + diff_squared; - if(orig_inf_norm < diff){ + if (orig_inf_norm < diff) { orig_inf_norm = diff; } - + // Relative difference value - float normalized_diff = diff / arr2[i]; - if(inf_norm < normalized_diff){ + float normalized_diff = diff / arr2[i]; + if (inf_norm < normalized_diff) { inf_norm = normalized_diff; - } + } } // Relative L1 and Mean L1 norms of the difference Matrix - float mean_l1 = ( total_diff ) / x->num_elems; - float relative_l1 = ( total_diff ) / l1_norm_A; + float mean_l1 = (total_diff) / x->num_elems; + float relative_l1 = (total_diff) / l1_norm_A; // Computing Relative L2 norm - i.e., Euclidean distance double norm_root_A = sqrt(l2_norm_A); @@ -248,8 +227,9 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ float relative_l2 = diff_root / norm_root_A; // Packing computed norms in Norm_t struct - Norm_t* norms = (Norm_t*) 
malloc(sizeof(Norm_t)); - // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); + // Mean metrics - not normalized for the distribution - suitable for precision + // tuning hardware norms->mean_l1 = mean_l1; norms->mean_l2 = mean_l2; norms->orig_inf_norm = orig_inf_norm; @@ -257,8 +237,8 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ // Relative metrics (relative to distribution) - suitable for PROMISE norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; - norms->inf_norm = inf_norm; - + norms->inf_norm = inf_norm; + INFO("l1_norm = %f \n", relative_l1); INFO("l2_norm = %f \n", relative_l2); INFO("inf_norm = %f \n", inf_norm); @@ -266,33 +246,28 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ return norms; } - - - - -__global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_A, - double* l1_diff, double* l2_diff, unsigned int n){ +__global__ void normComputeKernel(float *A, float *B, double *l1_A, + double *l2_A, double *l1_diff, + double *l2_diff, unsigned int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < n){ - + if (i < n) { + double diff = fabsf(A[i] - B[i]); - double diff_squared = diff * diff; + double diff_squared = diff * diff; - atomicAdd( l1_A, fabsf(A[i]) ); - atomicAdd( l2_A, (A[i] * A[i]) ); + atomicAdd(l1_A, fabsf(A[i])); + atomicAdd(l2_A, (A[i] * A[i])); - atomicAdd( l1_diff, diff); - atomicAdd( l2_diff, diff_squared); + atomicAdd(l1_diff, diff); + atomicAdd(l2_diff, diff_squared); } } - - __inline__ __device__ double warpReduceSum(double val) { - for (int offset = warpSize/2; offset > 0; offset /= 2) + for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down_sync(0xFFFFFFFF, val, offset); return val; @@ -304,36 +279,34 @@ __inline__ __device__ double blockReduceSum(double val) { int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; - val = warpReduceSum(val); // Each 
warp performs partial reduction + val = warpReduceSum(val); // Each warp performs partial reduction if (lane == 0) - shared[wid]=val; // Write reduced value to shared memory + shared[wid] = val; // Write reduced value to shared memory - - __syncthreads(); // Wait for all partial reductions + __syncthreads(); // Wait for all partial reductions - - //read from shared memory only if that warp existed + // read from shared memory only if that warp existed val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; - if (wid == 0) val = warpReduceSum(val); //Final reduce within first warp + if (wid == 0) + val = warpReduceSum(val); // Final reduce within first warp return val; - } - - -__global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N, - double* A_l1, double* A_l2, - double* diff_l1, double* diff_l2) { +__global__ void deviceReduceBlockAtomicKernel(float *A, float *B, int N, + double *A_l1, double *A_l2, + double *diff_l1, + double *diff_l2) { double sum_A_l1 = double(0); double sum_A_l2 = double(0); double sum_diff_l1 = double(0); double sum_diff_l2 = double(0); - for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { sum_A_l1 += fabsf(A[i]); sum_A_l2 += (A[i] * A[i]); @@ -347,31 +320,28 @@ __global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N, sum_A_l2 = blockReduceSum(sum_A_l2); sum_diff_l1 = blockReduceSum(sum_diff_l1); sum_diff_l2 = blockReduceSum(sum_diff_l2); - - if (threadIdx.x == 0){ + + if (threadIdx.x == 0) { atomicAdd(A_l1, sum_A_l1); atomicAdd(A_l2, sum_A_l2); atomicAdd(diff_l1, sum_diff_l1); atomicAdd(diff_l2, sum_diff_l2); - } + } } - -void deviceReduce(float* A, float* B, int N, - double* A_l1, double* A_l2, - double* diff_l1, double* diff_l2) { +void deviceReduce(float *A, float *B, int N, double *A_l1, double *A_l2, + double *diff_l1, double *diff_l2) { int threads = 512; int blocks 
= min((N + threads - 1) / threads, 1024); - deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, diff_l1, diff_l2); + deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, + diff_l1, diff_l2); //-- deviceReduceKernel<<<1, 1024>>>(out, out, blocks); } - - // Compute Norms on the GPU -Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNormsTreeReduction(Tensor *x, Tensor *x_orig) { hostToDeviceCopy(x); hostToDeviceCopy(x_orig); @@ -388,26 +358,27 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ double *l2_norm_A_d; double *l1_diff_d; double *l2_diff_d; - - cudaMalloc( (void**) &l1_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l2_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l1_diff_d, sizeof(double)); - cudaMalloc( (void**) &l2_diff_d, sizeof(double)); - - - float* arr1 = (float*) x->gpu_data; - float* arr2 = (float*) x_orig->gpu_data; - - //normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); - deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d); - + + cudaMalloc((void **)&l1_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l2_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l1_diff_d, sizeof(double)); + cudaMalloc((void **)&l2_diff_d, sizeof(double)); + + float *arr1 = (float *)x->gpu_data; + float *arr2 = (float *)x_orig->gpu_data; + + // normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, + // l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); + deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, + l2_diff_d); + cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost); INFO("l1_norm_A = %f, 
l2_norm_A = %f, l1_diff = %f, l2_diff = %f \n", - l1_norm_A, l2_norm_A,l1_diff, l2_diff); + l1_norm_A, l2_norm_A, l1_diff, l2_diff); // Relative L1 and Mean L1 norms of the difference Matrix float mean_l1 = l1_diff / x->num_elems; @@ -420,34 +391,32 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ float relative_l2 = diff_root / norm_root_A; // Packing computed norms in Norm_t struct - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); - // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); + // Mean metrics - not normalized for the distribution - suitable for precision + // tuning hardware norms->mean_l1 = mean_l1; norms->mean_l2 = mean_l2; norms->orig_inf_norm = 0.0; - // Relative metrics (relative to distribution) + // Relative metrics (relative to distribution) norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; - norms->inf_norm = 0.0; - + norms->inf_norm = 0.0; + INFO("l1_norm = %f \n", relative_l1); INFO("l2_norm = %f \n", relative_l2); return norms; } - - - // Compute Norms on the GPU -Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNormsGPU(Tensor *x, Tensor *x_orig) { hostToDeviceCopy(x); hostToDeviceCopy(x_orig); // FIXIT: Move all floats to doubles - overflow is possible - + double l1_norm_A; double l2_norm_A; @@ -459,27 +428,26 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ double *l2_norm_A_d; double *l1_diff_d; double *l2_diff_d; - - cudaMalloc( (void**) &l1_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l2_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l1_diff_d, sizeof(double)); - cudaMalloc( (void**) &l2_diff_d, sizeof(double)); - - - float* arr1 = (float*) x->gpu_data; - float* arr2 = (float*) x_orig->gpu_data; + + cudaMalloc((void **)&l1_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l2_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l1_diff_d, sizeof(double)); + cudaMalloc((void 
**)&l2_diff_d, sizeof(double)); + + float *arr1 = (float *)x->gpu_data; + float *arr2 = (float *)x_orig->gpu_data; int blockSize = 1024; - int gridSize = (int) ceil ((float) x->num_elems / blockSize); + int gridSize = (int)ceil((float)x->num_elems / blockSize); INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); - normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); + normComputeKernel<<<gridSize, blockSize>>>( + arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost); - // Relative L1 and Mean L1 norms of the difference Matrix float mean_l1 = l1_diff / x->num_elems; @@ -492,8 +460,9 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ float relative_l2 = diff_root / norm_root_A; // Packing computed norms in Norm_t struct - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); - // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); + // Mean metrics - not normalized for the distribution - suitable for precision + // tuning hardware norms->mean_l1 = mean_l1; norms->mean_l2 = mean_l2; norms->orig_inf_norm = 0.0; @@ -501,355 +470,47 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ // Relative metrics (relative to distribution) - suitable for PROMISE norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; - norms->inf_norm = 0.0; - + norms->inf_norm = 0.0; + INFO("l1_norm = %f \n", relative_l1); INFO("l2_norm = %f \n", relative_l2); return norms; } - - - -__global__ void vecConstMul(float* A, float mul_factor, int n){ +__global__ void vecConstMul(float *A, float mul_factor, int 
n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - A[id] = A[id] * mul_factor; + if (id < n) + A[id] = A[id] * mul_factor; } - -__global__ void vecRound(float* A, int n){ +__global__ void vecRound(float *A, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - A[id] = roundf(A[id]); + if (id < n) + A[id] = roundf(A[id]); } - -__global__ void vecConstDiv(float* A, float div_factor, int n){ +__global__ void vecConstDiv(float *A, float div_factor, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - A[id] = A[id] / div_factor; + if (id < n) + A[id] = A[id] / div_factor; } - - -__global__ void vecMul(float* A, float* B, int n){ +__global__ void vecMul(float *A, float *B, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - B[id] = A[id] * B[id]; -} - - -/**** ERROR injecion routines ******/ - -void initRandValues(Tensor* bias, int error_scale){ - - float scaling_values[20]; - - // FIXIT: Error knob 0 should be 0 zero - scaling_values[0] = 0.000; - scaling_values[1] = 0.0005; - scaling_values[2] = 0.03; - scaling_values[3] = 0.06; - scaling_values[4] = 0.08; - scaling_values[5] = 0.105; - scaling_values[6] = 0.134; - scaling_values[7] = 0.16; - scaling_values[8] = 0.2; - scaling_values[9] = 0.23; - scaling_values[10] = 0.26; - scaling_values[11] = 0.3; - scaling_values[12] = 0.35; - scaling_values[13] = 0.4; - scaling_values[14] = 0.45; - scaling_values[15] = 0.55; - scaling_values[16] = 0.65; - scaling_values[17] = 0.7; - scaling_values[18] = 0.8; - scaling_values[19] = 0.9; - - - curandGenerator_t gen; - - struct timespec ts; - - if(timespec_get(&ts, TIME_UTC) == 0){ - printf("crashed \n"); - abort(); - } - - curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); - - curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec); - - curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems, 0.0, 1.0 * scaling_values[error_scale]); - + if (id < n) + B[id] = A[id] * B[id]; } - - -void 
initRandValues2(Tensor* bias, int error_scale){ - - float scaling_values[20]; - - // FIXIT: Error knob 0 should be 0 zero - scaling_values[0] = 0.000; - scaling_values[1] = 0.0005; - scaling_values[2] = 0.0008; - scaling_values[3] = 0.001; - scaling_values[4] = 0.005; - scaling_values[5] = 0.01; - scaling_values[6] = 0.02; - scaling_values[7] = 0.03; - scaling_values[8] = 0.04; - scaling_values[9] = 0.05; - scaling_values[10] = 0.06; - scaling_values[11] = 0.08; - scaling_values[12] = 0.1; - scaling_values[13] = 0.12; - scaling_values[14] = 0.15; - scaling_values[15] = 0.2; - scaling_values[16] = 0.55; - scaling_values[17] = 0.6; - scaling_values[18] = 0.65; - scaling_values[19] = 0.7; - - - curandGenerator_t gen; - - struct timespec ts; - - if(timespec_get(&ts, TIME_UTC) == 0){ - printf("crashed \n"); - abort(); - } - - curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); - - curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec); - - curandGenerateNormal(gen, (float*) bias->gpu_data, bias->num_elems, 0.0, 1.0 * scaling_values[error_scale]); - -} - - -void* addBitError(void* x_ptr, int error_scale){ - - if(error_scale > 6 || error_scale < 0){ - ERROR("Error Scale out of bounds \n"); - } - - INFO("*** TensorBitError \n"); - profileEvent("tensorBitError"); - - Tensor* x = (Tensor*) x_ptr; - - size_t* dim_sizes = x->dims.dim_sizes; - Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format, - dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - - // Copying x data into x_original - for computing Norms - tensorCopy(x, x_original); - - // Quadratic Error - float freq_factors[6]; - freq_factors[0] = 0.1; - freq_factors[1] = 0.2; - freq_factors[2] = 0.4; - freq_factors[3] = 0.6; - freq_factors[4] = 0.8; - freq_factors[5] = 1.0; - - float error_freq = freq_factors[error_scale]; - - deviceToHostCopy(x); - - unsigned char* data_arr = reinterpret_cast<unsigned char*>(x->host_data); - // FIXIT: Need to be careful about floating point 
datatype assumptions - long int total_bytes = x->size_in_bytes; - long int error_iterations = total_bytes * 0.01 * error_freq; - INFO("total_bytes = %lu, error_iterations = %lu \n", total_bytes, error_iterations); - - srand(time(NULL)); - - for(int i = 0; i < error_iterations; i++){ - // FIXIT: The rand() is only specific to int - need long - long int index = rand() % total_bytes; - int N = 5; // The operation below flips the Nth bit - unsigned char fil = 1UL << N; - unsigned char val = data_arr[index]; - char flipped = val^fil; - data_arr[i] = flipped; - } - - - Norm_t* norms = calculateNorms2(x, x_original); - - - profileEvent("tensorBitError_end", true); - - return (void*) norms; - -} - - -void randomCeilAndFloor(float* x, size_t num_elems){ - - INFO("randomCeilAndFloor\n"); - - std::random_device rd; - std::mt19937 mt(rd()); - std::normal_distribution<float> distribution(0.0, 1.0); - - for(size_t i = 0; i < num_elems; i++){ - float rand_num = distribution(mt); - int val = abs(((int) rand_num) % 2); - if(val == 0) - x[i] = floor(x[i]); - else if(val == 1) - x[i] = ceil(x[i]); - } - -} - -// Routine for Adding RoundOff Errors -void* addRoundError(void* x_ptr, int error_scale){ - - if(error_scale > 11 || error_scale < 0){ - ERROR("Error Scale out of bounds \n"); - } - - INFO("*** TensorRoundError \n"); - profileEvent("tensorRoundError"); - - Tensor* x = (Tensor*) x_ptr; - - size_t* dim_sizes = x->dims.dim_sizes; - Tensor* x_original = (Tensor*) create4DTensor(x->data_type, x->data_format, - dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - - // Copying x data into x_original - for computing Norms - tensorCopy(x, x_original); - - float round_factors[12]; - round_factors[0] = 1000000; // FIXIT: This should be zero error - round_factors[1] = 100; - round_factors[2] = 10; - round_factors[3] = 7; // Beyond this point, the error function is linear - round_factors[4] = 3; - round_factors[5] = 1; - round_factors[6] = 0.7; - round_factors[7] = 0.3; - 
round_factors[8] = 0.1; - round_factors[9] = 0.07; - round_factors[10] = 0.03; - round_factors[11] = 0.01; - - // THINK: Considering using error magnitudes in this scenario - - - float round_factor = round_factors[error_scale]; - INFO("round_factor = %f \n", round_factor); - - hostToDeviceCopy(x); - - int blockSize = 128; - int gridSize = (int) ceil ((float) x->num_elems / blockSize); - INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); - - // NOTE: Check if a large gridSize will work with really large tensors - vecConstMul<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems); - //vecRound<<<gridSize, blockSize>>>((float*) x->gpu_data, x->num_elems); - - deviceToHostCopy(x); - randomCeilAndFloor((float*) x->host_data, x->num_elems); - hostToDeviceCopy(x); - - vecConstDiv<<<gridSize, blockSize>>>((float*) x->gpu_data, round_factor, x->num_elems); - - Norm_t* norms = calculateNorms2(x, x_original); - - profileEvent("tensorRoundError_end", true); - - return (void*) norms; -} - - - - -// Routine for Adding Gaussian Error -void* addGaussianError(void* x_ptr, int error_scale){ - - if(error_scale > 20 || error_scale < 0){ - ERROR("Error Scale out of bounds \n"); - } - - INFO("*** TensorAddError \n"); - profileEvent("tensorAddError"); - - Tensor* x = (Tensor*) x_ptr; - - size_t* dim_sizes = x->dims.dim_sizes; - Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format, - dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - - Tensor* x_original = (Tensor*) create4DTensor(x->cur_type, x->data_format, - dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - - // Copying x data into x_original - for computing Norms - tensorCopy(x, x_original); - - // NOTE: Error scale is used to generate the bias matrix - initRandValues(bias, error_scale); - - hostToDeviceCopy(x); - //hostToDeviceCopy(bias); - - - int blockSize = 1024; - int gridSize = (int) ceil ((float) x->num_elems / blockSize); - INFO("blockSize = %d, gridSize = %d 
\n", blockSize, gridSize); - - // NOTE: Check if a large gridSize will work with really large tensors - vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems); - - float alpha = 1.0f; - - // FIXIT: routine fails for 3D tensors - checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc, - bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data)); - - - //Norm_t* norms = calculateNorms2(x, x_original); - //Norm_t* norms = calculateNormsGPU(x, x_original); - - Norm_t* norms = calculateNormsTreeReduction(x, x_original); - - freeTensor(x_original); - freeTensor(bias); - - - profileEvent("tensorAddError_end", true); - - return (void*) norms; -} - - - -void initPromiseRandValues(Tensor* bias, int error_scale){ +void initPromiseRandValues(Tensor *bias, int error_scale) { float scaling_values[10]; @@ -859,98 +520,91 @@ void initPromiseRandValues(Tensor* bias, int error_scale){ scaling_values[2] = 0.336; scaling_values[3] = 0.21; scaling_values[4] = 0.168; - scaling_values[5] = 0.14; + scaling_values[5] = 0.14; scaling_values[6] = 0.11; scaling_values[7] = 0.0784; scaling_values[8] = 0.005; scaling_values[9] = 0.000; - curandGenerator_t gen; struct timespec ts; - if(timespec_get(&ts, TIME_UTC) == 0){ + if (timespec_get(&ts, TIME_UTC) == 0) { printf("crashed \n"); abort(); } curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec); - curandGenerateNormal(gen, - (float*) bias->gpu_data, - bias->num_elems, 0.0, - 1.0 * scaling_values[error_scale]); - + curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec ^ ts.tv_sec); + curandGenerateNormal(gen, (float *)bias->gpu_data, bias->num_elems, 0.0, + 1.0 * scaling_values[error_scale]); } - // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16 // Routine for Adding PROMISE bitline swing error -void* addPromiseError(void* x_ptr, int error_scale){ +void *addPromiseError(void *x_ptr, int error_scale) { - 
if(error_scale > 10 || error_scale < 0){ + if (error_scale > 10 || error_scale < 0) { ERROR("Error Scale out of bounds for PROMISE - 8 Swing values \n"); } - - INFO("*** addPromiseError \n"); + + INFO("*** addPromiseError \n"); profileEvent("addPromiseError"); - Tensor* x = (Tensor*) x_ptr; - - size_t* dim_sizes = x->dims.dim_sizes; - Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format, - dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - + Tensor *x = (Tensor *)x_ptr; + + size_t *dim_sizes = x->dims.dim_sizes; + Tensor *bias = + (Tensor *)create4DTensor(x->cur_type, x->data_format, dim_sizes[0], + dim_sizes[1], dim_sizes[2], dim_sizes[3]); + // NOTE: Error scale is used to generate the bias matrix - initPromiseRandValues(bias, error_scale); + initPromiseRandValues(bias, error_scale); hostToDeviceCopy(x); - //hostToDeviceCopy(bias); - + // hostToDeviceCopy(bias); + int blockSize = 1024; - int gridSize = (int) ceil ((float) x->num_elems / blockSize); + int gridSize = (int)ceil((float)x->num_elems / blockSize); INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); // NOTE: Check if a large gridSize will work with really large tensors - vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems); - + vecMul<<<gridSize, blockSize>>>((float *)x->gpu_data, (float *)bias->gpu_data, + x->num_elems); + float alpha = 1.0f; - //float beta = 0.0f; + // float beta = 0.0f; checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc, - bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data)); + bias->gpu_data, &alpha, x->tensor_desc, + x->gpu_data)); profileEvent("addPromiseError_end", true); - - return (void*) x; -} - - + return (void *)x; +} -__global__ void quantizeAndClip(float* A, int n, float mul_factor, float min, float max){ +__global__ void quantizeAndClip(float *A, int n, float mul_factor, float min, + float max) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n){ + if (id < n) { int temp 
= (A[id] - min) / mul_factor; float result = temp * 1.0 * mul_factor; result = result + min; A[id] = result; - if(A[id] > max){ + if (A[id] > max) { A[id] = max; } - if(A[id] < min){ + if (A[id] < min) { A[id] = min; } - } } - -__global__ void quantizeElem(float* A, int n, float mul_factor, float min){ +__global__ void quantizeElem(float *A, int n, float mul_factor, float min) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n){ + if (id < n) { int temp = (A[id] - min) / mul_factor; float result = temp * 1.0 * mul_factor; result = result + min; @@ -958,44 +612,27 @@ __global__ void quantizeElem(float* A, int n, float mul_factor, float min){ } } - -void* quantizeTensorPromise(void* input_ptr, float min, float max){ +void *quantizeTensorPromise(void *input_ptr, float min, float max) { INFO("QuantizeTensorPROMISE \n"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; - int quantize_range = 256; float input_range = max - min; float mul_factor = input_range / quantize_range; INFO("mul_factor = %f \n", mul_factor); int blockSize = 1024; - int gridSize = (int) ceil ((float) input->num_elems / blockSize); + int gridSize = (int)ceil((float)input->num_elems / blockSize); INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); hostToDeviceCopy(input); - quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data, - input->num_elems, mul_factor, min, max); + quantizeAndClip<<<gridSize, blockSize>>>( + (float *)input->gpu_data, input->num_elems, mul_factor, min, max); - return input; } - - -void* tensorAddError(void* x_ptr, int error_scale){ - - void * new_x = addGaussianError(x_ptr, error_scale); - //void * new_x = addRoundError(x_ptr, error_scale); - //void * new_x = addBitError(x_ptr, error_scale); - return new_x; -} - - - - } - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu index 
4392839f7f6dbca8df4352a19fdd689d6f8e3d5e..00334f8ecc821fdb3209e48aa94785aad0a54f37 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu @@ -1,7 +1,7 @@ //===--------------------------- fp16_gemm.cu -----------------------------===// // //===----------------------------------------------------------------------===// -// +// // This file consists of the custom implementation of quantization kernels. // This helps HPVM to switch compute precision for tensor operations between // FP32 and FP16. @@ -17,236 +17,199 @@ #include <cuda_fp16.h> #include "fp16_emu.h" - - inline cudaError_t checkCuda(cudaError_t result) { - if (result != cudaSuccess) - std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n"; - return result; + if (result != cudaSuccess) + std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n"; + return result; } inline cublasStatus_t checkCublas(cublasStatus_t result) { - if (result != CUBLAS_STATUS_SUCCESS) - std::cerr << "cuBLAS Error: " << result << "\n"; - return result; + if (result != CUBLAS_STATUS_SUCCESS) + std::cerr << "cuBLAS Error: " << result << "\n"; + return result; } template <typename T> -inline void printArray(const T * const __restrict__ array, +inline void printArray(const T *const __restrict__ array, const unsigned elements) { - for (unsigned i = 0; i < elements; i++) - std::cout << std::to_string(array[i]) << "\n"; + for (unsigned i = 0; i < elements; i++) + std::cout << std::to_string(array[i]) << "\n"; } // initialization template <typename T> -__global__ void initKernel(T * const __restrict__ array, +__global__ void initKernel(T *const __restrict__ array, const unsigned elements) { - const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < elements) - array[idx] = 1.2; + const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < elements) + array[idx] = 1.2; } template <typename T> 
-void init(T * const __restrict__ array, - const unsigned elements) { - const unsigned block_size = 512; - const unsigned num_blocks = (elements + block_size - 1) / block_size; - initKernel<<<num_blocks, block_size>>>(array, elements); - checkCuda(cudaDeviceSynchronize()); +void init(T *const __restrict__ array, const unsigned elements) { + const unsigned block_size = 512; + const unsigned num_blocks = (elements + block_size - 1) / block_size; + initKernel<<<num_blocks, block_size>>>(array, elements); + checkCuda(cudaDeviceSynchronize()); } // float to half -__global__ void f2hKernel(const float * const __restrict__ input, +__global__ void f2hKernel(const float *const __restrict__ input, const unsigned elements, - half * const __restrict__ output) { - const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < elements) - output[idx] = __float2half_rn(input[idx]); + half *const __restrict__ output) { + const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < elements) + output[idx] = __float2half_rn(input[idx]); } -void f2h(const float * const __restrict__ input, - const unsigned elements, - half * const __restrict__ output) { - const unsigned block_size = 512; - const unsigned num_blocks = (elements + block_size - 1) / block_size; - f2hKernel<<<num_blocks, block_size>>>(input, elements, output); - checkCuda(cudaDeviceSynchronize()); +void f2h(const float *const __restrict__ input, const unsigned elements, + half *const __restrict__ output) { + const unsigned block_size = 512; + const unsigned num_blocks = (elements + block_size - 1) / block_size; + f2hKernel<<<num_blocks, block_size>>>(input, elements, output); + checkCuda(cudaDeviceSynchronize()); } // half to float -__global__ void h2fKernel(const half * const __restrict__ input, +__global__ void h2fKernel(const half *const __restrict__ input, const unsigned elements, - float * const __restrict__ output) { - const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < 
elements) - output[idx] = __half2float(input[idx]); + float *const __restrict__ output) { + const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < elements) + output[idx] = __half2float(input[idx]); } -void h2f(const half * const __restrict__ input, - const unsigned elements, - float * const __restrict__ output) { - const unsigned block_size = 512; - const unsigned num_blocks = (elements + block_size - 1) / block_size; - h2fKernel<<<num_blocks, block_size>>>(input, elements, output); - checkCuda(cudaDeviceSynchronize()); +void h2f(const half *const __restrict__ input, const unsigned elements, + float *const __restrict__ output) { + const unsigned block_size = 512; + const unsigned num_blocks = (elements + block_size - 1) / block_size; + h2fKernel<<<num_blocks, block_size>>>(input, elements, output); + checkCuda(cudaDeviceSynchronize()); } -void sgemm(const float * const __restrict__ a, - const unsigned num_rows_a, - const unsigned num_cols_a, - const float * const __restrict__ b, - const unsigned num_rows_b, - const unsigned num_cols_b, - float * const __restrict__ c) { - const unsigned iterations = 10; - float kernel_time; - cudaEvent_t start; - cudaEvent_t stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cublasHandle_t handle; - checkCublas(cublasCreate(&handle)); - - // Enable Tensor Cores - checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); - - const float alpha_ = 1.0; - const float beta_ = 0.0; - const float *alpha = &alpha_; - const float *beta = &beta_; - - cudaEventRecord(start, 0); - for (unsigned i = 0; i < iterations; i++) { - checkCublas(cublasGemmEx(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - // Dimensions - num_rows_a, - num_cols_b, - num_cols_a, - alpha, - // A - a, - CUDA_R_32F, - num_rows_a, - // B - b, - CUDA_R_32F, - num_rows_b, - beta, - // C - c, - CUDA_R_32F, - num_rows_a, - // Compute precision and algorithm - CUDA_R_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - cudaEventRecord(stop, 0); - 
cudaEventSynchronize(stop); - cudaEventElapsedTime(&kernel_time, start, stop); - - std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations) << " ms\n"; +void sgemm(const float *const __restrict__ a, const unsigned num_rows_a, + const unsigned num_cols_a, const float *const __restrict__ b, + const unsigned num_rows_b, const unsigned num_cols_b, + float *const __restrict__ c) { + const unsigned iterations = 10; + float kernel_time; + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cublasHandle_t handle; + checkCublas(cublasCreate(&handle)); + + // Enable Tensor Cores + checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + + const float alpha_ = 1.0; + const float beta_ = 0.0; + const float *alpha = &alpha_; + const float *beta = &beta_; + + cudaEventRecord(start, 0); + for (unsigned i = 0; i < iterations; i++) { + checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, + // Dimensions + num_rows_a, num_cols_b, num_cols_a, alpha, + // A + a, CUDA_R_32F, num_rows_a, + // B + b, CUDA_R_32F, num_rows_b, beta, + // C + c, CUDA_R_32F, num_rows_a, + // Compute precision and algorithm + CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&kernel_time, start, stop); + + std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations) + << " ms\n"; } -void hgemm(const float * const __restrict__ af, - const unsigned num_rows_a, - const unsigned num_cols_a, - const float * const __restrict__ bf, - const unsigned num_rows_b, - const unsigned num_cols_b, - float * const __restrict__ cf) { - const unsigned iterations = 10; - - const unsigned num_elements_a = num_rows_a * num_cols_a; - const unsigned num_elements_b = num_rows_b * num_cols_b; - const unsigned num_elements_c = num_rows_a * num_cols_b; - - float to_fp16_time; - float to_fp32_time; - float kernel_time; - float total_time; - - cudaEvent_t start; - cudaEvent_t 
stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - half *a; - half *b; - half *c; - - checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a)); - checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b)); - checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c)); - - init(a, num_elements_a); - init(b, num_elements_b); - init(c, num_elements_c); - - // Convert floats to halfs - cudaEventRecord(start, 0); - f2h(af, num_elements_a, a); - f2h(bf, num_elements_b, b); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&to_fp16_time, start, stop); - - cublasHandle_t handle; - checkCublas(cublasCreate(&handle)); - checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); - - const half alpha_ = cpu_float2half_rn(1.0); - const half beta_ = cpu_float2half_rn(0.0); - const half *alpha = &alpha_; - const half *beta = &beta_; - - cudaEventRecord(start, 0); - for (unsigned i = 0; i < iterations; i++) { - checkCublas(cublasGemmEx(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - // Dimensions - num_rows_a, - num_cols_b, - num_cols_a, - alpha, - // A - a, - CUDA_R_16F, - num_rows_a, - // B - b, - CUDA_R_16F, - num_rows_b, - beta, - // C - c, - CUDA_R_16F, - num_rows_a, - // Compute precision and algorithm - CUDA_R_16F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&kernel_time, start, stop); - - cudaEventRecord(start, 0); - h2f(c, num_elements_c, cf); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&to_fp32_time, start, stop); - - total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time; - std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n"; - std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n"; - std::cout << "\tKernel : " << std::to_string(kernel_time / iterations) << " ms\n"; - std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n"; +void hgemm(const 
float *const __restrict__ af, const unsigned num_rows_a, + const unsigned num_cols_a, const float *const __restrict__ bf, + const unsigned num_rows_b, const unsigned num_cols_b, + float *const __restrict__ cf) { + const unsigned iterations = 10; + + const unsigned num_elements_a = num_rows_a * num_cols_a; + const unsigned num_elements_b = num_rows_b * num_cols_b; + const unsigned num_elements_c = num_rows_a * num_cols_b; + + float to_fp16_time; + float to_fp32_time; + float kernel_time; + float total_time; + + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + half *a; + half *b; + half *c; + + checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a)); + checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b)); + checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c)); + + init(a, num_elements_a); + init(b, num_elements_b); + init(c, num_elements_c); + + // Convert floats to halfs + cudaEventRecord(start, 0); + f2h(af, num_elements_a, a); + f2h(bf, num_elements_b, b); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&to_fp16_time, start, stop); + + cublasHandle_t handle; + checkCublas(cublasCreate(&handle)); + checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + + const half alpha_ = cpu_float2half_rn(1.0); + const half beta_ = cpu_float2half_rn(0.0); + const half *alpha = &alpha_; + const half *beta = &beta_; + + cudaEventRecord(start, 0); + for (unsigned i = 0; i < iterations; i++) { + checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, + // Dimensions + num_rows_a, num_cols_b, num_cols_a, alpha, + // A + a, CUDA_R_16F, num_rows_a, + // B + b, CUDA_R_16F, num_rows_b, beta, + // C + c, CUDA_R_16F, num_rows_a, + // Compute precision and algorithm + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&kernel_time, start, stop); + + cudaEventRecord(start, 0); + h2f(c, 
num_elements_c, cf); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&to_fp32_time, start, stop); + + total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time; + std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n"; + std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n"; + std::cout << "\tKernel : " << std::to_string(kernel_time / iterations) + << " ms\n"; + std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n"; } - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc index 4902043b7ce6a1240981224d98dc7dac70361500..aeb12e9f6e3fb56bfeaef3bd71bd2c3594fdcc08 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc @@ -47,5 +47,4 @@ std::string profile_data = ""; PerfParamSet *perfParamSet; SampParamSet *sampParamSet; -unsigned int currentTensorID = -1; - +unsigned int currentTensorID = ~0U; diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu index 4b49a3702b1938ceed9829cc3572474c7cb82420..6a3fcc12e014205aaf81e2cae0906ed6cfbff33e 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu @@ -1,14 +1,14 @@ -//===--------------------------- group_conv.cu -----------------------------===// +//===--------------------------- group_conv.cu +//-----------------------------===// // //===----------------------------------------------------------------------===// -// -// This file group convolutions with FP16 and FP32 compute precisions. +// +// This file group convolutions with FP16 and FP32 compute precisions. // Note that group convolutions, unlike regular convolutions, are not // approximable in any other way in HPVM. 
// //===----------------------------------------------------------------------===// - #include "tensor_utils.h" #include "fp16_gemm.h" #include "debug.h" @@ -17,31 +17,26 @@ #include "op_overheads.h" #include "error.h" +extern "C" { -extern "C"{ - - +__global__ void depthwise_convNew8( + float *const __restrict__ y, const float *const __restrict__ x, + const float *const __restrict__ w, const int B, const int M, const int H, + const int W, const int KH, const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, const int H_stride, const int W_stride) { -__global__ void depthwise_convNew8(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ - - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] +#define y4d(i3, i2, i1, i0) \ + y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] +#define x4d(i3, i2, i1, i0) \ + x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] const int num = 8; const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out); + + if (m < M) { + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); const int start_h = (tx / W_out) * H_stride - H_pad; const int start_w = (tx % W_out) * W_stride - W_pad; @@ -54,80 +49,73 @@ __global__ void depthwise_convNew8(float* const __restrict__ y, float c5 = 0; float c6 = 0; float c7 = 0; - - const float* weights = &w[m * KH * KW]; + + const float 
*weights = &w[m * KH * KW]; for (int k = 0; k < KH * KW; k++) { int p = k / KW; int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - if(b + 1 < B) - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - if(b + 2 < B) - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - if(b + 3 < B) - c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; - if(b + 4 < B) - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - if(b + 5 < B) - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - if(b + 6 < B) - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - if(b + 7 < B) - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - - + if (start_h + p > -1 && start_h + p < H && start_w + q > -1 && + start_w + q < W) { + + c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; + if (b + 1 < B) + c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; + if (b + 2 < B) + c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; + if (b + 3 < B) + c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; + if (b + 4 < B) + c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; + if (b + 5 < B) + c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; + if (b + 6 < B) + c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; + if (b + 7 < B) + c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; } } - y4d(b, m, 0, tx) = c0; - if(b + 1 < B) + y4d(b, m, 0, tx) = c0; + if (b + 1 < B) y4d(b + 1, m, 0, tx) = c1; - if(b + 2 < B) + if (b + 2 < B) y4d(b + 2, m, 0, tx) = c2; - if(b + 3 < B) + if (b + 3 < B) y4d(b + 3, m, 0, tx) = c3; - if(b + 4 < B) + if (b + 4 < B) y4d(b + 4, m, 0, tx) = c4; - if(b + 5 < B) + if (b + 5 < B) y4d(b + 5, m, 0, tx) = c5; - if(b + 6 < B) + if (b + 6 < B) y4d(b + 6, m, 0, tx) = c6; - if(b + 7 < B) + if (b + 7 < B) y4d(b + 7, m, 0, tx) = c7; } - - #undef y4d - #undef x4d -} - 
- +#undef y4d +#undef x4d +} -__global__ void depthwise_convNew8_half2(__half* const __restrict__ y, - const __half* const __restrict__ x, - const __half* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ +__global__ void depthwise_convNew8_half2( + __half *const __restrict__ y, const __half *const __restrict__ x, + const __half *const __restrict__ w, const int B, const int M, const int H, + const int W, const int KH, const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] +#define y4d(i3, i2, i1, i0) \ + y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] +#define x4d(i3, i2, i1, i0) \ + x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] const int num = 8; const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out); + + if (m < M) { + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); const int start_h = (tx / W_out) * H_stride - H_pad; const int start_w = (tx % W_out) * W_stride - W_pad; @@ -136,111 +124,112 @@ __global__ void depthwise_convNew8_half2(__half* const __restrict__ y, __half2 c1 = __half2half2(0); __half2 c2 = __half2half2(0); __half2 c3 = __half2half2(0); - - const __half* weights = &w[m * KH * KW]; + + const __half *weights = &w[m * KH * KW]; for (int k = 0; k < KH * KW; k++) { int p = k / KW; int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - 
start_w + q > -1 && start_w + q < W) { - - - __half2 t1; - __half2 t2; - __half2 t3; - __half2 t4; - if(b + 7 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q)); - } - else if(b + 6 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); - - } - else if(b + 5 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - } - else if(b + 4 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); - - } - else if(b + 3 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - } - else if(b + 2 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + 
q)); - - } - else if(b + 1 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - } - else{ - t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); - - } - - - c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); - c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); - c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2); - c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3); - + if (start_h + p > -1 && start_h + p < H && start_w + q > -1 && + start_w + q < W) { + + __half2 t1; + __half2 t2; + __half2 t3; + __half2 t4; + if (b + 7 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), + x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), + x4d(b + 6, m, start_h + p, start_w + q)); + } else if (b + 6 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), + x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); + + } else if (b + 5 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), + x4d(b + 4, m, start_h + p, start_w + q)); + } else if (b + 4 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, 
start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); + + } else if (b + 3 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + } else if (b + 2 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); + + } else if (b + 1 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + } else { + t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); + } + + c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); + c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); + c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2); + c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3); } } - y4d(b, m, 0, tx) = __high2half(c0); - if(b + 1 < B) + y4d(b, m, 0, tx) = __high2half(c0); + if (b + 1 < B) y4d(b + 1, m, 0, tx) = __low2half(c0); - if(b + 2 < B) + if (b + 2 < B) y4d(b + 2, m, 0, tx) = __high2half(c1); - if(b + 3 < B) + if (b + 3 < B) y4d(b + 3, m, 0, tx) = __low2half(c1); - if(b + 4 < B) + if (b + 4 < B) y4d(b + 4, m, 0, tx) = __high2half(c2); - if(b + 5 < B) + if (b + 5 < B) y4d(b + 5, m, 0, tx) = __low2half(c2); - if(b + 6 < B) + if (b + 6 < B) y4d(b + 6, m, 0, tx) = __high2half(c3); - if(b + 7 < B) + if (b + 7 < B) y4d(b + 7, m, 0, tx) = __low2half(c3); } - - #undef y4d - #undef x4d -} +#undef y4d +#undef x4d +} - -void* tensorConvCutlass(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - +void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int 
horizontal_stride, int conv_mode, int conv_groups) { INFO("*** TensorConvolution \n"); profileEvent("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } - Tensor* output; + Tensor *output; hostToDeviceCopy(input); hostToDeviceCopy(filter); @@ -248,43 +237,43 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, convertToFP32(input); convertToFP32(filter); - if (conv_groups > 32) { - // TODO: Support other cases; + // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); - int n, c, h, w; // output dimensions + int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; c = input->dims.dim_sizes[1]; const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + + 1; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / + horizontal_stride + + 1; - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - - int blockSize; blockSize = 64; - - dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); + + dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize); dim3 block(blockSize); - depthwise_convNew8<<<grid, block>>> 
((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); + depthwise_convNew8<<<grid, block>>>( + (float *)output->gpu_data, (float *)input->gpu_data, + (float *)filter->gpu_data, input->dims.dim_sizes[0], + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); - } - else { + } else { cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; @@ -297,152 +286,119 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - // TODO: Support other cases; + // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); // NOTE: Adding support for grouped convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - cudnnDataType_t computeType = CUDNN_DATA_FLOAT; // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? 
- checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision - - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE + // should be used? + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, + w); if (input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } else ERROR("Unsupported Tensor Type"); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the 
above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H " + "= %d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); if (convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! \n"); - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN + // support is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // FIXIT: Algo shouldn't be hardcoded convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + 
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); } cudaDeviceSynchronize(); profileEvent("Conv_end", true); - - #ifdef ERROR_INJECTION_ENABLED - - if (op_counter >= total_ops) { - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - - // Skip errorInjection if explicitly requested - if (skip_tensors.find(op_counter) != skip_tensors.end()) { - op_acc = 0; - } - - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorConv", op_acc); - add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); - - op_counter++; - - #endif - return output; - - } // FIXME: Need to properly fix the new HALF type conversion -void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ +void *tensorHalfConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { INFO("*** TensorHConvolution \n"); profileEvent("#Conv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor 
*input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; // FIXIT: Need to be more aware of the implications of alpha and beta @@ -454,33 +410,34 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); - // Float-Half Conversions profileEvent("F2H_start"); convertToFP16(input); - convertToFP16(filter); + convertToFP16(filter); profileEvent("F2H_end"); /******* END OF INPUT DATA CONVERSIONS*/ - Tensor *output; - if(conv_groups > 1){ + if (conv_groups > 1) { int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - + int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + + 1; + int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / + horizontal_stride + + 1; + + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, + w); - output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor @@ -488,117 +445,90 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, int blockSize; blockSize = 128; - dim3 grid(((n + 7)/ 8), (c * h * w + 
blockSize - 1)/ blockSize); + dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize); dim3 block(blockSize); - depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data, - (__half*) input->gpu_half_data, - (__half*) filter->gpu_half_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); + depthwise_convNew8_half2<<<grid, block>>>( + (__half *)output->gpu_half_data, (__half *)input->gpu_half_data, + (__half *)filter->gpu_half_data, input->dims.dim_sizes[0], + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); cudaDeviceSynchronize(); - - } - else{ + } else { checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } - + // NOTE: Adding support for grouped convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_half_desc, - filter->filter_half_desc, - &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, 
h = %d, w = %d \n", n, c, h, w); + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_half_desc, filter->filter_half_desc, &n, &c, &h, + &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, + w); + output = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); - output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, - output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W " + "= %d, C = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! 
\n"); - // NOTE: The following algo works with TRUE half precision convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_half_desc, - filter->filter_half_desc, - convDesc, - output->tensor_half_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_half_desc, filter->filter_half_desc, + convDesc, output->tensor_half_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace DEBUG("workspace size = %d \n", workspace_size); - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input->tensor_half_desc, - input->gpu_half_data, - filter->filter_half_desc, - filter->gpu_half_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, - output->tensor_half_desc, - output->gpu_half_data)); - + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data, + filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo, + workspace, workspace_size, &beta, output->tensor_half_desc, + output->gpu_half_data)); } - + profileEvent("H2F_start"); convertToFP32_offline(output); - - profileEvent("H2F_end"); + profileEvent("H2F_end"); profileEvent("#Conv_end"); - return output; - } - - - -}// End of Extern C - +} // End of Extern C diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu index e706080051a41dac1f7486027fcb9225793921bf..8324b18e044b37ee697a624e60ec77eb4bc7a8d5 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu +++ 
b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu @@ -1,9 +1,11 @@ -//===--------------------------- half_precision_api.cu --------------------------===// +//===--------------------------- half_precision_api.cu +//--------------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the custom implementation of tensor precision changing -// kernels useful for approximated and non-approximated versions of tensor +// +// This file consists of the custom implementation of tensor precision +// changing +// kernels useful for approximated and non-approximated versions of tensor // operations. This file also contains API for tensor operations operating on // tensors with half-precision. // @@ -12,7 +14,6 @@ #ifndef HALF_API_HEADER #define HALF_API_HEADER - #include <stdio.h> #include <stdarg.h> #include <cstdio> @@ -37,7 +38,6 @@ #include <cuda_fp16.h> #include <driver_types.h> - // Tensor runtime header files #include "../include/tensor_runtime.h" #include "../include/tensor_utils.h" @@ -48,15 +48,13 @@ #include "../include/fp16_gemm.h" #include "../include/fp16_conversion.h" - - -void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){ +void *tensorHalfGemm(void *lhs_ptr, void *rhs_ptr) { INFO("*** TensorHalfGemm \n"); profileEvent("#Mul"); - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims); DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims); @@ -64,65 +62,60 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){ hostToDeviceCopy(lhs); hostToDeviceCopy(rhs); - profileEvent("F2H_start"); convertToFP16(lhs); convertToFP16(rhs); - - profileEvent("F2H_end"); + profileEvent("F2H_end"); // 'm' holds the batch dimension - assuming NCHW format Tensors int m = lhs->dims.dim_sizes[0]; // The rhs last dimension must contain the neurons - int 
n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons int k = 1; - for (int j = 1 ; j < lhs->dims.num_dims; j++){ + for (int j = 1; j < lhs->dims.num_dims; j++) { k = k * lhs->dims.dim_sizes[j]; // input neurons } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; // Dimension-note: Check if k is same across the two tensors DEBUG("m = %d, n = %d, k = %d \n", m, n, k); - if(rhs_k != k){ + if (rhs_k != k) { ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); } - // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines - Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW, - m, n, 1, 1); + // NOTE: Creating a 4D tensor to be compatible with later called cuDNN + // routines + Tensor *output = + (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, m, n, 1, 1); changeTensorPlacement(output, DEVICE); - //convertToFP16(output); - + // convertToFP16(output); // INFO: cuBlas uses column-major format // INFO: The leading dimension is just the FIRST Dimension - // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects + // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN + // expects const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n, m, k, - alpha_half, - (__half*) rhs->gpu_half_data, CUDA_R_16F, n, - (__half*) lhs->gpu_half_data, CUDA_R_16F, k, - beta_half, - (__half*) output->gpu_half_data, CUDA_R_16F, n, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha_half, + (__half *)rhs->gpu_half_data, CUDA_R_16F, n, (__half *)lhs->gpu_half_data, + CUDA_R_16F, k, 
beta_half, (__half *)output->gpu_half_data, CUDA_R_16F, n, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); profileEvent("H2F_start"); convertToFP32_offline(output); - //h2f((half*) output_half->gpu_data, output->num_elems, (float*) output->gpu_data); + // h2f((half*) output_half->gpu_data, output->num_elems, (float*) + // output->gpu_data); profileEvent("H2F_end"); @@ -131,32 +124,28 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){ return output; } - - -void* tensorHalfGemmGPU(void* lhs_ptr, void* rhs_ptr){ +void *tensorHalfGemmGPU(void *lhs_ptr, void *rhs_ptr) { return tensorHalfGemm(lhs_ptr, rhs_ptr); } - - // FIXIT: Generalize all of the routines for types {half, float, double} -void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ +void *tensorHalfConvolution(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { INFO("*** TensorHConvolution \n"); profileEvent("#Conv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - if(conv_mode == 0) + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; // FIXIT: Need to be more aware of the implications of alpha and beta @@ -168,7 +157,6 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); - /***** CONVERSIONS from FP32 to FP16 - on the GPU */ profileEvent("F2H_start"); @@ -178,95 +166,76 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, profileEvent("F2H_end"); /******* END OF INPUT DATA CONVERSIONS*/ - - 
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } - + // NOTE: Adding support for grouped convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - // FIXIT: Think if upscaling values need to be configurable? // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE + // should be used? + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *output = + (Tensor *)create4DTensor((cudnnDataType_t)half_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor 
placement from host to device changeTensorPlacement(output, DEVICE); - //convertToFP16(output); + // convertToFP16(output); - // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, - output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = " + "%d, C = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_half_desc == NULL || - filter->filter_half_desc == NULL || output->tensor_half_desc == NULL) + if (convDesc == NULL || input->tensor_half_desc == NULL || + filter->filter_half_desc == NULL || output->tensor_half_desc == NULL) ERROR("NULL descriptor! \n"); - // NOTE: The following algo works with TRUE half precision convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_half_desc, - filter->filter_half_desc, - convDesc, - output->tensor_half_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_half_desc, filter->filter_half_desc, convDesc, + output->tensor_half_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace DEBUG("workspace size = %d \n", workspace_size); - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input->tensor_half_desc, - input->gpu_half_data, - filter->filter_half_desc, - 
filter->gpu_half_data, - convDesc, convAlgo, - workspace, workspace_size, - &beta, - output->tensor_half_desc, - output->gpu_half_data)); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data, + filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo, + workspace, workspace_size, &beta, output->tensor_half_desc, + output->gpu_half_data)); profileEvent("H2F_start"); @@ -279,21 +248,18 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, return output; } - - - -void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon){ +void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon) { INFO("*** TensorHalfBatchNorm \n"); profileEvent("#BatchNorm"); - Tensor* input = (Tensor*) input_ptr; - Tensor* gamma = (Tensor*) gamma_ptr; - Tensor* beta = (Tensor*) beta_ptr; - Tensor* mean = (Tensor*) mean_ptr; - Tensor* variance = (Tensor*) variance_ptr; - + Tensor *input = (Tensor *)input_ptr; + Tensor *gamma = (Tensor *)gamma_ptr; + Tensor *beta = (Tensor *)beta_ptr; + Tensor *mean = (Tensor *)mean_ptr; + Tensor *variance = (Tensor *)variance_ptr; + float alpha_val = 1.0f, beta_val = 0.0f; hostToDeviceCopy(input); hostToDeviceCopy(gamma); @@ -301,56 +267,37 @@ void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, hostToDeviceCopy(mean); hostToDeviceCopy(variance); - profileEvent("F2H_start"); convertToFP16(input); profileEvent("F2H_end"); - - - - checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL, - &alpha_val, &beta_val, - input->tensor_half_desc, - input->gpu_half_data, - input->tensor_half_desc, - input->gpu_half_data, - gamma->tensor_desc, gamma->gpu_data, - beta->gpu_data, mean->gpu_data, - variance->gpu_data, epsilon)); - + checkCUDNN(cudnnBatchNormalizationForwardInference( + cudnnHandle, 
CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val, + input->tensor_half_desc, input->gpu_half_data, input->tensor_half_desc, + input->gpu_half_data, gamma->tensor_desc, gamma->gpu_data, beta->gpu_data, + mean->gpu_data, variance->gpu_data, epsilon)); profileEvent("H2F_start"); convertToFP32_offline(input); - - profileEvent("H2F_end"); + profileEvent("H2F_end"); - profileEvent("#tensorHalfBatchNorm_end", true); - return input; } - - - -void* tensorHalfPooling(void* input_ptr, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride){ - - +void *tensorHalfPooling(void *input_ptr, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { INFO("*** TensorHalfPooling \n"); profileEvent("#Pool"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; hostToDeviceCopy(input); @@ -366,218 +313,185 @@ void* tensorHalfPooling(void* input_ptr, // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; - int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride; + int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / + vertical_stride; h = h + 1; - int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride; + int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / + horizontal_stride; w = w + 1; DEBUG("n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); // FIXIT: Don't be specific to floats - Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *output = + (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w); // Changing output tensor placement from host to device 
changeTensorPlacement(output, DEVICE); - //convertToFP16(output); + // convertToFP16(output); // FIXIT: Fix being specific to CUDNN_DATA_FLOAT and NCHW format // FIXIT: Is this setTensor even needed? checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_half_desc, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_HALF, - n, c, - h, w)); + CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n, + c, h, w)); cudnnPoolingMode_t pool_mode; - if(poolFunction == 0) + if (poolFunction == 0) pool_mode = CUDNN_POOLING_MAX; - else if(poolFunction == 1) + else if (poolFunction == 1) pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - // FIXIT: Make the pool function (max, min, avg) configurable - checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc, - pool_mode, - CUDNN_PROPAGATE_NAN, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride)); - - - checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, - input->tensor_half_desc, - input->gpu_half_data, &beta, - output->tensor_half_desc, output->gpu_half_data)); - + checkCUDNN(cudnnSetPooling2dDescriptor( + poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width, + vertical_pad, horizontal_pad, vertical_stride, horizontal_stride)); + checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, + input->tensor_half_desc, input->gpu_half_data, + &beta, output->tensor_half_desc, + output->gpu_half_data)); profileEvent("H2F_start"); convertToFP32_offline(output); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfPooling_end", true); return output; } - - - - -void* tensorHalfRelu2(void* input_ptr, float min, float max){ +void *tensorHalfRelu2(void *input_ptr, float min, float max) { INFO("*** TensorClippedRelu \n"); profileEvent("#Relu"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - //**** Floating point to half conversions profileEvent("F2H_start"); 
convertToFP16(input); - + profileEvent("F2H_end"); /*** End of data type conversion **/ - checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); - checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, - CUDNN_PROPAGATE_NAN, 2.0)); - - checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_half_desc, input->gpu_half_data, &beta, - input->tensor_half_desc, input->gpu_half_data)); + checkCUDNN(cudnnSetActivationDescriptor( + reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, 2.0)); + checkCUDNN(cudnnActivationForward( + cudnnHandle, reluDesc, &alpha, input->tensor_half_desc, + input->gpu_half_data, &beta, input->tensor_half_desc, + input->gpu_half_data)); profileEvent("H2F_start"); // NOTE: Transforming half precision output to single precision convertToFP32_offline(input); - + profileEvent("H2F_end"); profileEvent("#tensorHalfClippedRelu_end"); - return input; } - - - -void* tensorHalfRelu(void* input_ptr){ +void *tensorHalfRelu(void *input_ptr) { INFO("*** TensorHalfRelu \n"); profileEvent("#Relu"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - //**** Floating point to half conversions profileEvent("F2H_start"); convertToFP16(input); - + profileEvent("F2H_end"); /*** End of data type conversion **/ - checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); - checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_half_desc, input->gpu_half_data, &beta, - input->tensor_half_desc, input->gpu_half_data)); + checkCUDNN(cudnnActivationForward( + cudnnHandle, reluDesc, &alpha, input->tensor_half_desc, + input->gpu_half_data, &beta, input->tensor_half_desc, + input->gpu_half_data)); - profileEvent("H2F_start"); 
convertToFP32_offline(input); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfRelu_end"); - return input; } - - - - - -void* tensorHalfTanh(void* input_ptr){ +void *tensorHalfTanh(void *input_ptr) { INFO("*** TensorHalfTanh \n"); profileEvent("#Tanh"); - - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnActivationDescriptor_t tanhDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - //**** Data conversion from float to half profileEvent("F2H_start"); convertToFP16(input); - + profileEvent("F2H_end"); /**** End of data type conversion ****/ - checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc)); checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); - checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha, - input->tensor_half_desc, input->gpu_half_data, &beta, - input->tensor_half_desc, input->gpu_half_data)); + checkCUDNN(cudnnActivationForward( + cudnnHandle, tanhDesc, &alpha, input->tensor_half_desc, + input->gpu_half_data, &beta, input->tensor_half_desc, + input->gpu_half_data)); profileEvent("H2F_start"); convertToFP32_offline(input); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfTanh_end"); - return input; } +void *tensorHalfAdd(void *x_ptr, void *bias_ptr) { - -void* tensorHalfAdd(void* x_ptr, void* bias_ptr){ - - Tensor* x = (Tensor*) x_ptr; - Tensor* bias = (Tensor*) bias_ptr; + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; INFO("*** TensorHalfAdd \n"); profileEvent("#Add"); @@ -587,36 +501,29 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){ hostToDeviceCopy(x); hostToDeviceCopy(bias); - //**** Data conversion from float to half profileEvent("F2H_start"); convertToFP16(x); convertToFP16(bias); - + profileEvent("F2H_end"); /*** End of data type conversions ****/ - // FIXIT: routine fails for 3D tensors checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_half_desc, - 
bias->gpu_half_data, &alpha, - x->tensor_half_desc, x->gpu_half_data)); - + bias->gpu_half_data, &alpha, x->tensor_half_desc, + x->gpu_half_data)); profileEvent("H2F_start"); convertToFP32_offline(x); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfAdd_end"); - return x; } - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp index 02d1747328b65f30a20b2db2eecdb0f06f711bdf..5e1fbc99197af7797620f80ffbbc5aa41ee63517 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp @@ -1,48 +1,45 @@ -//===--------------------------- hpvm-rt-controller.cpp ---------------------===// +//===--------------------------- hpvm-rt-controller.cpp +//---------------------===// // //===----------------------------------------------------------------------===// -// -// This file contains code for that allows the tensor runtime to adapt +// +// This file contains code for that allows the tensor runtime to adapt // in response to external changes in conditions (such as frequency changes) // by helping to choose correct approximation configurations. It also provides // routines for the rest of the runtime to get performance and energy profiling. 
// //===----------------------------------------------------------------------===// - #include "hpvm-rt-controller.h" -#include "img_tensor_utils.h" #include "global_data.h" #include <fstream> //-------- Functionality to read and update frequency on Jetson board -------// /*const char* available_freqs[] = {"140250000", "229500000", "318750000", - "408000000", "497250000", "586500000", + "408000000", "497250000", "586500000", "675750000", "765000000", "854250000", "943500000", "1032750000", "1122000000", "1211250000", "1300500000"}; */ - const int available_freqs[] = { -140250000, // 0 -229500000, // 1 -318750000, // 2 -408000000, // 3 -497250000, // 4 -586500000, // 5 -675750000, // 6 -765000000, // 7 -854250000, // 8 -943500000, // 9 -1032750000,// 10 -1122000000,// 11 -1211250000,// 12 -1300500000 // 13 + 140250000, // 0 + 229500000, // 1 + 318750000, // 2 + 408000000, // 3 + 497250000, // 4 + 586500000, // 5 + 675750000, // 6 + 765000000, // 7 + 854250000, // 8 + 943500000, // 9 + 1032750000, // 10 + 1122000000, // 11 + 1211250000, // 12 + 1300500000 // 13 }; - /*void updateJetsonGPUFreq(int freq_level) { if (freq_level < 0 || freq_level > 13) { @@ -50,7 +47,7 @@ const int available_freqs[] = { abort(); } - const char* freq_val = available_freqs[freq_level]; + const char* freq_val = available_freqs[freq_level]; printf("freq-val[0] = %s \n", freq_val); FILE* max_file = @@ -60,7 +57,7 @@ const int available_freqs[] = { } fwrite(freq_val, strlen(freq_val), 1, max_file); fclose(max_file); - + FILE* min_file = fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+"); if (min_file == NULL){ @@ -81,7 +78,7 @@ unsigned long int readJetsonGPUFreq() { char buf[50]; char* ptr; - + fread(buf, 50, 1, cur_freq_file); unsigned long cur_freq = strtoul(buf, &ptr, 10); fclose(cur_freq_file); @@ -90,14 +87,15 @@ unsigned long int readJetsonGPUFreq() { */ - // Sets frequency void setFreq(unsigned freq_index) { unsigned target_freq = available_freqs[freq_index]; - 
- const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; - const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; + + const char *const min_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; + const char *const max_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; std::ofstream min_stream; std::ofstream max_stream; @@ -116,7 +114,8 @@ void setFreq(unsigned freq_index) { unsigned recordFreq() { // Current frequency file - const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; + const char *const cur_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; std::ifstream cur_stream; cur_stream.open(cur_freq_file, std::ifstream::in); @@ -129,10 +128,6 @@ unsigned recordFreq() { return cur_freq; } - - - - //---------------------------------------------------------------------------// /* @@ -146,13 +141,13 @@ bool fileExists(const std::string &file) { // There will be no frequency request for the first batch // Therefore, we skip the first element by initializing to 1, not 0. 
-FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) : - idx_list(il), rep_factor(rf), count(1), idx(0) {} +FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) + : idx_list(il), rep_factor(rf), count(1), idx(0) {} unsigned FrequencyIndexList::getNextIndex() { if (count == rep_factor) { count = 0; - idx = (idx+1) % idx_list.size(); + idx = (idx + 1) % idx_list.size(); } count++; return idx_list[idx]; @@ -219,7 +214,7 @@ void ProfileInfo::readIterationFrequency() { frequency_current_iteration = recordFreq(); #else frequency_current_iteration = 0; -#endif //JETSON_EXECUTION +#endif // JETSON_EXECUTION } unsigned long ProfileInfo::getIterationFrequency() { @@ -288,15 +283,14 @@ void ProfileInfo::printToFile() { // to have equal sizes, in outer and inner vectors both, // and all time_info and energy_info vectors must have the same size. unsigned iterations = tensor_time_info.size(); - CUSTOM_ASSERT( - (tensor_time_info.size() == iterations) && - (tensor_energy_info.size() == iterations) && - (control_time_info.size() == iterations) && - (control_energy_info.size() == iterations) && - (config_time_info.size() == iterations) && - (config_energy_info.size() == iterations) && - (frequency_info.size() == iterations) && - "time_info, energy_info, frequency_info size: \ + CUSTOM_ASSERT((tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + (frequency_info.size() == iterations) && + "time_info, energy_info, frequency_info size: \ iteration number does not match."); for (unsigned i = 0; i < tensor_time_info.size(); i++) { @@ -346,8 +340,8 @@ ProfileInfo::ProfileInfo() time_control_current_iteration(0.0), time_config_current_iteration(0.0), energy_compute_current_iteration(0.0), energy_control_current_iteration(0.0), - 
energy_config_current_iteration(0.0), - frequency_current_iteration(0), in_iteration(false) {} + energy_config_current_iteration(0.0), frequency_current_iteration(0), + in_iteration(false) {} Slowdowns::Slowdowns() { idx = 0; @@ -389,52 +383,50 @@ void RuntimeController::stop_profiler() { profiler->stop_profiler(); } // For testing purposes only - do not use widely -std::vector<struct Configuration *> &RuntimeController:: -getSpeedupConfigurations() { +std::vector<struct Configuration *> & +RuntimeController::getSpeedupConfigurations() { return SpeedupConfigurations; } // For testing purposes only - do not use widely -std::vector<struct Configuration *> &RuntimeController:: -getEnergyConfigurations() { +std::vector<struct Configuration *> & +RuntimeController::getEnergyConfigurations() { return EnergyConfigurations; } // For testing purposes only - do not use widely -std::vector<struct Configuration *> &RuntimeController:: -getThreeDCurveConfigurations() { +std::vector<struct Configuration *> & +RuntimeController::getThreeDCurveConfigurations() { return ThreeDCurveConfigurations; } // For testing purposes only - do not use widely unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; } double RuntimeController::getCurrentConfigurationSpeedup() { - return (double) (*Configurations)[configurationIdx]->speedup; + return (double)(*Configurations)[configurationIdx]->speedup; } double RuntimeController::getCurrentConfigurationEnergy() { - return (double) (*Configurations)[configurationIdx]->energy; + return (double)(*Configurations)[configurationIdx]->energy; } double RuntimeController::getCurrentConfigurationAccuracy() { - return (double) (*Configurations)[configurationIdx]->accuracy; + return (double)(*Configurations)[configurationIdx]->accuracy; } double RuntimeController::getCurrentConfigurationAccuracyLoss() { - return (double) (*Configurations)[configurationIdx]->accuracyLoss; + return (double)(*Configurations)[configurationIdx]->accuracyLoss; 
} NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) { // if visc.node.id Not specified for this HPVM Node - if (currentTensorID == -1){ + if (currentTensorID == ~0U) { std::string s(data); // All nodes are expected to have a configuration return (*Configurations)[configurationIdx]->setup.at(s); - } - else{ - DEBUG("-- currentTensorID = \%u \n", currentTensorID); + } else { + DEBUG("-- currentTensorID = %u \n", currentTensorID); return (*Configurations)[configurationIdx]->idConfigMap.at(currentTensorID); } - } void RuntimeController::init(const char *Cstr) { @@ -443,7 +435,8 @@ void RuntimeController::init(const char *Cstr) { setProfileInfoFilename(Cstr); readConfigurationFile(Cstr); - // NOTE: Configurations is pareto-configs. InitialConfigurations is the full list (config file) + // NOTE: Configurations is pareto-configs. InitialConfigurations is the full + // list (config file) Configurations = NULL; computeParetoConfigurationPoints(); // compute3DParetoConfigurationPoints(); Not using 3D curve @@ -464,8 +457,10 @@ void RuntimeController::init(const char *Cstr) { // Pseudo random variable (when we did few experiments) // or true random numbers for probabilistic control pseudo_rd = 0.0; - std::random_device rd; //Will be used to obtain a seed for the random number engine - generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd() + std::random_device + rd; // Will be used to obtain a seed for the random number engine + generator = + std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd() distr = std::uniform_real_distribution<>(0.0, 1.0); g_freq = available_freqs[13]; @@ -487,8 +482,8 @@ void RuntimeController::end_iteration() { PI->end_iteration(); } -void RuntimeController::addToCurrentIterationComputeTime( - const char *s, double t) { +void RuntimeController::addToCurrentIterationComputeTime(const char *s, + double t) { if (PI) PI->addToCurrentIterationComputeTime(s, t); } @@ -503,8 +498,8 
@@ void RuntimeController::addToCurrentIterationConfigTime(double t) { PI->addToCurrentIterationConfigTime(t); } -void RuntimeController::addToCurrentIterationComputeEnergy( - const char *s, double e) { +void RuntimeController::addToCurrentIterationComputeEnergy(const char *s, + double e) { if (PI) PI->addToCurrentIterationComputeEnergy(s, e); } @@ -542,8 +537,8 @@ void RuntimeController::updateFrequency() { //--- updateJetsonGPUFreq(freq_idx); setFreq(freq_idx); - -#endif //JETSON_EXECUTION + +#endif // JETSON_EXECUTION } void RuntimeController::writeProfileInfo() { @@ -576,11 +571,9 @@ std::pair<double, double> RuntimeController::fc_profile( const unsigned num_rows_a, const unsigned num_cols_a, const unsigned num_rows_b, const unsigned num_cols_b, const unsigned voltage_swing, const unsigned patch_factor) { - return ( - promise ? promise->fc_profile( - num_rows_a, num_cols_a, num_rows_b, num_cols_b, - voltage_swing, patch_factor) - : std::make_pair(0.0, 0.0)); + return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b, + num_cols_b, voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)); } std::pair<double, double> RuntimeController::conv_profile( @@ -588,17 +581,16 @@ std::pair<double, double> RuntimeController::conv_profile( const unsigned c_out, const unsigned c_in, const unsigned k_h, const unsigned k_w, const unsigned s_h, const unsigned s_w, const unsigned voltage_swing, const unsigned patch_factor) { - return ( - promise ? promise->conv_profile( - n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing, - patch_factor) - : std::make_pair(0.0, 0.0)); + return (promise ? 
promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w, + s_h, s_w, voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)); } // Constructor and descructor RuntimeController::RuntimeController() { configurationIdx = 0; - FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10); + FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + 10); #ifdef ACTIVE_PROFILING PI = new ProfileInfo(); profiler = new Profiler(); @@ -674,7 +666,6 @@ void RuntimeController::readConfigurationFile(const char *str) { abort(); } - bool readingConfiguration = false; bool readingFirstLine = false; // Read baseline_time from first line of configuration file @@ -682,16 +673,14 @@ void RuntimeController::readConfigurationFile(const char *str) { std::getline(qin, first_line); DEBUG("first_line: %s\n", first_line.c_str()); - try{ + try { baseline_time = std::stod(first_line); DEBUG("Baseline time: %lf\n\n", baseline_time); - } - catch(...){ + } catch (...) { ERROR("Please Add/Fix Baseline Time at Top of Config File.. 
"); } - - unsigned int firstTensorID = 1; + unsigned int firstTensorID = 1; for (std::string line; std::getline(qin, line);) { DEBUG("line: %s\n", line.c_str()); @@ -709,13 +698,11 @@ void RuntimeController::readConfigurationFile(const char *str) { if (tokens[0] == "+++++") { // Found new configuration start token // Mark the start of a new configuration - readingConfiguration = true; readingFirstLine = true; continue; } if (tokens[0] == "-----") { // Found configuration end token - readingConfiguration = false; // Mark the end of current configuration continue; } @@ -724,10 +711,10 @@ void RuntimeController::readConfigurationFile(const char *str) { // Read first line, to create the new configuration struct readingFirstLine = false; firstTensorID = 1; // reset first tensor ID for new config - - InitialConfigurations.push_back(Configuration( - tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), - std::stof(tokens[3]), std::stof(tokens[4]))); + + InitialConfigurations.push_back( + Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), + std::stof(tokens[3]), std::stof(tokens[4]))); continue; } @@ -735,9 +722,8 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG("Found gpu configuration\n"); // There must be at least one operation, with an approximation option - CUSTOM_ASSERT( - (tokens.size() >= 5) && - "Not enough operations - approximation options."); + CUSTOM_ASSERT((tokens.size() >= 5) && + "Not enough operations - approximation options."); GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration(); InitialConfigurations.back().setup.insert( @@ -748,7 +734,7 @@ void RuntimeController::readConfigurationFile(const char *str) { InitialConfigurations.back().idConfigMap.insert( std::make_pair(firstTensorID, NodeConf)); DEBUG("*** firstTensorID = %d \n\n", firstTensorID); - + unsigned idx = 2; while (idx < tokens.size()) { if (tokens[idx] == "add") { @@ -897,14 +883,13 @@ void RuntimeController::readConfigurationFile(const char 
*str) { // Update first TensorID using number of tensor ops in current node firstTensorID += NodeConf->getApproxChoices().size(); - + } else if (tokens[1] == "cpu") { DEBUG("Found gpu configuration\n"); // There must be at least one operation, with an approximation option - CUSTOM_ASSERT( - (tokens.size() >= 5) && - "Not enough operations - approximation options."); + CUSTOM_ASSERT((tokens.size() >= 5) && + "Not enough operations - approximation options."); CPUNodeConfiguration *NodeConf = new CPUNodeConfiguration(); InitialConfigurations.back().setup.insert( @@ -1020,9 +1005,8 @@ void RuntimeController::computeParetoConfigurationPoints() { // Sort the configurations according to accuracy loss INFO("Sorting autotuner configurations...\n"); - std::sort( - InitialConfigurations.begin() + 1, InitialConfigurations.end(), - ConfigurationLessThan()); + std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(), + ConfigurationLessThan()); INFO("Done sorting.\n"); for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) { @@ -1056,14 +1040,12 @@ void RuntimeController::computeParetoConfigurationPoints() { en_idx = i; } } - DEBUG( - "accuracy loss = %f, speedup = %f, at sp_idx = %d\n", - InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); + DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n", + InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); // Found best speedup for this accuracy point (not dominated by any of // these). - DEBUG( - "accuracy loss = %f, energy = %f, at en_idx = %d\n", - InitialConfigurations[en_idx].accuracyLoss, en, en_idx); + DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n", + InitialConfigurations[en_idx].accuracyLoss, en, en_idx); // Found best energy for this accuracy point (not dominated by any of // these). 
@@ -1133,9 +1115,8 @@ void RuntimeController::compute3DParetoConfigurationPoints() { // Sort the configurations according to accuracy loss INFO("Sorting autotuner configurations...\n"); - std::sort( - InitialConfigurations.begin(), InitialConfigurations.end(), - ConfigurationLessThan()); + std::sort(InitialConfigurations.begin(), InitialConfigurations.end(), + ConfigurationLessThan()); INFO("Done sorting.\n"); for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) { @@ -1169,11 +1150,10 @@ void RuntimeController::compute3DParetoConfigurationPoints() { } } if (!dominated) { - DEBUG( - "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", - InitialConfigurations[i].accuracyLoss, - InitialConfigurations[i].speedup, InitialConfigurations[i].energy, - i); + DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", + InitialConfigurations[i].accuracyLoss, + InitialConfigurations[i].speedup, InitialConfigurations[i].energy, + i); Indices.push_back(i); } } @@ -1232,31 +1212,22 @@ void RuntimeController::printConfigurations( } } -unsigned long RuntimeController::getLastFrequency() { - return g_freq; -} +unsigned long RuntimeController::getLastFrequency() { return g_freq; } -void RuntimeController::setLastFrequency(unsigned long f) { - g_freq = f; -} +void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; } -double RuntimeController::getLastSpeedup() { - return g_speedup; -} +double RuntimeController::getLastSpeedup() { return g_speedup; } -void RuntimeController::setLastSpeedup(double s) { - g_speedup = s; -} +void RuntimeController::setLastSpeedup(double s) { g_speedup = s; } void RuntimeController::findNextConfiguration() { configurationIdx = (configurationIdx + 1) % Configurations->size(); - DEBUG( - "findNextConfiguration: Updated configurationIdx to %u.\n", - configurationIdx); + DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); } -void 
RuntimeController::findTargetConfiguration( - float goal, enum SEARCH_KIND sk) { +void RuntimeController::findTargetConfiguration(float goal, + enum SEARCH_KIND sk) { // We search in range begin(), end()-1 . It is OK to decrement end(), because // the configurations vector always points to one of the pareto curves, and // they are never empty - we have always pushed at least one configuration. @@ -1267,25 +1238,25 @@ void RuntimeController::findTargetConfiguration( case SPEEDUP: { // Assigning one of Pareto configs to 'Configurations' class attribute Configurations = &SpeedupConfigurations; - low_it = std::lower_bound( - Configurations->begin(), Configurations->end() - 1, goal, - ConfigurationLessThan_SP()); + low_it = + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_SP()); configurationIdx = low_it - Configurations->begin(); break; } case ENERGY: { Configurations = &EnergyConfigurations; - low_it = std::lower_bound( - Configurations->begin(), Configurations->end() - 1, goal, - ConfigurationLessThan_E()); + low_it = + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_E()); configurationIdx = low_it - Configurations->begin(); break; } case ACCURACY_LOSS: { Configurations = &SpeedupConfigurations; - low_it = std::lower_bound( - Configurations->begin(), Configurations->end() - 1, goal, - ConfigurationLessThan_AL()); + low_it = + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_AL()); if ((*low_it)->accuracyLoss > goal) --low_it; configurationIdx = low_it - Configurations->begin(); @@ -1300,9 +1271,8 @@ void RuntimeController::findTargetConfiguration( // After search, low_it points to the Configuration to the element with the // goal value or the immediately lower value if it does not exist - DEBUG( - "findTargetConfiguration: Updated configurationIdx to %u.\n", - configurationIdx); + DEBUG("findTargetConfiguration: 
Updated configurationIdx to %u.\n", + configurationIdx); } void RuntimeController::adjustTargetConfiguration(float goal) { @@ -1313,8 +1283,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) { // Find configuration before the selected one. // There is always one, unless goal is 1. Then, we would pick baseline, and // both upper and lower should be the same configuration, at index 0. - unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1 - : configurationIdx; + unsigned prev_conf_idx = + configurationIdx > 0 ? configurationIdx - 1 : configurationIdx; // Get the two configurations' speedup, and compute the appropriate ranges float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup; float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup; @@ -1333,32 +1303,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) { //***--- Probability adjustment strategy 1 ---***// // No big adjustments at edges of probability range -// float adjust_val = 0.0; -// if (low_pb < high_pb) { -// adjust_val = low_pb * 0.2; -// } else { -// adjust_val = high_pb * 0.2; -// } -// low_pb -= adjust_val; -// high_pb += adjust_val; + // float adjust_val = 0.0; + // if (low_pb < high_pb) { + // adjust_val = low_pb * 0.2; + // } else { + // adjust_val = high_pb * 0.2; + // } + // low_pb -= adjust_val; + // high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 2 ---***// // No big adjustment at high edge of probability range -// float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; -// low_pb -= adjust_val; -// high_pb += adjust_val; + // float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; + // low_pb -= adjust_val; + // high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 3 ---***// - //Similar to 2, but higher always increases, more significantly -// float adjust_val = low_pb * 0.5 > 0.1 ? 
0.1 : low_pb * 0.5; -// low_pb -= adjust_val; -// high_pb += adjust_val; + // Similar to 2, but higher always increases, more significantly + // float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5; + // low_pb -= adjust_val; + // high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 4 ---***// - //Similar to 2, but higher always increases, more significantly + // Similar to 2, but higher always increases, more significantly // Low end, high end a bit less aggressive than total range float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6; adjust_val = adjust_val > high_pb ? high_pb : adjust_val; @@ -1367,20 +1337,18 @@ void RuntimeController::adjustTargetConfiguration(float goal) { //***--- ---***// } - DEBUG( - "**---- adjustTargetConfiguration: upper conf = %s with probability: " - "%f.\n", - ((*Configurations)[configurationIdx]->name).c_str(), high_pb); - DEBUG( - "**---- adjustTargetConfiguration: lower conf = %s with probability: " - "%f.\n\n", - ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb); + DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: " + "%f.\n", + ((*Configurations)[configurationIdx]->name).c_str(), high_pb); + DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: " + "%f.\n\n", + ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb); // Select a random number from 0 to 1 // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1) // to the upper // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ; - //float rd = pseudo_rd; + // float rd = pseudo_rd; float rd = distr(generator); if (rd < low_pb) { // If the probability is in the low range @@ -1414,8 +1382,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() { //*** Methods to compute accuracy of a tensor by the runtime controller ***// uint32_t *labels_from_file = NULL; -uint32_t * -hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) 
{ +uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, + int end) { // Initialize buffer if (!labels_from_file) { @@ -1424,14 +1392,14 @@ hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) { ERROR("Data file %s is not found. Aborting...\n", labels_file); abort(); } - + // Get number of labels fseek(file, 0, SEEK_END); long size = ftell(file); fseek(file, 0, SEEK_SET); // return file pointer to beginning // Allocate memory for labels - labels_from_file = (uint32_t *) malloc(size); + labels_from_file = (uint32_t *)malloc(size); if (labels_from_file == NULL) { ERROR("Memory allocation for labels unsucessfull. Aborting...\n"); abort(); @@ -1488,10 +1456,10 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - - average_accuracy = accuracy + (average_accuracy * num_executations); + + average_accuracy = accuracy + (average_accuracy * num_executations); num_executations++; - average_accuracy = average_accuracy/num_executations; + average_accuracy = average_accuracy / num_executations; FILE *fp = fopen("final_accuracy", "w+"); if (fp != NULL) { @@ -1508,13 +1476,12 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { return accuracy; } - #define llvm_hpvm_invokeRtControl_BASE llvm_hpvm_invokeRtControl //#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl //#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl -extern "C" void llvm_hpvm_invokeRtControl_BASE( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1531,16 +1498,15 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE( 
RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO( - "current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ITERATE( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1564,16 +1530,15 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE( RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO( - "current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ADJUST( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1616,17 +1581,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST( RC->addToCurrentIterationConfigEnergy(pinfo2.second); //* */ - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", 
target_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, + const char *str, int start, + int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1670,17 +1635,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR( RC->addToCurrentIterationConfigEnergy(pinfo2.second); //* */ - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", target_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result, + const char *str, int start, + int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1707,21 +1672,20 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN( float next_conf_speedup = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("slowdown (target speedup) = %f\n", slowdown); INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO( - "Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); + INFO("Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), 
next_conf_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result, + const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1749,21 +1713,19 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR( float next_conf_speedup = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("slowdown (target speedup) = %f\n", slowdown); INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO( - "Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); + INFO("Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_RAND( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1781,9 +1743,8 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND( RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO( - "current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // 
Note the end of iteration RC->end_iteration(); @@ -1794,32 +1755,7 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) { std::ofstream of(path, std::ofstream::out | std::ofstream::app); if (!of.good()) ERROR("Cannot write to %s file", path); - for (float f: vec) + for (float f : vec) of << f << ' '; of << '\n'; } - -extern "C" void llvm_hpvm_imgInvokeRtControl(void* result, void *gold, int start, int end) { - RC->resume_profiler(); - - if (gold != nullptr) { - writeVectorToFile("psnr.txt", PSNR(gold, result)); - writeVectorToFile("ssim.txt", SSIM(gold, result)); - } - - // Read stats for iteration that was just completed - double current_iteration_time = RC->getCurrentIterationComputeTime(); - double current_iteration_energy = RC->getCurrentIterationComputeEnergy(); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationControlTime(pinfo.first); - RC->addToCurrentIterationControlEnergy(pinfo.second); - - INFO("current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); - - // Note the end of iteration - RC->end_iteration(); -} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu deleted file mode 100644 index 608950aa473948bc6c3663d88646c8080a5d56e1..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu +++ /dev/null @@ -1,638 +0,0 @@ -#include "approxhpvm_img_runtime_utils.h" -#include "debug.h" -#include "img_tensor_runtime.h" - -#include "functional/map.cuh" -#include "functional/reduce.cuh" -#include "tensor_utils.h" - -#include <cufft.h> -#include <cufftXt.h> -#include <thrust/device_vector.h> - -template <typename T> struct DivFunctor { - const T dividend; - - DivFunctor(float dividend) : dividend(dividend) {} - - __host__ __device__ T 
operator()(const T x) const { return x / dividend; } -}; - -// *** Runtime implementation *** // -void *tensorFft(void *input, bool inverse) { - // https://docs.nvidia.com/cuda/cufft/index.html#twod-complex-to-real-transforms - // Tensor checking - INFO("FFT\n"); - profileEvent("tensorFft"); - auto *t_input = (Tensor *)input; - int total_rank = t_input->dims.num_dims; - if (total_rank != 4) - ERROR("Only 4-dim tensor supported\n"); - // Dimensions - size_t *all_dims = t_input->dims.dim_sizes; - int height = all_dims[2], width = all_dims[3]; - int n_batch = int(all_dims[0]) * int(all_dims[1]); - // Prepare input data - hostToDeviceCopy(t_input); - // Create a 2D FFT plan - cufftHandle plan; - checkCUFFT(cufftCreate(&plan)); - // Output - Tensor *out_tensor = nullptr; - if (inverse) { - int fft_dim[2] = {height, (width - 1) * 2}; - auto *input_cuda = convertAndGetGPUData<cufftComplex>(t_input); - // Define output data - out_tensor = (Tensor *)create4DTensor( - (int)float_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height, - (width - 1) * 2); - changeTensorPlacement(out_tensor, DEVICE); - auto *output_cuda = convertAndGetGPUData<cufftReal>(out_tensor); - checkCUFFT(cufftPlanMany( - &plan, 2, fft_dim, nullptr, 1, 0, nullptr, 1, 0, CUFFT_C2R, n_batch)); - // Execute the plan - checkCUFFT(cufftExecC2R(plan, input_cuda, output_cuda)); - } else { - int fft_dim[2] = {height, width}; - auto *input_cuda = convertAndGetGPUData<cufftReal>(t_input); - // Define output data - out_tensor = (Tensor *)create4DTensor( - (int)float2_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height, - (width / 2 + 1)); - changeTensorPlacement(out_tensor, DEVICE); - auto *output_cuda = convertAndGetGPUData<cufftComplex>(out_tensor); - checkCUFFT(cufftPlanMany( - &plan, 2, fft_dim, nullptr, 1, 0, nullptr, 1, 0, CUFFT_R2C, n_batch)); - // Execute the plan - checkCUFFT(cufftExecR2C(plan, input_cuda, output_cuda)); - } - // Wait for the device to finish - checkCUDA(cudaDeviceSynchronize()); 
- - if (inverse) { - auto *output_cuda = convertAndGetGPUData<cufftReal>(out_tensor); - thrust::device_ptr<cufftReal> normalize_v(output_cuda); - size_t size = height * (width - 1) * 2; - DivFunctor<cufftReal> div(size); - thrust::transform(normalize_v, normalize_v + size, normalize_v, div); - } - // Release memory - cufftDestroy(plan); - profileEvent("tensorFft_end"); - return out_tensor; -} - -void *tensorFftHalf(void *input, bool inverse) { - // Tensor checking - INFO("FFTHalf\n"); - profileEvent("#tensorFft"); - auto *t_input = (Tensor *)input; - int total_rank = t_input->dims.num_dims; - if (total_rank != 4) - ERROR("Only 4-dim tensor supported\n"); - // Dimensions - size_t *all_dims = t_input->dims.dim_sizes; - int height = all_dims[2], width = all_dims[3]; - long long int n_batch = int(all_dims[0]) * int(all_dims[1]); - // Prepare input data - hostToDeviceCopy(t_input); - // Create a 2D FFT plan - cufftHandle plan; - checkCUFFT(cufftCreate(&plan)); - // Output - Tensor *out_tensor = nullptr; - if (inverse) { - long long int fft_dim[2] = {height, (width - 1) * 2}; - profileEvent("F2H_start"); - auto *input_cuda = convertAndGetGPUData<half2>(t_input); - profileEvent("F2H_end"); - // Define output data - out_tensor = (Tensor *)create4DTensor( - (int)half_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height, - (width - 1) * 2); - changeTensorPlacement(out_tensor, DEVICE); - auto *output_cuda = convertAndGetGPUData<half>(out_tensor); - size_t worksize = 0; - checkCUFFT(cufftXtMakePlanMany( - plan, 2, fft_dim, nullptr, 1, 0, CUDA_C_16F /*inputtype*/, nullptr, 1, - 0, CUDA_R_16F /*outputtype*/, n_batch, &worksize, - CUDA_C_16F /*executiontype*/ - )); - // Execute the plan - checkCUFFT(cufftXtExec(plan, input_cuda, output_cuda, CUFFT_INVERSE)); - } else { - long long int fft_dim[2] = {height, width}; - profileEvent("F2H_start"); - auto *input_cuda = convertAndGetGPUData<half>(t_input); - profileEvent("F2H_end"); - // Define output data - out_tensor = (Tensor 
*)create4DTensor( - (int)half2_type, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], height, - (width / 2 + 1)); - changeTensorPlacement(out_tensor, DEVICE); - auto *output_cuda = convertAndGetGPUData<half2>(out_tensor); - size_t worksize = 0; - checkCUFFT(cufftXtMakePlanMany( - plan, 2, fft_dim, nullptr, 1, 0, CUDA_R_16F /*inputtype*/, nullptr, 1, - 0, CUDA_C_16F /*outputtype*/, n_batch, &worksize, - CUDA_C_16F /*executiontype*/ - )); - // Execute the plan - checkCUFFT(cufftXtExec(plan, input_cuda, output_cuda, CUFFT_FORWARD)); - } - // Wait for the device to finish - checkCUDA(cudaDeviceSynchronize()); - - if (inverse) { - auto *output_cuda = convertAndGetGPUData<half>(out_tensor); - thrust::device_ptr<half> normalize_v(output_cuda); - size_t size = height * (width - 1) * 2; - DivFunctor<half> div(size); - thrust::transform(normalize_v, normalize_v + size, normalize_v, div); - - profileEvent("H2F_start"); - convertToFP32_offline(out_tensor); - out_tensor->data_type = out_tensor->cur_type; - profileEvent("H2F_end"); - } else { - profileEvent("H2F_start"); - convertToFloat2Offline(out_tensor); - profileEvent("H2F_end"); - } - // Release memory - cufftDestroy(plan); - - profileEvent("#tensorFft_end"); - return out_tensor; -} - -/* Implements template instantiations in reduce.cuh */ - -template <> -__device__ void reduceAlongDim<float>( - float *target, float *src, float init, float identity, void *func, - size_t num_irows, size_t dim_size) { - auto *binary_op = (NTo1MapF<float, 2>)func; - - float acc = init; - for (size_t col = 0; col < dim_size; ++col) { - acc = binary_op(acc, *src); - src += num_irows; - } - *target = acc; -} - -template <> -__device__ void parallelReduceAlongDim<float>( - float *target, float *src, float init, float identity, void *func, - size_t num_irows, size_t dim_size, size_t along_dim_tid, - size_t n_along_dim_threads) { - __shared__ float sbuf[CrossDimTh][AlongDimTh + 1]; // avoid bank conflict - float *this_buf_line = sbuf[threadIdx.y]; - - 
auto *binary_op = (NTo1MapF<float, 2>)func; - - float acc = init; - // Sequential reduction within a thread. - for (size_t col = along_dim_tid; col < dim_size; col += n_along_dim_threads) { - acc = binary_op(acc, src[col * num_irows]); - } - - this_buf_line[along_dim_tid] = acc; - - __syncthreads(); - - // Reduce intermediate values to single value. - for (size_t s = AlongDimTh >> 1u; s > 0; s >>= 1u) { - if (along_dim_tid < s) { - float arg1 = this_buf_line[along_dim_tid]; - float arg2 = this_buf_line[along_dim_tid + s]; - float res = binary_op(arg1, arg2); - this_buf_line[along_dim_tid] = res; - } - __syncthreads(); - } - - if (along_dim_tid == 0) { - *target = this_buf_line[0]; - } - __syncthreads(); -} - -static __device__ __forceinline__ half -reduceHalf2ToHalf(NTo1MapF<half2, 2> half2_f, half2 value) { - half2 high = __high2half2(value), low = __low2half2(value); - return __high2half(half2_f(high, low)); // Or the lower half, whatever -} - -template <> -__device__ void reduceAlongDim<half>( - half *target, half *src, half init, half identity, void *func, - size_t num_irows, size_t dim_size) { - auto *binary_op = (NTo1MapF<half2, 2>)func; - - half2 acc = __halves2half2(init, identity); - size_t twice_irows = num_irows << 1; - for (size_t col = 0; col < dim_size; col += 2) { - half higher = col + 1 < dim_size ? *(src + num_irows) : identity; - acc = binary_op(acc, __halves2half2(*src, higher)); - src += twice_irows; - } - *target = reduceHalf2ToHalf(binary_op, acc); -} - -template <> -__device__ void parallelReduceAlongDim<half>( - half *target, half *src, half init, half identity, void *func, - size_t num_irows, size_t dim_size, size_t along_dim_tid, - size_t n_along_dim_threads) { - __shared__ half2 sbuf[CrossDimTh][AlongDimTh + 1]; // avoid bank conflict - half2 *this_buf_line = sbuf[threadIdx.y]; - - auto *binary_op = (NTo1MapF<half2, 2>)func; - - // Sequential reduction within a thread. 
- half2 acc = __halves2half2(init, identity); - size_t src_stride = n_along_dim_threads * num_irows; - for (size_t col = along_dim_tid; col < dim_size; - col += (n_along_dim_threads << 1), src += src_stride << 1) { - half higher = - col + n_along_dim_threads < dim_size ? *(src + src_stride) : identity; - acc = binary_op(acc, __halves2half2(*src, higher)); - } - - this_buf_line[along_dim_tid] = acc; - - __syncthreads(); - - // Reduce intermediate values to single value. - for (size_t s = AlongDimTh >> 1u; s > 0; s >>= 1u) { - if (along_dim_tid < s) { - half2 arg1 = this_buf_line[along_dim_tid]; - half2 arg2 = this_buf_line[along_dim_tid + s]; - half2 res = binary_op(arg1, arg2); - this_buf_line[along_dim_tid] = res; - } - __syncthreads(); - } - - if (along_dim_tid == 0) { - *target = reduceHalf2ToHalf(binary_op, this_buf_line[0]); - } - __syncthreads(); -} - -template <> -__global__ void kernelMapBroadcast<float, 1>( - float *target, unsigned num_rows, void *func, float **srcs, - size_t *tail_strides) { - auto *n_ary_op = (NTo1MapF<float, 1>)func; - - unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x, - stride = gridDim.x * blockDim.x; - for (unsigned row = threadId; row < num_rows; row += stride) { - target[row] = n_ary_op(srcs[0][row]); - } -} - -template <> -__global__ void kernelMapBroadcast<float, 2>( - float *target, unsigned num_rows, void *func, float **srcs, - size_t *tail_strides) { - auto *n_ary_op = (NTo1MapF<float, 2>)func; - - unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x, - stride = gridDim.x * blockDim.x; - for (unsigned row = threadId; row < num_rows; row += stride) { - unsigned j0 = row / tail_strides[0], j1 = row / tail_strides[1]; - target[row] = n_ary_op(srcs[0][j0], srcs[1][j1]); - } -} - -template <> -__global__ void kernelMapBroadcast<float, 3>( - float *target, unsigned num_rows, void *func, float **srcs, - size_t *tail_strides) { - auto *n_ary_op = (NTo1MapF<float, 3>)func; - - unsigned threadId = blockIdx.x * blockDim.x 
+ threadIdx.x, - stride = gridDim.x * blockDim.x; - for (unsigned row = threadId; row < num_rows; row += stride) { - unsigned j0 = row / tail_strides[0], j1 = row / tail_strides[1], - j2 = row / tail_strides[2]; - target[row] = n_ary_op(srcs[0][j0], srcs[1][j1], srcs[1][j2]); - } -} - -template <> -__global__ void kernelMapBroadcast<half, 1>( - half *target, unsigned num_rows, void *func, half **srcs, - size_t *tail_strides) { - auto *op = (NTo1MapF<half2, 1>)func; - - unsigned threadId = (blockIdx.x * blockDim.x + threadIdx.x) << 1, - stride = (gridDim.x * blockDim.x) << 1; - unsigned row = threadId; - for (; row < num_rows - 1; row += stride) { - auto *m0 = (half2 *)&srcs[0][row], *m_out = (half2 *)&target[row]; - *m_out = op(*m0); - } - if (row == num_rows - 1) { - half2 result = op(__half2half2(srcs[0][row])); - target[row] = __high2half(result); - } -} - -template <> -__global__ void kernelMapBroadcast<half, 2>( - half *target, unsigned num_rows, void *func, half **srcs, - size_t *tail_strides) { - auto *op = (NTo1MapF<half2, 2>)func; - - unsigned o_row = (blockIdx.x * blockDim.x + threadIdx.x) << 1, - o_stride = gridDim.x * blockDim.x, o_stride2 = o_stride << 1; - for (; o_row < num_rows - 1; o_row += o_stride2) { - half2 *o_ptr = (half2 *)&target[o_row]; - half2 in[2]; - for (size_t i = 0; i < 2; i++) { - if (tail_strides[i] == 1) { - in[i] = __halves2half2(srcs[i][o_row], srcs[i][o_row + 1]); - } else { - unsigned i_row_l = o_row / tail_strides[i], - i_row_r = (o_row + 1) / tail_strides[i]; - if (i_row_l == i_row_r) - in[i] = __half2half2(srcs[i][i_row_l]); - else - in[i] = __halves2half2(srcs[i][i_row_l], srcs[i][i_row_r]); - } - } - *o_ptr = op(in[0], in[1]); - } - if (o_row == num_rows - 1) { - unsigned row0 = o_row / tail_strides[0], row1 = o_row / tail_strides[1]; - half2 v0 = __half2half2(srcs[0][row0]), v1 = __half2half2(srcs[1][row1]); - half2 result = op(v0, v1); - target[o_row] = __high2half(result); - } -} - -template <> -__global__ void 
kernelMapBroadcast<half, 3>( - half *target, unsigned num_rows, void *func, half **srcs, - size_t *tail_strides) { - auto *op = (NTo1MapF<half2, 3>)func; - - unsigned threadId = (blockIdx.x * blockDim.x + threadIdx.x) << 1, - stride = (gridDim.x * blockDim.x) << 1; - unsigned row = threadId; - for (; row < num_rows - 1; row += stride) { - unsigned row0 = row / tail_strides[0], row1 = row / tail_strides[1], - row2 = row / tail_strides[2]; - auto *m0 = (half2 *)&srcs[0][row0], *m1 = (half2 *)&srcs[1][row1], - *m2 = (half2 *)&srcs[2][row2], *m_out = (half2 *)&target[row]; - *m_out = op(*m0, *m1, *m2); - } - if (row == num_rows - 1) { - unsigned row0 = row / tail_strides[0], row1 = row / tail_strides[1], - row2 = row / tail_strides[2]; - half2 v0 = __half2half2(srcs[0][row0]), v1 = __half2half2(srcs[1][row1]), - v2 = __half2half2(srcs[2][row2]); - half2 result = op(v0, v1, v2); - target[row] = __high2half(result); - } -} - -void *tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio) { - INFO("Reduce\n"); - profileEvent("tensorReduce"); - auto *src = (Tensor *)input; - if (axis >= src->dims.num_dims) - ERROR("Dimension out of range\n"); - if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW) - ERROR("Not supported\n"); - Tensor *ret = reduceDim<float>(src, 0.0f, func, axis, skip_ratio); - profileEvent("tensorReduce_end"); - return ret; -} - -void * -tensorReduceHalf(void *input, size_t axis, MathOp func, float skip_ratio) { - INFO("Reduce\n"); - profileEvent("#tensorReduce"); - auto *src = (Tensor *)input; - if (axis >= src->dims.num_dims) - ERROR("Dimension out of range\n"); - if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW) - ERROR("Not supported\n"); - Tensor *ret = reduceDim<half>(src, 0.0f, func, axis, skip_ratio); - profileEvent("H2F_start"); - convertToFP32_offline(ret); - profileEvent("H2F_end"); - profileEvent("#tensorReduce_end"); - return ret; -} - -void *tensorProjectiveT(void *input, void *transformation) 
{ - ERROR("ProjectiveT operation currently unsupported.\n"); - abort(); -} - -void *tensorMap1(MathOp f, void *i) { - INFO("Map1\n"); - profileEvent("tensorMap1"); - auto *src = (Tensor *)i; - Tensor *ret = mapGeneral<float, 1>(f, {src}); - profileEvent("tensorMap1_end"); - return ret; -} - -void *tensorMap1Half(MathOp f, void *i) { - INFO("Map1Half\n"); - profileEvent("#tensorMap1"); - auto *src = (Tensor *)i; - Tensor *ret = mapGeneral<half, 1>(f, {src}); - profileEvent("H2F_start"); - convertToFP32_offline(ret); - profileEvent("H2F_end"); - profileEvent("#tensorMap1_end"); - return ret; -} - -void *tensorMap2(MathOp f2, void *i1, void *i2) { - INFO("Map2\n"); - profileEvent("tensorMap2"); - auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2; - Tensor_type_t common_ty = - getCompatibleType(src1->cur_type, src2->cur_type, false); - Tensor *ret = nullptr; - if (common_ty == float_type) - ret = mapGeneral<float, 2>(f2, {src1, src2}); - else if (common_ty == float2_type) - ret = mapGeneral<float2, 2>(f2, {src1, src2}); - else - ERROR("Type not recognized\n"); - profileEvent("tensorMap2_end"); - return ret; -} - -void *tensorMap2Half(MathOp f2, void *i1, void *i2) { - INFO("Map2Half\n"); - profileEvent("#tensorMap2"); - auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2; - Tensor_type_t common_ty = - getCompatibleType(src1->cur_type, src2->cur_type, false); - if (common_ty == float_type) { - Tensor *ret = mapGeneral<half, 2>(f2, {src1, src2}); - profileEvent("H2F_start"); - convertToFP32_offline(ret); - profileEvent("H2F_end"); - profileEvent("#tensorMap2_end"); - return ret; - } else if (common_ty == float2_type) { - Tensor *ret = mapGeneral<half2, 2>(f2, {src1, src2}); - profileEvent("H2F_start"); - convertToFloat2Offline(ret); - profileEvent("H2F_end"); - profileEvent("#tensorMap2_end"); - return ret; - } else { - ERROR("Type not recognized\n"); - return nullptr; // For some compilers - } -} - -void *tensorMap3(MathOp f3, void *i1, void *i2, void *i3) { - INFO("Map3\n"); 
- profileEvent("tensorMap3"); - auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2, *src3 = (Tensor *)i3; - Tensor *ret = mapGeneral<float, 3>(f3, {src1, src2, src3}); - profileEvent("tensorMap3_end"); - return ret; -} - -void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3) { - INFO("Map3Half\n"); - profileEvent("#tensorMap3"); - auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2, *src3 = (Tensor *)i3; - Tensor *ret = mapGeneral<half, 3>(f3, {src1, src2, src3}); - profileEvent("H2F_start"); - convertToFP32_offline(ret); - profileEvent("H2F_end"); - profileEvent("#tensorMap3_end"); - return ret; -} - -// *** Wrapper API implementation *** // - -void *wrapper_tensorFft(const char *hpvm_node_id, void *input, bool inverse) { - GPUNodeConfiguration *GPUConf = - (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector<std::pair< - GPUNodeConfiguration::TENSOR_OP, - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> - &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a fft operation - CUSTOM_ASSERT( - ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::FFT && - "Invalid configuration generated for tensor fft wrapper operation"); - return handleTensorFftApproximationTuples( - ApproxChoices[0].second, input, inverse); -} - -void *wrapper_tensorReduce( - const char *hpvm_node_id, void *input, int axis, int func) { - GPUNodeConfiguration *GPUConf = - (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector<std::pair< - GPUNodeConfiguration::TENSOR_OP, - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> - &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a reduce operation - CUSTOM_ASSERT( - ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::REDUCE && - "Invalid configuration generated for tensor reduce wrapper operation"); - return 
handleTensorReduceApproximationTuples( - ApproxChoices[0].second, input, axis, (MathOp)func); -} - -void *wrapper_tensorProjectiveT( - const char *hpvm_node_id, void *input, void *transformation) { - GPUNodeConfiguration *GPUConf = - (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector<std::pair< - GPUNodeConfiguration::TENSOR_OP, - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> - &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a projectiveT operation - CUSTOM_ASSERT( - ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T && - "Invalid configuration generated for tensor projectiveT " - "wrapper operation"); - return handleTensorProjectiveTApproximationTuples( - ApproxChoices[0].second, input, transformation); -} - -void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input) { - GPUNodeConfiguration *GPUConf = - (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector<std::pair< - GPUNodeConfiguration::TENSOR_OP, - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> - &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a map1 operation - CUSTOM_ASSERT( - ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP1 && - "Invalid configuration generated for tensor map1 wrapper operation"); - return handleTensorMap1ApproximationTuples( - ApproxChoices[0].second, (MathOp)func, input); -} - -void *wrapper_tensorMap2( - const char *hpvm_node_id, int func, void *input1, void *input2) { - GPUNodeConfiguration *GPUConf = - (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector<std::pair< - GPUNodeConfiguration::TENSOR_OP, - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> - &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a map2 operation - CUSTOM_ASSERT( - 
ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP2 && - "Invalid configuration generated for tensor map2 wrapper operation"); - return handleTensorMap2ApproximationTuples( - ApproxChoices[0].second, (MathOp)func, input1, input2); -} - -void *wrapper_tensorMap3( - const char *hpvm_node_id, int func, void *input1, void *input2, - void *input3) { - GPUNodeConfiguration *GPUConf = - (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector<std::pair< - GPUNodeConfiguration::TENSOR_OP, - std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> - &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a map3 operation - CUSTOM_ASSERT( - ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP3 && - "Invalid configuration generated for tensor map3 wrapper operation"); - return handleTensorMap3ApproximationTuples( - ApproxChoices[0].second, (MathOp)func, input1, input2, input3); -} - -// Tentative -void *wrapper_tensorStencil(const char *hpvm_node_id, void *input) { - ERROR("Stencil operation currently unsupported.\n"); - abort(); -} - -void *wrapper_tensorCosineT(const char *hpvm_node_id, void *input) { - ERROR("CosineT operation currently unsupported.\n"); - abort(); -} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp deleted file mode 100644 index b4e9e3fea8a2f0638267f6386698d5434a6b91fc..0000000000000000000000000000000000000000 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp +++ /dev/null @@ -1,445 +0,0 @@ -#include <algorithm> -#include <cmath> -#include <cstring> -#include <experimental/filesystem> -#include <numeric> -#include <sstream> -#include <string> - -#include "debug.h" -#include "device_math.h" -#include "functional/common.h" -#include "img_tensor_runtime.h" -#include "img_tensor_utils.h" -#include 
"tensor_utils.h" - -// Image I/O utilities -#define STB_IMAGE_IMPLEMENTATION -#define STB_IMAGE_WRITE_IMPLEMENTATION - -#include "image/stb_image.h" -#include "image/stb_image_write.h" - -static inline uint8_t *float_to_uint8(const float *fl, size_t count) { - auto *ret = new uint8_t[count]; - float max_v = *std::max_element(fl, fl + count), - min_v = *std::min_element(fl, fl + count); - if (max_v - min_v < 1e-3) { - for (size_t i = 0; i < count; i++) - ret[i] = 0; - } else { - float frac = 255 / (max_v - min_v); - for (size_t i = 0; i < count; i++) - ret[i] = uint8_t(frac * (fl[i] - min_v)); - } - return ret; -} - -static inline float *uint8_to_float(const uint8_t *ui, size_t len) { - auto *ret = new float[len]; - for (size_t i = 0; i < len; i++) - ret[i] = float(ui[i]) / 255; - return ret; -} - -static Tensor *to_nhwc(Tensor *t) { - if (t->data_format == CUDNN_TENSOR_NHWC) { - DEBUG("Tensor already in NHWC format, no conversion needed\n"); - return t; - } else if (t->data_format != CUDNN_TENSOR_NCHW) { - ERROR("Unknown tensor format: %s\n", std::to_string(t->data_format)); - } else { - DEBUG("Converting to NHWC format\n"); - } - - size_t *dim_arr = t->dims.dim_sizes; - size_t n = dim_arr[0], c = dim_arr[1], h = dim_arr[2], w = dim_arr[3]; - auto *out_tensor = - (Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NHWC, n, h, w, c); - size_t nhwc_offset = 0; - size_t element_size = getTypeSize(t->data_type); - char *out_data = (char *)(out_tensor->host_data), - *in_data = (char *)(t->host_data); - for (int n0 = 0; n0 < n; n0++) - for (int h0 = 0; h0 < h; h0++) - for (int w0 = 0; w0 < w; w0++) - for (int c0 = 0; c0 < c; c0++) { - size_t nc = n0 * c + c0, nch = nc * h + h0, nchw_idx = nch * w + w0, - nchw_offset = nchw_idx * element_size; - std::memcpy(out_data + nhwc_offset, in_data + nchw_offset, - element_size); - nhwc_offset += element_size; - } - return out_tensor; -} - -static Tensor *to_nchw(Tensor *t) { - if (t->data_format == CUDNN_TENSOR_NCHW) { - 
DEBUG("Tensor already in NCHW format, no conversion needed\n"); - return t; - } else if (t->data_format != CUDNN_TENSOR_NHWC) { - ERROR("Unknown tensor format: %s\n", std::to_string(t->data_format)); - } else { - DEBUG("Converting to NCHW format\n"); - } - size_t *dim_arr = t->dims.dim_sizes; - size_t n = dim_arr[0], h = dim_arr[1], w = dim_arr[2], c = dim_arr[3]; - Tensor *out_tensor = - (Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); - size_t nchw_offset = 0; - size_t element_size = getTypeSize(t->data_type); - char *out_data = (char *)(out_tensor->host_data), - *in_data = (char *)(t->host_data); - for (int n0 = 0; n0 < n; n0++) - for (int c0 = 0; c0 < c; c0++) - for (int h0 = 0; h0 < h; h0++) - for (int w0 = 0; w0 < w; w0++) { - size_t nh = n0 * h + h0, nhw = nh * w + w0, nhwc_idx = nhw * c + c0, - nhwc_offset = nhwc_idx * element_size; - std::memcpy(out_data + nchw_offset, in_data + nhwc_offset, - element_size); - nchw_offset += element_size; - } - return out_tensor; -} - -namespace fs = std::experimental::filesystem; - -// List all files in a folder. -static inline std::vector<std::string> listFiles(const std::string &folder) { - std::vector<std::string> ret; - for (const auto &entry : fs::directory_iterator(folder)) - ret.push_back(entry.path().string()); - std::sort(ret.begin(), ret.end()); - return ret; -} - -// return in[start:start+count] -template <typename T> -std::vector<T> sliceVector(const std::vector<T> &in, size_t start, - size_t count) { - auto slice_begin = in.begin() + start; - if (slice_begin > in.end()) - slice_begin = in.end(); - auto slice_end = count == std::string::npos ? in.end() : slice_begin + count; - if (slice_end > in.end()) - slice_end = in.end(); - return std::vector<T>(slice_begin, slice_end); -} - -// Read an image dataset from a folder with each image as a file. 
-Tensor *readDataSet(const char *path, size_t start, size_t count, - size_t n_color) { - INFO("Loading image dataset from path %s\n", path); - std::vector<std::string> filenames = - sliceVector(listFiles(path), start, count); - if (filenames.empty()) { - INFO("Folder is empty or slice is empty\n"); - return nullptr; - } - - auto *first_image = (Tensor *)loadAsImage(filenames[0].c_str(), n_color); - std::vector<size_t> sizes = ::sizes(first_image); - size_t h = sizes[2], w = sizes[3]; - DEBUG("Loading shape: (%lu, %lu, %lu, %lu)\n", filenames.size(), n_color, h, - w); - auto *batch = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, - filenames.size(), h, w, n_color); - size_t n_floats = n_color * h * w; - auto *base_data = (float *)batch->host_data; - for (const auto &path : filenames) { - int x, y, n; // x = width, y = height, n = # 8-bit components per pixel - uint8_t *data = stbi_load(path.c_str(), &x, &y, &n, n_color); - if (data == nullptr) - ERROR("Image load failed\n"); - if (x != h || y != w) { - std::ostringstream os; - os << "Image file " << path << " have different shape (" << x << ", " << y - << ")"; - ERROR("%s\n", os.str()); - } - float *converted = uint8_to_float(data, n_floats); - stbi_image_free(data); - std::memcpy(base_data, converted, n_floats * sizeof(float)); - delete[] converted; - base_data += n_floats; - } - auto *nchw_batch = to_nchw(batch); - DEBUG("Loaded all images.\n"); - return nchw_batch; -} - -// Convert complex-domain image to float valued image. 
-static Tensor *complexToFloat(Tensor *batch) { - convertAndGetGPUData<float2>(batch); // Convert to float2 - deviceToHostCopy(batch); - auto *in_data = (float2 *)batch->host_data; - size_t n_elem = batch->num_elems; - std::vector<float> magnitudes(n_elem, 0.0f); - for (size_t i = 0; i < batch->num_elems; i++) { - magnitudes[i] = hypot(in_data[i].x, in_data[i].y); - } - - size_t *dims = batch->dims.dim_sizes; - auto *ret = (Tensor *)create4DTensor(float_type, batch->data_format, dims[0], - dims[1], dims[2], dims[3]); - auto *out_data = (float *)ret->host_data; - for (size_t i = 0; i < magnitudes.size(); i++) { - float f = magnitudes[i]; - out_data[i] = f > 1.0f ? log(f) : 0; - } - return ret; -} - -// Save an image tensor image-by-image to a folder. -void saveDataSet(const char *path, Tensor *batch, size_t start_idx, - size_t write_n) { - INFO("Saving image dataset to path %s\n", path); - Tensor *float_batch = batch; - if (batch->data_type == float2_type || batch->data_type == half2_type) - float_batch = complexToFloat(float_batch); // Already copied - else { - DEBUG("Copying to CPU before printing\n"); - convertAndGetGPUData<float>(float_batch); - deviceToHostCopy(float_batch); - } - Tensor *converted_batch = float_batch; - if (converted_batch->data_format == CUDNN_TENSOR_NCHW) { - DEBUG("Copy-converting to NHWC format\n"); - converted_batch = to_nhwc(converted_batch); - } - std::vector<size_t> sizes = ::sizes(converted_batch); - size_t h = sizes[1], w = sizes[2], c = sizes[3]; - auto *base_data = (float *)converted_batch->host_data; - if (write_n == 0) - write_n = sizes[0]; - else - write_n = std::min(write_n, sizes[0]); - for (size_t i = start_idx; i < start_idx + write_n; i++) { - std::string name = path; - name += "/"; - std::string number = std::to_string(i); - // FIXME: pad to 6 digits. Ordering will break when we have more than 1M - // files. 
- number = std::string(6 - number.length(), '0') + number; - name += number + ".png"; - - uint8_t *ldr_data = float_to_uint8(base_data, h * w * c); - if (!stbi_write_png(name.c_str(), w, h, c, ldr_data, 0)) - ERROR("Write file failed\n"); - delete[] ldr_data; - - base_data += h * w * c; - } -} - -// Load 1 file as an image into a tensor. -void *loadAsImage(const char *filename, size_t n_color) { - INFO("Loading image from path=%s\n", filename); - int x, y, n; // x = width, y = height, n = # 8-bit components per pixel - uint8_t *data = stbi_load(filename, &x, &y, &n, n_color); - if (data == nullptr) - ERROR("Image load failed\n"); - float *converted = uint8_to_float(data, x * y * n); - DEBUG("Loading shape: (1, %lu, %lu, %lu)(NHWC)\n", y, x, n_color); - auto *image = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, 1, y, x, n); - std::memcpy(image->host_data, converted, x * y * n * sizeof(float)); - auto *nchw_image = to_nchw(image); - stbi_image_free(data); - return nchw_image; -} - -// Save 1 tensor as an image into a file. -void saveToImage(const char *filename, Tensor *tensor) { - INFO("Saving image data to path=%s\n", filename); - deviceToHostCopy(tensor); - Tensor *converted_tensor = tensor; - if (tensor->data_format == CUDNN_TENSOR_NCHW) { - DEBUG("Copy-converting to NHWC format\n"); - converted_tensor = to_nhwc(tensor); - } - auto *hdr_data = (float *)converted_tensor->host_data; - size_t *dims = converted_tensor->dims.dim_sizes; - size_t w = dims[2], h = dims[1], c = dims[3]; - uint8_t *ldr = float_to_uint8(hdr_data, w * h * c); - stbi_write_png(filename, w, h, c, ldr, 0); - delete[] ldr; -} - -// Make a conv2d filter from 2-dim data. 
-void *createFilterFromData(int data_type, void *data, size_t w, size_t h, - size_t n_chan) { - DEBUG("Creating filter from data\n"); - auto *tensor = - (Tensor *)create4DTensor(data_type, CUDNN_TENSOR_NCHW, n_chan, 1, h, w); - char *tensor_data; - if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_FLOAT) - tensor_data = (char *)tensor->host_data; - else { - ERROR("Data type unsupported as filter\n"); - } - size_t channel_sz = tensor->size_in_bytes / n_chan; - for (size_t i = 0; i < n_chan; i++, tensor_data += channel_sz) { - std::memcpy(tensor_data, data, channel_sz); - } - return tensor; -} - -// Normalize an image tensor. -static void *normalize(void *image) { - auto *max_1D = tensorReduce(image, 2, MathOp::Max); - auto *max = tensorReduce(max_1D, 3, MathOp::Max); - auto *img_norm = tensorMap2(MathOp::Div, image, max); - freeTensor(max_1D); - freeTensor(max); - return img_norm; -} - -float compute_mean(float *arr, int left, int right) { - float sum = 0; - for (int i = left; i < right; i++) { - sum += arr[i]; - } - return sum / (right - left); -} - -float compute_variance(float *arr, int left, int right, float mean) { - float sum = 0; - for (int i = left; i < right; i++) { - sum += (arr[i] - mean) * (arr[i] - mean); - } - return sum / (right - left - 1); -} - -float compute_covariance(float *x, float *y, int left, int right, float x_mean, - float y_mean) { - float sum = 0; - for (int i = left; i < right; i++) { - sum += (x[i] - x_mean) * (y[i] - y_mean); - } - return sum / (right - left - 1); -} - -std::vector<float> SSIM(void *lhs_ptr, void *rhs_ptr) { - auto *lhs = (Tensor *)lhs_ptr, *rhs = (Tensor *)rhs_ptr; - - lhs = (Tensor *)normalize(lhs); - rhs = (Tensor *)normalize(rhs); - - convertToFP32(lhs); - convertToFP32(rhs); - deviceToHostCopy(lhs); - deviceToHostCopy(rhs); - - float L = 1.0; - float K1 = 0.01; - float K2 = 0.03; - float C1 = (K1 * L) * (K1 * L); - float C2 = (K2 * L) * (K2 * L); - - int n = lhs->dims.dim_sizes[0]; - int c = 
lhs->dims.dim_sizes[1]; - int h = lhs->dims.dim_sizes[2]; - int w = lhs->dims.dim_sizes[3]; - - float *lhs_arr = (float *)lhs->host_data; - float *rhs_arr = (float *)rhs->host_data; - - std::vector<float> scores; - for (int i = 0; i < n; i++) { - int left = i * c * h * w; - int right = (i + 1) * c * h * w; - - float x_mean = compute_mean(lhs_arr, left, right); - float y_mean = compute_mean(rhs_arr, left, right); - float x_var = compute_variance(lhs_arr, left, right, x_mean); - float y_var = compute_variance(rhs_arr, left, right, y_mean); - float covariance = - compute_covariance(lhs_arr, rhs_arr, left, right, x_mean, y_mean); - - scores.push_back( - ((2 * x_mean * y_mean + C1) * (2 * covariance + C2)) / - ((x_mean * x_mean + y_mean * y_mean + C1) * (x_var + y_var + C2))); - } - return scores; -} - -std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) { - auto *gold_tensor = (Tensor *)gold_ptr, *approx_tensor = (Tensor *)approx_ptr; - convertToFP32(gold_tensor); - convertToFP32(approx_tensor); - - size_t *dim_sizes = gold_tensor->dims.dim_sizes; - size_t batch_dim = dim_sizes[0]; - size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; - float image_size_f = image_size; - DEBUG("batch_dim = %lu, image_size = %lu\n", batch_dim, image_size); - auto *image_size_tensor = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 1, 1); - std::memcpy(image_size_tensor->host_data, &image_size_f, sizeof(float)); - - gold_tensor = (Tensor *)normalize(gold_tensor); - approx_tensor = (Tensor *)normalize(approx_tensor); - auto *diff = tensorMap2(MathOp::Sub, gold_tensor, approx_tensor); - auto *diffsqr = tensorMap2(MathOp::Mul, diff, diff); - auto *mse_sum_2d = tensorReduce(diffsqr, 3, MathOp::Add); - auto *mse_sum_1d = tensorReduce(mse_sum_2d, 2, MathOp::Add); - auto *mse_sum = tensorReduce(mse_sum_1d, 1, MathOp::Add); - auto *mse_avg = tensorMap2(MathOp::Div, mse_sum, image_size_tensor); - auto *psnr_val = (Tensor *)tensorMap1(MathOp::PSNR, mse_avg); 
- deviceToHostCopy(psnr_val); - - auto *float_data = (float *)psnr_val->host_data; - return std::vector<float>(float_data, float_data + batch_dim); -} - -float violationRate(const std::vector<float> &values, float threshold, - bool higher_better) { - if (values.empty()) - return 0.0f; - size_t violation = 0; - for (float v : values) { - if (std::isnan(v)) - ++violation; - if ((higher_better && v < threshold) || (!higher_better && v > threshold)) - ++violation; - } - return (float)violation / values.size(); -} - -float mean(const std::vector<float> &values) { - std::vector<float> non_nan; - for (float f : values) - if (!std::isnan(f)) - non_nan.push_back(f); - if (non_nan.empty()) - return 0.0f; - return std::accumulate(non_nan.begin(), non_nan.end(), 0.0f, std::plus<>()) / - (float)non_nan.size(); -} - -void *sliceTensorInBatch(void *whole, size_t start, size_t end) { - auto *whole_tensor = (Tensor *)whole; - size_t *dim_sizes = whole_tensor->dims.dim_sizes; - auto *output = - (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, end - start, - dim_sizes[1], dim_sizes[2], dim_sizes[3]); - size_t single_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; - auto *in_data = (float *)(whole_tensor->host_data) + start * single_size; - memcpy(output->host_data, in_data, (end - start) * single_size); - return output; -} - -void reshape(void *t, const std::vector<size_t> &shape) { - auto *tensor = (Tensor *)t; - size_t in_n = num_elems(tensor), out_n = num_elems(shape); - if (in_n != out_n) - ERROR("Reshaping cannot change number of elements\n"); - tensor->dims.num_dims = shape.size(); - free(tensor->dims.dim_sizes); - tensor->dims.dim_sizes = (size_t *)malloc(sizeof(size_t) * shape.size()); - std::copy(shape.begin(), shape.end(), tensor->dims.dim_sizes); - set4DTensorDescriptor(tensor, tensor->data_format, shape[0], shape[1], - shape[2], shape[3]); -} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc 
b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc index 8b5c4aaf93db40c038c4a9a30569318ae00d6be1..b322ee2be37b60487e15c9109d4230adf1ad84e2 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc @@ -50,10 +50,6 @@ void llvm_hpvm_initTensorRt(int gpuid) { #endif -#ifdef ERROR_INJECTION_ENABLED - readOpenTunerFlags("opentuner_flags"); -#endif - runtime_initialized = true; } @@ -72,14 +68,7 @@ void llvm_hpvm_initApproxhpvmRt(int gpuid) { void llvm_hpvm_cleanupApproxhpvmRt() {} -void dumpAccuracyNorms() { - -#ifdef ERROR_INJECTION_ENABLED - -#endif - - dump_result("accuracy_summary"); -} +void dumpAccuracyNorms() { dump_result("accuracy_summary"); } // Returns the number of GPUs active on the platform unsigned int getGPUCount() { diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc index ad1d2e137d19d1c158afb031f35f278d9cdefaa0..08f13bf0f891e03f3d13e0c2f2e8bc97bacb3b64 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc @@ -1,13 +1,12 @@ //===----------------------------- profling.cc ---------------------------===// // //===----------------------------------------------------------------------===// -// +// // This file contains code provides the definition of the interface for // applications to start and stop profiling for energy and performance. 
// //===----------------------------------------------------------------------===// - #ifndef PROFILING_HEADER #define PROFILING_HEADER @@ -52,7 +51,7 @@ void stopProfiling() { void profileEvent(const char *event_name, bool compare_previous = false) { checkCudaErrors(cudaDeviceSynchronize()); - + auto it = func_counters.find(event_name); if (it == func_counters.end()) { func_counters[event_name] = 1; @@ -73,7 +72,7 @@ void profileEvent(const char *event_name, bool compare_previous = false) { time_reading - zero_time; DEBUG("AbsoluteTime, Event = %s, Time = %f \n", event_name, - current_time.count()); + current_time.count()); profile_data.append(event_name); profile_data.append(event_count); profile_data.append("\t"); @@ -86,14 +85,13 @@ void profileEvent(const char *event_name, bool compare_previous = false) { profile_data.append("\t"); profile_data.append(std::to_string(duration_time.count())); DEBUG("TimeDuration, Event = %s, Time = %f \n", event_name, - duration_time.count()); + duration_time.count()); } profile_data.append("\n"); previous_time = time_reading; // set the previous time reading to the current // profiled time - } } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc index 9250810a2010a235074c0d29b8fe8bd63650324c..939f6e061985b27b4369b37925c0d2bf6a7c9a5d 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -1,11 +1,11 @@ //===--------------------------- tensor_runtime_cpu.cc --------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the custom implementation of non-approximated and -// approximated versions of tensor operations to execute on CPUs. 
The -// software approximations implemented for tensor convolutions are feature -// sampling and perforation for FP32 compute precisions only. +// +// This file consists of the custom implementation of non-approximated and +// approximated versions of tensor operations to execute on CPUs. The +// software approximations implemented for tensor convolutions are feature +// sampling and perforation for FP32 compute precisions only. // //===----------------------------------------------------------------------===// @@ -29,7 +29,7 @@ #include <string> #include <vector> #include <math.h> -#include<bits/stdc++.h> +#include <bits/stdc++.h> #include <pthread.h> #include <omp.h> @@ -39,1081 +39,1130 @@ #include "tensor_cpu_runtime.h" void llvm_hpvm_initTensorRtCPU() { - // NOTE: Do Nothing + // NOTE: Do Nothing } void llvm_hpvm_cleanupTensorRtCPU() { - // NOTE: Do Nothing + // NOTE: Do Nothing } void hpvm_request_tensorCPU(void *tensor, int destination) { - // NOTE: Do Nothing + // NOTE: Do Nothing } - + std::vector<void *> PtrVect; void freeBatchMemory() { - for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { - free(*it); - } - PtrVect.erase(PtrVect.begin(), PtrVect.end()); + for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { + free(*it); + } + PtrVect.erase(PtrVect.begin(), PtrVect.end()); } - -int getTypeSizeCPU(int data_type) __attribute__((always_inline)); +int getTypeSizeCPU(int data_type) __attribute__((always_inline)); inline int getTypeSizeCPU(int data_type) { - return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1); + return (data_type == 0) ? 4 : ((data_type == 1) ? 
2 : 1); } -void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); -inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) { - int type_size = getTypeSizeCPU(data_type); - size_t size_in_bytes = type_size * num_elems; - tensor->size_in_bytes = size_in_bytes; +void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) + __attribute__((always_inline)); +inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, + size_t num_elems) { + int type_size = getTypeSizeCPU(data_type); + size_t size_in_bytes = type_size * num_elems; + tensor->size_in_bytes = size_in_bytes; } -void allocateMemCPU(struct Tensor *tensor, int data_type, - size_t num_elems, bool freeMemory = true) __attribute__((always_inline)); -inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) { - setSizeInBytesCPU(tensor, data_type, num_elems); - tensor->data_type = data_type; - tensor->num_elems = num_elems; - tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host - if(freeMemory) - PtrVect.push_back(tensor->host_data); +void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, + bool freeMemory = true) __attribute__((always_inline)); +inline void allocateMemCPU(struct Tensor *tensor, int data_type, + size_t num_elems, bool freeMemory) { + setSizeInBytesCPU(tensor, data_type, num_elems); + tensor->data_type = data_type; + tensor->num_elems = num_elems; + tensor->host_data = + (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host + if (freeMemory) + PtrVect.push_back(tensor->host_data); } -void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) __attribute__((always_inline)); -inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { - Tensor *tensor = (Tensor *)tensor_ptr; - if (tensor->size_in_bytes != size_in_bytes) { - 
printf("The destination and source sizes don't match"); - } - memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough? +void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) + __attribute__((always_inline)); +inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, + size_t size_in_bytes) { + Tensor *tensor = (Tensor *)tensor_ptr; + if (tensor->size_in_bytes != size_in_bytes) { + printf("The destination and source sizes don't match"); + } + memcpy(tensor->host_data, data_ptr, + size_in_bytes); // Is this efficient enough? } void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size, - bool freeMemory = true) __attribute__((always_inline)); -inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, - size_t dim4_size, bool freeMemory) { - struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - if(freeMemory) - PtrVect.push_back(tensor); - allocateMemCPU(tensor, data_type, num_elems, freeMemory); - - // Setting the tensor dimensions - size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 4; - tensor->data_placement = HOST; - return tensor; + size_t dim2_size, size_t dim3_size, size_t dim4_size, + bool freeMemory = true) __attribute__((always_inline)); +inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, + size_t dim4_size, bool freeMemory) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + if (freeMemory) + PtrVect.push_back(tensor); + allocateMemCPU(tensor, data_type, 
num_elems, freeMemory); + + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; + tensor->data_placement = HOST; + return tensor; } -void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int compute_precision) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int output_size = output_width * output_height; - printf("--CREATE 4D TENSOR\n"); - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - printf("CREATED 4D TENSOR\n"); - long int conv_data_size = - sizeof(float) * num_filter_elem * output_height * output_width * batch_size; - float *host_data = (float *) malloc(conv_data_size); - printf("host data: %p\n", host_data); - printf("number of batches: %d\n", batch_size); - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int 
ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } +void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / + horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int output_size = output_width * output_height; + printf("--CREATE 4D TENSOR\n"); + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, 
output_width); + float *__restrict__ output_data = (float *)output->host_data; + printf("CREATED 4D TENSOR\n"); + long int conv_data_size = sizeof(float) * num_filter_elem * output_height * + output_width * batch_size; + float *host_data = (float *)malloc(conv_data_size); + printf("host data: %p\n", host_data); + printf("number of batches: %d\n", batch_size); + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } + } + } + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = + k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += 
host_data[input_index] * host_filter[p * num_filter_elem + k]; } + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(host_data); - printf("END: %p\n", output); - return output; + } + free(host_data); + printf("END: %p\n", output); + return output; } -void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int num_filter_elem = kernel_height * kernel_width * channels; - - const int remainder = ((num_filter_elem - start) % skip_every > 0); - const int reduced_num_filter_elem = - num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - - const long int host_data_size = sizeof(float) * reduced_num_filter_elem - * output_height * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); - - const 
int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; - float *reduced_kernels = (float *) malloc(reduced_filer_size); - - float fac = (((float) skip_every) / ((float) skip_every - 1)); - int reduced_filter_dim = reduced_num_filter_elem / channels; - - // Create reduced filter - omp_set_num_threads(4); - #pragma omp parallel for - for(int f = 0; f < num_filters; f++) { - for(int i = 0; i < reduced_num_filter_elem; i++) { - int ch = i / reduced_filter_dim; - int offset = (start + ch) % skip_every; - int in_index; - if(i < offset) { - in_index = i; - } else { - in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) - + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1; - } - reduced_kernels[f * reduced_num_filter_elem + i] = - fac * host_filter[num_filter_elem * f + in_index]; +void *tensorRegularFilterSamplingConvolutionCPU( + void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int compute_precision, int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % 
skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem * + output_height * output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + const int reduced_filer_size = + sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *)malloc(reduced_filer_size); + + float fac = (((float)skip_every) / ((float)skip_every - 1)); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create reduced filter + omp_set_num_threads(4); +#pragma omp parallel for + for (int f = 0; f < num_filters; f++) { + for (int i = 0; i < reduced_num_filter_elem; i++) { + int ch = i / reduced_filter_dim; + int offset = (start + ch) % skip_every; + int in_index; + if (i < offset) { + in_index = i; + } else { + in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) + + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; + } + } + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + const int ch = fi / reduced_filter_dim; + const int offset = (start + ch) % skip_every; + if (fi < offset) { + in_index = fi; + } else { + in_index = + ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * 
skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + const int i = + (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int fi = 0; fi < reduced_num_filter_elem; fi++) { - int in_index; - const int ch = fi / reduced_filter_dim; - const int offset = (start + ch) % skip_every; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; - const int j = in_index % kernel_width; - const int output_index = h * output_width + w; - const int out_index = b * reduced_num_filter_elem * output_size - + output_index * reduced_num_filter_elem + fi; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + 
int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] * + reduced_kernels[p * reduced_num_filter_elem + k]; } - - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < reduced_num_filter_elem; ++k) { - int input_index = k + reduced_num_filter_elem * m - + b * reduced_num_filter_elem * output_size; - sum += host_data[input_index] - * reduced_kernels[p * reduced_num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } - } - + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(reduced_kernels); - free(host_data); - - return output; + } + free(reduced_kernels); + free(host_data); + + return output; } -void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int num_filter_elem = kernel_height * kernel_width * channels; - - 
const int remainder = ((num_filter_elem - start) % skip_every > 0); - const int reduced_num_filter_elem = - num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - - const long int host_data_size = sizeof(float) * reduced_num_filter_elem - * output_height * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); - - const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; - float *reduced_kernels = (float *) malloc(reduced_filer_size); - - float fac = (((float) skip_every) / ((float) skip_every - 1)); - int reduced_filter_dim = reduced_num_filter_elem / channels; - - // Create Reduced filter - omp_set_num_threads(4); - #pragma omp parallel for - for(int f = 0; f < num_filters; f++) { - for(int i = 0; i < start; i++) { - reduced_kernels[f * reduced_num_filter_elem + i] = - host_filter[num_filter_elem * f + i]; - } - #pragma omp simd - for(int i = start; i < reduced_num_filter_elem; i++) { - int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) - + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1; - reduced_kernels[f * reduced_num_filter_elem + i] = - fac * host_filter[num_filter_elem * f + in_index]; - } +void *tensorIrregularFilterSamplingConvolutionCPU( + void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int compute_precision, int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = 
input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem * + output_height * output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + const int reduced_filer_size = + sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *)malloc(reduced_filer_size); + + float fac = (((float)skip_every) / ((float)skip_every - 1)); + + // Create Reduced filter + omp_set_num_threads(4); +#pragma omp parallel for + for (int f = 0; f < num_filters; f++) { + for (int i = 0; i < start; i++) { + reduced_kernels[f * reduced_num_filter_elem + i] = + host_filter[num_filter_elem * f + i]; } - - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int fi = 0; fi < reduced_num_filter_elem; fi++) { - int in_index; - int offset = 
start; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - const int ch = in_index / (kernel_width * kernel_height); - const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; - const int j = in_index % kernel_width; - const int output_index = h * output_width + w; - const int out_index = b * reduced_num_filter_elem * output_size - + output_index * reduced_num_filter_elem + fi; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } +#pragma omp simd + for (int i = start; i < reduced_num_filter_elem; i++) { + int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) + + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + + start - 1; + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; + } + } + +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + int offset = start; + if (fi < offset) { + in_index = fi; + } else { + in_index = + ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + const int ch = in_index / (kernel_width * kernel_height); + const int i = + (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + 
output_index * reduced_num_filter_elem + fi; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } + } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < reduced_num_filter_elem; ++k) { - int input_index = k + reduced_num_filter_elem * m - + b * reduced_num_filter_elem * output_size; - sum += host_data[input_index] - * reduced_kernels[p * reduced_num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] * + reduced_kernels[p * reduced_num_filter_elem + k]; } - + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(reduced_kernels); - free(host_data); - - return output; -} + } + free(reduced_kernels); + free(host_data); -void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, int row, int start) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; 
- int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - - int full_output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int full_output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int full_output_size = full_output_height * full_output_width; - - Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - full_output_height, full_output_width); - float * __restrict__ full_output_data = (float *)full_output->host_data; - - int remainder = (full_output_height - start) % row > 0; - int output_height = - full_output_height - ((full_output_height - start) / row) - remainder; - - int output_width = full_output_width; - float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters - * output_height * output_width); - int output_size = output_width * output_height; - long int host_data_size = sizeof(float) * num_filter_elem * output_height - * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); + return output; +} - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - int inH; - if(h < start) { - inH = h * vertical_stride - vertical_pad; - } else { - int h_index = ((h - start + 1) * row) / (row - 1) - + (((h - start + 1) * row) % (row - 1) > 0) + start - 1; - inH = h_index * vertical_stride - vertical_pad; - } - for(int w = 0; w < output_width; w++) { - int inW = w * horizontal_stride - horizontal_pad; - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * 
num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } +void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int row, + int start) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *)create4DTensorCPU( + 0, 0, batch_size, num_filters, full_output_height, full_output_width); + float *__restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_height - start) % row > 0; + int output_height = + full_output_height - ((full_output_height - start) / row) - remainder; + + int output_width = full_output_width; + float *output_data = (float *)malloc( + sizeof(float) * batch_size * num_filters * output_height * output_width); + int output_size = output_width * 
output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height * + output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + int inH; + if (h < start) { + inH = h * vertical_stride - vertical_pad; + } else { + int h_index = ((h - start + 1) * row) / (row - 1) + + (((h - start + 1) * row) % (row - 1) > 0) + start - 1; + inH = h_index * vertical_stride - vertical_pad; + } + for (int w = 0; w < output_width; w++) { + int inW = w * horizontal_stride - horizontal_pad; + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } + } + } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = + k + num_filter_elem * m + b * 
num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; } + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } + } - // Interpolate - for (int p = 0; p < num_filters; ++p) { - for(int h = 0; h < full_output_height; h++) { - for(int w = 0; w < full_output_width; w++) { - int full_output_index = b * num_filters * full_output_size - + p * full_output_size + h * full_output_width + w; - if(h < start) { - int output_index = b * num_filters * output_size - + p * output_size + h * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if(h == full_output_height - 1) { - int output_index = b * num_filters * output_size + p * output_size - + (output_height - 1) * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if(h == 0) { - int output_index = b * num_filters * output_size - + p * output_size + 0 * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if((h - start) % row == 0) { - int row_index = h - ((h + 1 - start) / row); - int output_index = b * num_filters * output_size + p * output_size - + row_index * output_width + w; - full_output_data[full_output_index] = - (output_data[output_index] + output_data[output_index - output_width]) / 2; - } else { - int remainder = ((h + 1 - start) % row) > 0; - int row_index = h - ((h + 1 - start) / row) - remainder; - int output_index = b * num_filters * output_size + p * output_size - + row_index * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } - } - } - } + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for (int h = 0; h < full_output_height; h++) { + for (int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + + w; + if (h < start) { + int output_index = b * num_filters * output_size 
+ p * output_size + + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if (h == full_output_height - 1) { + int output_index = b * num_filters * output_size + p * output_size + + (output_height - 1) * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if (h == 0) { + int output_index = b * num_filters * output_size + p * output_size + + 0 * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if ((h - start) % row == 0) { + int row_index = h - ((h + 1 - start) / row); + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = + (output_data[output_index] + + output_data[output_index - output_width]) / + 2; + } else { + int remainder = ((h + 1 - start) % row) > 0; + int row_index = h - ((h + 1 - start) / row) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } + } + } } - free(output_data); - free(host_data); + } + free(output_data); + free(host_data); - return full_output; + return full_output; } -void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, int col, int start) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = 
filter->dims.dim_sizes[3]; - int full_output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int full_output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int full_output_size = full_output_height * full_output_width; - - Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - full_output_height, full_output_width); - float * __restrict__ full_output_data = (float *)full_output->host_data; - - int remainder = (full_output_width - start) % col > 0; - int output_width = full_output_width - ((full_output_width - start) / col) - remainder; - - int output_height = full_output_height; - float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters - * output_height * output_width); - int output_size = output_width * output_height; - long int host_data_size = sizeof(float) * num_filter_elem * output_height - * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); - - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - int inH = h * vertical_stride - vertical_pad; - for(int w = 0; w < output_width; w++) { - int inW; - if(w < start) { - inW = w * horizontal_stride - horizontal_pad; - } else { - int w_index = ((w - start + 1) * col) / (col - 1) - + (((w - start + 1) * col) % (col - 1) > 0) + start - 1; - inW = w_index * horizontal_stride - horizontal_pad; - } - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 
0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } - } - } - - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = k + num_filter_elem * m - + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; +void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int col, + int start) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *)create4DTensorCPU( + 0, 0, batch_size, num_filters, full_output_height, full_output_width); + float *__restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_width - start) 
% col > 0; + int output_width = + full_output_width - ((full_output_width - start) / col) - remainder; + + int output_height = full_output_height; + float *output_data = (float *)malloc( + sizeof(float) * batch_size * num_filters * output_height * output_width); + int output_size = output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height * + output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + int inH = h * vertical_stride - vertical_pad; + for (int w = 0; w < output_width; w++) { + int inW; + if (w < start) { + inW = w * horizontal_stride - horizontal_pad; + } else { + int w_index = ((w - start + 1) * col) / (col - 1) + + (((w - start + 1) * col) % (col - 1) > 0) + start - 1; + inW = w_index * horizontal_stride - horizontal_pad; + } + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } + } + } - // Interpolate - for (int p = 0; p < num_filters; ++p) { - for(int h = 0; h < full_output_height; h++) { - for(int w = 0; w < full_output_width; w++) { - int full_output_index = b * num_filters * full_output_size - + p * full_output_size + h * full_output_width + w; - if(w < start) { - int output_index = b * num_filters * output_size - + p * output_size + h * output_width + w; - 
full_output_data[full_output_index] = output_data[output_index]; - } else if(w == full_output_width - 1) { - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + output_width - 1; - full_output_data[full_output_index] = output_data[output_index]; - } else if(w == 0) { - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + 0; - full_output_data[full_output_index] = output_data[output_index]; - } else if((w - start) % col == 0) { - int col_index = w - ((w + 1 - start) / col); - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + col_index; - full_output_data[full_output_index] = - (output_data[output_index] + output_data[output_index - 1]) / 2; - } else { - int remainder = ((w + 1 - start) % col) > 0; - int col_index = w - ((w + 1 - start) / col) - remainder; - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + col_index; - full_output_data[full_output_index] = output_data[output_index]; - } - } - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = + k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; } + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(output_data); - free(host_data); - - return full_output; -} -void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int row, int col, int skip_every, int start) { - if(row > 1) { - printf("ROW PERFORATION\n"); - return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, - horizontal_pad, vertical_stride, horizontal_stride, conv_mode, - 
compute_precision, row, start); - } - if(col > 1) { - printf("COL PERFORATION\n"); - return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, - horizontal_pad, vertical_stride, horizontal_stride, conv_mode, - compute_precision, col, start); - } - if(skip_every > 1) { - printf("INPUT FILTERING\n"); - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - - if(!(kernel_height * kernel_width % skip_every)) { - return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, - vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, - compute_precision, skip_every, start); + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for (int h = 0; h < full_output_height; h++) { + for (int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + + w; + if (w < start) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if (w == full_output_width - 1) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + output_width - 1; + full_output_data[full_output_index] = output_data[output_index]; + } else if (w == 0) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + 0; + full_output_data[full_output_index] = output_data[output_index]; + } else if ((w - start) % col == 0) { + int col_index = w - ((w + 1 - start) / col); + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = + (output_data[output_index] + output_data[output_index - 1]) / 2; + } else { + int remainder = ((w + 1 - start) % col) > 0; + int col_index = w - ((w + 1 
- start) / col) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = output_data[output_index]; + } } - return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, - vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, - compute_precision, skip_every, start); + } } - printf("---REGULAR CONV\n"); - return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad, - horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision); + } + free(output_data); + free(host_data); + + return full_output; } -void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - - Tensor *input = (Tensor *)input_ptr; +void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision, int row, int col, + int skip_every, int start) { + if (row > 1) { + printf("ROW PERFORATION\n"); + return tensorRowPerfConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, row, start); + } + if (col > 1) { + printf("COL PERFORATION\n"); + return tensorColPerfConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, col, start); + } + if (skip_every > 1) { + printf("INPUT FILTERING\n"); Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = 
input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int filter_dim = kernel_height * kernel_width; - const int num_filter_elem = filter_dim * channels; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, channels, - output_height * output_width); - float * __restrict__ output_data = (float *)output->host_data; - - const long int conv_data_size = - sizeof(float) * num_filter_elem * output_height * output_width * batch_size; - float *host_data = (float *) malloc(conv_data_size); - - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } + + if (!(kernel_height * kernel_width % skip_every)) { + return tensorRegularFilterSamplingConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, 
vertical_stride, + horizontal_stride, conv_mode, compute_precision, skip_every, start); + } + return tensorIrregularFilterSamplingConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, skip_every, start); + } + printf("---REGULAR CONV\n"); + return tensorRegularConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision); +} + +void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int filter_dim = kernel_height * kernel_width; + const int num_filter_elem = filter_dim * channels; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *)create4DTensorCPU( + 0, 0, batch_size, num_filters, channels, output_height * output_width); + float *__restrict__ output_data = (float *)output->host_data; + + const long int conv_data_size = sizeof(float) * num_filter_elem * + output_height * output_width * batch_size; + float *host_data = (float *)malloc(conv_data_size); + + 
omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - for (int ch = 0; ch < channels; ch++) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < filter_dim; ++k) { - int input_index = k + ch * filter_dim + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + ch * filter_dim + k]; - } - output_data[b * (output_size * num_filters * channels) + p * output_size * channels + ch * output_size + m] = sum; - } - } + } + } + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + for (int ch = 0; ch < channels; ch++) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < filter_dim; ++k) { + int input_index = k + ch * filter_dim + num_filter_elem * m + + b * num_filter_elem * output_size; + sum += host_data[input_index] * + host_filter[p * num_filter_elem + ch * filter_dim + k]; + } + output_data[b * (output_size * num_filters * channels) + + p * output_size * channels + ch * output_size + m] = 
sum; } + } } + } - free(host_data); - return output; + free(host_data); + return output; } -void* tensorAddCPU(void *x_ptr, void *bias_ptr) { - Tensor *x = (Tensor *)x_ptr; - Tensor *bias = (Tensor *)bias_ptr; - - float * __restrict__ x_data = (float *)x->host_data; - float * __restrict__ bias_data = (float *)bias->host_data; - int n = x->dims.dim_sizes[0]; - int c = x->dims.dim_sizes[1]; - int h = x->dims.dim_sizes[2]; - int w = x->dims.dim_sizes[3]; - - if(x->num_elems == bias->num_elems) { - int const1 = c * h * w; - int const2 = h * w; - omp_set_num_threads(4); - #pragma omp parallel for - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - #pragma omp simd collapse(2) - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * const1 + j * const2 + (k * w) + l] += - bias_data[i * const1 + j * const2 + (k*w) + l]; - } - } - } +void *tensorAddCPU(void *x_ptr, void *bias_ptr) { + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; + + float *__restrict__ x_data = (float *)x->host_data; + float *__restrict__ bias_data = (float *)bias->host_data; + int n = x->dims.dim_sizes[0]; + int c = x->dims.dim_sizes[1]; + int h = x->dims.dim_sizes[2]; + int w = x->dims.dim_sizes[3]; + + if (x->num_elems == bias->num_elems) { + int const1 = c * h * w; + int const2 = h * w; + omp_set_num_threads(4); +#pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { +#pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * const1 + j * const2 + (k * w) + l] += + bias_data[i * const1 + j * const2 + (k * w) + l]; + } } - } else { - omp_set_num_threads(4); - #pragma omp parallel for - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - #pragma omp simd collapse(2) - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; - } - } - } - } + } + } + } else { + omp_set_num_threads(4); 
+#pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { +#pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; + } + } + } } - - return x; + } + + return x; } float max(float v1, float v2) __attribute__((always_inline)); -inline float maximum(float v1, float v2){ - return (v1 < v2) ? v2 : v1; -} +inline float maximum(float v1, float v2) { return (v1 < v2) ? v2 : v1; } void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, - int window_width, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { - - Tensor *input = (Tensor *)input_ptr; - float * __restrict__ input_data = (float *)input->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - - int output_height = - 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); - int output_width = - 1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride); - - int center_x = (window_width - 1) / 2 - horizontal_pad; - int center_y = (window_height - 1) / 2 - vertical_pad; - int x_radius = (window_width - 1) / 2; - int y_radius = (window_height - 1) / 2; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, channels, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - - omp_set_num_threads(4); - #pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - float val = (poolFunction == 0) ? 
-3.40282e+38 : 0; - int y_radius_var = y_radius - r; - int y_radius_var_max = y_radius_var + image_height; - int x_radius_var = x_radius - c; - int x_radius_var_max = x_radius_var + image_width; - int ki_min = (y_radius_var > 0) ? - ((y_radius_var < window_height) ? y_radius_var : -1) : 0; - int ki_max = (y_radius_var_max < window_height) ? - ((y_radius_var_max >= 0) ? y_radius_var_max : -1) : window_height; - int kj_min = (x_radius_var > 0) ? - ((x_radius_var < window_width) ? x_radius_var : -1) : 0; - int kj_max = (x_radius_var_max < window_width) ? - ((x_radius_var_max >= 0) ? x_radius_var_max : -1) : window_width; - - if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 - && ki_max != -1 && kj_min != -1 && kj_max != -1) { - if(!poolFunction) { - for (int ki = 0; ki < window_height; ki++) { - for (int kj = 0; kj < window_width; kj++) { - val = maximum( - val, - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - (r - y_radius + ki) * image_width + (c - x_radius + kj)]); - } - } - } else { - for (int ki = 0; ki < window_height; ki++) { - for (int kj = 0; kj < window_width; kj++) { - val += input_data[b * (channels * image_height * image_width) - + ch * (image_height * image_width) + - (r - y_radius + ki) * image_width + (c - x_radius + kj)]; - } - } - } - } - if (poolFunction == 1) { - val /= window_height * window_width; - } - output_data[b * (channels * output_height * output_width) + - ch * (output_height * output_width) + ii * output_width + jj] = val; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { + + Tensor *input = (Tensor *)input_ptr; + float *__restrict__ input_data = (float *)input->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + + int 
output_height = + 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); + int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) / + horizontal_stride); + + int center_x = (window_width - 1) / 2 - horizontal_pad; + int center_y = (window_height - 1) / 2 - vertical_pad; + int x_radius = (window_width - 1) / 2; + int y_radius = (window_height - 1) / 2; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + int ii = 0, jj = 0; + for (int r = center_y; r < image_height + vertical_pad - y_radius; + r += vertical_stride) { + for (int c = center_x; c < image_width + horizontal_pad - x_radius; + c += horizontal_stride) { + float val = (poolFunction == 0) ? -3.40282e+38 : 0; + int y_radius_var = y_radius - r; + int y_radius_var_max = y_radius_var + image_height; + int x_radius_var = x_radius - c; + int x_radius_var_max = x_radius_var + image_width; + int ki_min = + (y_radius_var > 0) + ? ((y_radius_var < window_height) ? y_radius_var : -1) + : 0; + int ki_max = (y_radius_var_max < window_height) + ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1) + : window_height; + int kj_min = (x_radius_var > 0) + ? ((x_radius_var < window_width) ? x_radius_var : -1) + : 0; + int kj_max = (x_radius_var_max < window_width) + ? ((x_radius_var_max >= 0) ? 
x_radius_var_max : -1) + : window_width; + + if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 && + ki_max != -1 && kj_min != -1 && kj_max != -1) { + if (!poolFunction) { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val = maximum( + val, + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + + (c - x_radius + kj)]); } + } + } else { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val += + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + + (c - x_radius + kj)]; + } + } } + } + if (poolFunction == 1) { + val /= window_height * window_width; + } + output_data[b * (channels * output_height * output_width) + + ch * (output_height * output_width) + ii * output_width + + jj] = val; + jj++; + if (jj == output_width) { + jj = 0; + ii++; + } } + } } - - return output; + } + + return output; } void *tensorTanhCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - omp_set_num_threads(4); - #pragma omp parallel for - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = tanhf(input_data[i]); - } - - return input; + Tensor *input = (Tensor *)input_ptr; + + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + + omp_set_num_threads(4); +#pragma omp parallel for + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = tanhf(input_data[i]); + } + + return input; } void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { - Tensor *lhs = (Tensor *)lhs_ptr; - Tensor *rhs = (Tensor *)rhs_ptr; - - int m = lhs->dims.dim_sizes[0]; - int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; - - Tensor *output = (Tensor 
*)create4DTensorCPU(0, 0, m, n, 1, 1); - - float * __restrict__ lhs_arr = (float *)lhs->host_data; - float * __restrict__ rhs_arr = (float *)rhs->host_data; - float * __restrict__ output_arr = (float *)output->host_data; - - int k = 1; - #pragma unroll 4 // Can we unroll more??? - for (int j = 1; j < lhs->dims.num_dims; j++) { - k = k * lhs->dims.dim_sizes[j]; // input neurons - } - float *tran_rhs = (float *) malloc(sizeof(float) * k * n); - omp_set_num_threads(4); - #pragma omp parallel for simd - for (int l = 0; l < k; l++) { - for (int j = 0; j < n; j++) { - tran_rhs[j * k + l] = rhs_arr[l * n + j]; - } + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; + + int m = lhs->dims.dim_sizes[0]; + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); + + float *__restrict__ lhs_arr = (float *)lhs->host_data; + float *__restrict__ rhs_arr = (float *)rhs->host_data; + float *__restrict__ output_arr = (float *)output->host_data; + + int k = 1; +#pragma unroll 4 // Can we unroll more??? 
+ for (int j = 1; j < lhs->dims.num_dims; j++) { + k = k * lhs->dims.dim_sizes[j]; // input neurons + } + float *tran_rhs = (float *)malloc(sizeof(float) * k * n); + omp_set_num_threads(4); +#pragma omp parallel for simd + for (int l = 0; l < k; l++) { + for (int j = 0; j < n; j++) { + tran_rhs[j * k + l] = rhs_arr[l * n + j]; } - - #pragma omp parallel for - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - float sum = 0.0; - #pragma omp simd reduction(+:sum) - for (int l = 0; l < k; l++) { - sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; - } - output_arr[i * n + j] = sum; - } + } + +#pragma omp parallel for + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0.0; +#pragma omp simd reduction(+ : sum) + for (int l = 0; l < k; l++) { + sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; + } + output_arr[i * n + j] = sum; } - free(tran_rhs); - return output; + } + free(tran_rhs); + return output; } void *tensorSoftmaxCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - - float *logits = (float *)input->host_data; - int n = input->dims.dim_sizes[0]; - int c = input->dims.dim_sizes[1]; - - omp_set_num_threads(4); - #pragma omp parallel for - for (int i = 0; i < n; i++) { - float x = 0; - for(int j = i*c; j < c + i*c; j++) { - logits[j] = expf(logits[j]); - } - - #pragma omp simd reduction(+:x) - for(int j = i*c; j < i*c+c; j++) { - x += logits[j]; - } - - #pragma omp simd - for(int j = i*c; j < i*c + c; j++) { - logits[j] /= x; - } + Tensor *input = (Tensor *)input_ptr; + + float *logits = (float *)input->host_data; + int n = input->dims.dim_sizes[0]; + int c = input->dims.dim_sizes[1]; + + omp_set_num_threads(4); +#pragma omp parallel for + for (int i = 0; i < n; i++) { + float x = 0; + for (int j = i * c; j < c + i * c; j++) { + logits[j] = expf(logits[j]); } - return input; -} - -void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon) { - - 
Tensor* input = (Tensor*) input_ptr; - Tensor* gamma = (Tensor*) gamma_ptr; - Tensor* beta = (Tensor*) beta_ptr; - Tensor* mean = (Tensor*) mean_ptr; - Tensor* variance = (Tensor*) variance_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_beta = (float *)beta->host_data; - float * __restrict__ host_gamma = (float *)gamma->host_data; - float * __restrict__ host_mean = (float *)mean->host_data; - float * __restrict__ host_variance = (float *)variance->host_data; - - float alpha_val = 1.0f, beta_val = 0.0f; - size_t num_elems = input->num_elems; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int image_dim = image_height * image_width; +#pragma omp simd reduction(+ : x) + for (int j = i * c; j < i * c + c; j++) { + x += logits[j]; + } - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - float mean = 0; - #pragma omp simd reduction(+:mean) - for(int i = 0; i < image_dim; i++) { - int index = b * channels * image_dim + ch * image_dim + i; - mean += host_image[index]; - } - mean = mean / channels; - - float variance = 0; - #pragma omp simd reduction(+:variance) - for(int i = 0; i < image_dim; i++) { - int index = b * channels * image_dim + ch * image_dim + i; - float tmp = host_image[index] - mean; - variance += (tmp * tmp); - } - variance = variance / channels; - - #pragma omp simd - for(int i = 0; i < image_dim; i++) { - int index = b * channels * image_dim + ch * image_dim + i; - host_image[index] = host_beta[ch] - + (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance))); - } - } +#pragma omp simd + for (int j = i * c; j < i * c + c; j++) { + logits[j] /= x; } - return input; + } + + return input; } - void *tensorReluCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - 
float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - #pragma omp simd - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = (input_data[i] < 0) ? 0 : input_data[i]; +void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *gamma = (Tensor *)gamma_ptr; + Tensor *beta = (Tensor *)beta_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_beta = (float *)beta->host_data; + float *__restrict__ host_gamma = (float *)gamma->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int image_dim = image_height * image_width; + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + float mean = 0; +#pragma omp simd reduction(+ : mean) + for (int i = 0; i < image_dim; i++) { + int index = b * channels * image_dim + ch * image_dim + i; + mean += host_image[index]; + } + mean = mean / channels; + + float variance = 0; +#pragma omp simd reduction(+ : variance) + for (int i = 0; i < image_dim; i++) { + int index = b * channels * image_dim + ch * image_dim + i; + float tmp = host_image[index] - mean; + variance += (tmp * tmp); + } + variance = variance / channels; + +#pragma omp simd + for (int i = 0; i < image_dim; i++) { + int index = b * channels * image_dim + ch * image_dim + i; + host_image[index] = + host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) / + sqrt(epsilon + variance))); + } } + } + return input; +} - return input; +void *tensorReluCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + +#pragma omp simd + for (size_t i = 0; i < 
num_elems; i++) { + input_data[i] = (input_data[i] < 0) ? 0 : input_data[i]; + } + + return input; } void *tensorRelu2CPU(void *input_ptr, float min, float max) { - Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - #pragma omp simd - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ? - max : input_data[i]); - } - - return input; -} + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + +#pragma omp simd + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = (input_data[i] < min) + ? min + : ((input_data[i] > max) ? max : input_data[i]); + } + + return input; +} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index 319936b482c455af2fcc0280adb15d7c126c088a..253f7614337908e72c82ba986f860dd58c7c9b3f 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -1,8 +1,9 @@ -/* This file includes the API implementation of the HPVM tensor runtime built on cublas, cudnn -** -** Author: Hashim Sharif -** Email: hsharif3@illinois.edu -*/ +/* This file includes the API implementation of the HPVM tensor runtime built on + *cublas, cudnn + ** + ** Author: Hashim Sharif + ** Email: hsharif3@illinois.edu + */ #include <stdio.h> #include <stdarg.h> @@ -31,7 +32,6 @@ #include <cuda_fp16.h> #include <driver_types.h> - // Tensor runtime header files #include "tensor_runtime.h" #include "tensor_utils.h" @@ -46,236 +46,177 @@ #include "half_precision_api.h" #include "approx_simulation.h" +// FIXIT: tensorAdd currently only works for 4D tensors +void *tensorAdd(void *x_ptr, void *bias_ptr) { + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; - - -// FIXIT: 
tensorAdd currently only works for 4D tensors -void* tensorAdd(void* x_ptr, void* bias_ptr){ - - Tensor* x = (Tensor*) x_ptr; - Tensor* bias = (Tensor*) bias_ptr; - - INFO("*** TensorAdd \n"); + INFO("*** TensorAdd \n"); profileEvent("Add"); - + float alpha = 1.0f; - //float beta = 0.0f; + // float beta = 0.0f; hostToDeviceCopy(x); hostToDeviceCopy(bias); convertToFP32(x); convertToFP32(bias); - DEBUG("x->num_elems = %d \n", x->num_elems); DEBUG("bias->num_elems = %d \n", bias->num_elems); - if(cudnnHandle == NULL){ - ERROR("cudnnHandle NOT initialized!! \n"); + if (cudnnHandle == NULL) { + ERROR("cudnnHandle NOT initialized!! \n"); } - + // FIXIT: routine fails for 3D tensors checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc, - bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data)); + bias->gpu_data, &alpha, x->tensor_desc, + x->gpu_data)); profileEvent("Add_end", true); - #ifdef ERROR_INJECTION_ENABLED - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - - void* error_norms = tensorAddError(x, op_acc); - add_norms(error_norms, "tensorAdd", op_acc); - add_bias_overheads(x, op_acc); - op_counter++; - - #endif - - return x; } - // FIXIT: Generalize all of the routines for types {half, float, double} -void* tensorConvolution(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - +void *tensorConvolution(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups) { + INFO("*** TensorConvolution \n"); profileEvent("Conv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; - + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - 
if(conv_mode == 0) + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; mode = CUDNN_CROSS_CORRELATION; // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - - // TODO: Support other cases; + + // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); convertToFP32(input); convertToFP32(filter); - - DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); + DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; - } - - + } + cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision // NOTE: Set conv groups for grouped convolution e.g., depthwise convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - int n, c, h, w; // output dimensions + int n, c, h, w; // output dimensions // Find dimension of convolution output - if(input->tensor_desc == NULL || filter->filter_desc == NULL) + if (input->tensor_desc == NULL || filter->filter_desc == NULL) ERROR("Input or Filter descriptor is NULL"); - - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - 
input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - if(input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - else if(input->data_format == CUDNN_TENSOR_NHWC){ + Tensor *output; + if (input->data_format == CUDNN_TENSOR_NCHW) + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } else ERROR("Unsupported Tensor Type"); // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); - - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! \n"); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); + + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! 
\n"); // Debugging info prints printTensorDescInfo(input); printTensorDescInfo(filter); printTensorDescInfo(output); - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); + DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // NOTE: Currently using GEMM based convolution - other algorithms available - // TODO: Benchmark other convolution algorithms e.g., winograd + // TODO: Benchmark other convolution algorithms e.g., winograd convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + void *workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); + 
checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - profileEvent("Conv_end", true); - - - #ifdef ERROR_INJECTION_ENABLED - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorConv", op_acc); - add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); - - op_counter++; - - #endif - - return output; } - - // NOTE: Supports Max and Avg Pooling -void* tensorPooling(void* input_ptr, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride){ +void *tensorPooling(void *input_ptr, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { INFO("*** TensorPooling \n"); profileEvent("Pool"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnPoolingDescriptor_t poolDesc; // FIXIT: Need to be more aware of the implications of alpha and beta @@ -285,83 +226,57 @@ void* tensorPooling(void* input_ptr, convertToFP32(input); - - checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); + checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; - int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride; + int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / + vertical_stride; h = h + 1; - 
int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride; + int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / + horizontal_stride; w = w + 1; - DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n", - n, c, h, w, input->dims.dim_sizes[2], input->dims.dim_sizes[3]); - + DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n", n, c, h, w, + input->dims.dim_sizes[2], input->dims.dim_sizes[3]); - Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *output = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w); // Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); // FIXIT: The output tensor is hardcoded to NCHW - checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - n, c, - h, w)); + checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc, CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, n, c, h, w)); // Select between Max-Pooling and Avg-Pooling cudnnPoolingMode_t pool_mode; - if(poolFunction == 0) + if (poolFunction == 0) pool_mode = CUDNN_POOLING_MAX; - else if(poolFunction == 1) + else if (poolFunction == 1) pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - - checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc, - pool_mode, - CUDNN_PROPAGATE_NAN, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride)); - - checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, input->tensor_desc, - input->gpu_data, &beta, output->tensor_desc, output->gpu_data)); - - profileEvent("Pool_end", true); + checkCUDNN(cudnnSetPooling2dDescriptor( + poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width, + vertical_pad, horizontal_pad, vertical_stride, horizontal_stride)); - #ifdef ERROR_INJECTION_ENABLED + 
checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, + input->tensor_desc, input->gpu_data, &beta, + output->tensor_desc, output->gpu_data)); - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorPooling", op_acc); - add_pool_overheads(input, window_height, vertical_stride, op_acc); - - op_counter++; - - #endif - - + profileEvent("Pool_end", true); return output; } - - - - -/* Reference Implementation based on: https://gist.github.com/peterwittek/6303527 */ -void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ +/* Reference Implementation based on: + * https://gist.github.com/peterwittek/6303527 */ +void *tensorGemmGPU(void *lhs_ptr, void *rhs_ptr) { INFO("*** TensorGemmGPU \n"); profileEvent("Mul"); - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; - + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims); DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims); @@ -371,30 +286,30 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ // 'm' holds the batch dimension - assuming NCHW format Tensors int m = lhs->dims.dim_sizes[0]; // The rhs last dimension must contain the neurons - int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons int k = 1; - + // Flattening the dimensions after the batch dimension // NOTE: Allowing any number of dimensions > 2 for lhs - for (int j = 1 ; j < lhs->dims.num_dims; j++){ + for (int j = 1; j < lhs->dims.num_dims; j++) { k = k * lhs->dims.dim_sizes[j]; // input neurons } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; // Dimension-note: Check if k is same across the two tensors DEBUG("m = %d, n = %d, k = %d \n", m, n, k); - if(rhs_k != 
k){ + if (rhs_k != k) { ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); } - Tensor* output = NULL; + Tensor *output = NULL; DEBUG("Creating new TENSOR * \n"); - output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1); + output = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1); - DEBUG("Changing placement *\n"); // Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); DEBUG("Changed Placement * \n\n"); @@ -404,175 +319,105 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ convertToFP32(lhs); convertToFP32(rhs); - DEBUG("CuBlasSgemm *\n"); - + // INFO: cuBlas uses column-major format // INFO: The leading dimension is just the FIRST Dimension - // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects - checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n, m, k, - &alpha, - (float*) rhs->gpu_data, n, - (float*) lhs->gpu_data, k, - &beta, - (float*) output->gpu_data, n)); - - - profileEvent("Mul_end", true); - - + // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN + // expects + checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, + &alpha, (float *)rhs->gpu_data, n, + (float *)lhs->gpu_data, k, &beta, + (float *)output->gpu_data, n)); - #ifdef ERROR_INJECTION_ENABLED - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorGemm", op_acc); - add_gemm_overheads(lhs_ptr, rhs_ptr, op_acc); - - op_counter++; - - #endif - - + profileEvent("Mul_end", true); return output; } - - - - - -void* tensorRelu(void* input_ptr){ +void *tensorRelu(void *input_ptr) { DEBUG("*** TensorRelu \n"); profileEvent("Relu"); - Tensor* input = (Tensor*) input_ptr; - + Tensor *input = (Tensor 
*)input_ptr; + cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); convertToFP32(input); - - + checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); profileEvent("Relu_end", true); - - - #ifdef ERROR_INJECTION_ENABLED - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - - void* error_norms = tensorAddError(input, op_acc); - add_norms(error_norms, "tensorRelu", op_acc); - add_relu_overheads(input, op_acc); - op_counter++; - #endif - - return input; } - // Think: Should Softmax be broken into multiple IR operations? 
-void* tensorSoftmax(void* input_ptr){ +void *tensorSoftmax(void *input_ptr) { INFO("*** TensorSoftmax \n"); profileEvent("Softmax"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - convertToFP32(input); - - checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + convertToFP32(input); - deviceToHostCopy(input); + checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); + + deviceToHostCopy(input); profileEvent("Softmax_end", true); - + return input; } - - - -void* tensorRelu2(void* input_ptr, float min, float max){ +void *tensorRelu2(void *input_ptr, float min, float max) { INFO("*** TensorClippedRelu *** \n"); profileEvent("Relu"); cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; - - Tensor* input = (Tensor*) input_ptr; + + Tensor *input = (Tensor *)input_ptr; hostToDeviceCopy(input); convertToFP32(input); - checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); - checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, - CUDNN_PROPAGATE_NAN, max)); + checkCUDNN(cudnnSetActivationDescriptor( + reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, max)); checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); - - profileEvent("Relu_end", true); - - - #ifdef ERROR_INJECTION_ENABLED - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - void* error_norms = tensorAddError(input, op_acc); - add_norms(error_norms, 
"tensorClippedRelu", op_acc); - add_relu_overheads(input, op_acc); - op_counter++; - #endif - - return input; } - -void* tensorTanh(void* input_ptr){ +void *tensorTanh(void *input_ptr) { INFO("*** TensorTanh \n"); profileEvent("Tanh"); - Tensor* input = (Tensor*) input_ptr; - + Tensor *input = (Tensor *)input_ptr; + cudnnActivationDescriptor_t tanhDesc; float alpha = 1.0f, beta = 0.0f; @@ -580,55 +425,36 @@ void* tensorTanh(void* input_ptr){ convertToFP32(input); - checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc)); checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha, - input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); profileEvent("Tanh_end", true); - - - #ifdef ERROR_INJECTION_ENABLED - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - void* error_norms = tensorAddError(input, op_acc); - add_norms(error_norms, "tensorTanh", op_acc); - add_relu_overheads(input, op_acc); - op_counter++; - #endif - - return input; } - - - -void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon){ +void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon) { INFO("*** TensorBatchNorm \n"); profileEvent("BatchNorm"); - Tensor* input = (Tensor*) input_ptr; - Tensor* gamma = (Tensor*) gamma_ptr; - Tensor* beta = (Tensor*) beta_ptr; - Tensor* mean = (Tensor*) mean_ptr; - Tensor* variance = (Tensor*) variance_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *gamma = (Tensor *)gamma_ptr; + Tensor *beta = (Tensor *)beta_ptr; + Tensor *mean = (Tensor *)mean_ptr; + Tensor *variance = (Tensor *)variance_ptr; - if (input == 
NULL || gamma == NULL || beta == NULL || mean == NULL || variance == NULL){ + if (input == NULL || gamma == NULL || beta == NULL || mean == NULL || + variance == NULL) { ERROR("NULL Input Tensor"); } - + float alpha_val = 1.0f, beta_val = 0.0f; hostToDeviceCopy(input); hostToDeviceCopy(gamma); @@ -638,149 +464,127 @@ void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, convertToFP32(input); - - - checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL, - &alpha_val, &beta_val, - input->tensor_desc, input->gpu_data, - input->tensor_desc, input->gpu_data, - gamma->tensor_desc, gamma->gpu_data, - beta->gpu_data, mean->gpu_data, - variance->gpu_data, - epsilon)); + checkCUDNN(cudnnBatchNormalizationForwardInference( + cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val, + input->tensor_desc, input->gpu_data, input->tensor_desc, input->gpu_data, + gamma->tensor_desc, gamma->gpu_data, beta->gpu_data, mean->gpu_data, + variance->gpu_data, epsilon)); profileEvent("BatchNorm_end", true); - - - #ifdef ERROR_INJECTION_ENABLED - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - int op_acc = op_accuracies[op_counter]; - void* error_norms = tensorAddError(input, op_acc); - add_norms(error_norms, "tensorBatchNorm", op_acc); - add_relu_overheads(input, op_acc); - op_counter++; - #endif - - return input; } - - - // TODO: benchmark performance of tensorSplit -void** tensorSplit(void* tensor_ptr, int num_splits, int split_dim){ +void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) { - INFO("*** TensorSplit \n"); + INFO("*** TensorSplit \n"); profileEvent("tensorSplit"); - Tensor* tensor = (Tensor*) tensor_ptr; - + Tensor *tensor = (Tensor *)tensor_ptr; + deviceToHostCopy(tensor); // Splitting done on the host - Tensor** splits = (Tensor**) malloc(sizeof(Tensor*) * num_splits); - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensor->dims.num_dims); - for(unsigned int i = 0; i < 
tensor->dims.num_dims; i++){ + Tensor **splits = (Tensor **)malloc(sizeof(Tensor *) * num_splits); + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * tensor->dims.num_dims); + for (unsigned int i = 0; i < tensor->dims.num_dims; i++) { dim_sizes[i] = tensor->dims.dim_sizes[i]; } - dim_sizes[split_dim] = tensor->dims.dim_sizes[split_dim] / num_splits; - if(dim_sizes[split_dim] < 1) + if (dim_sizes[split_dim] < 1) ERROR("Split Dimension < 1 after splitting"); size_t copy_size = getTypeSize(tensor->data_type); - for(unsigned int i = split_dim; i < tensor->dims.num_dims; i++){ + for (unsigned int i = split_dim; i < tensor->dims.num_dims; i++) { copy_size = copy_size * dim_sizes[i]; } - - for(unsigned int i = 0; i < num_splits; i++){ - DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n", - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); + for (unsigned int i = 0; i < num_splits; i++) { + + DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, " + "dim_sizes[3] = %d \n", + dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); + + Tensor *split = (Tensor *)create4DTensor( + tensor->data_type, tensor->data_format, dim_sizes[0], dim_sizes[1], + dim_sizes[2], dim_sizes[3]); - Tensor* split = (Tensor*) create4DTensor(tensor->data_type, tensor->data_format, - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); - size_t copy_start = i * copy_size; size_t copy_stride = num_splits * copy_size; - DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, tensor->size_in_bytes = %d \n", - copy_size, copy_start, copy_stride, tensor->size_in_bytes); + DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, " + "tensor->size_in_bytes = %d \n", + copy_size, copy_start, copy_stride, tensor->size_in_bytes); int index = 0; - while(copy_start + copy_size <= tensor->size_in_bytes){ - memcpy(((char*) split->host_data + (index * copy_size)), - ((char*)tensor->host_data + copy_start), - copy_size); + while (copy_start + 
copy_size <= tensor->size_in_bytes) { + memcpy(((char *)split->host_data + (index * copy_size)), + ((char *)tensor->host_data + copy_start), copy_size); copy_start += copy_stride; index++; } - - splits[i] = split; + + splits[i] = split; } profileEvent("tensorSplit_end", true); - return (void**) splits; + return (void **)splits; } +void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) { -void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){ - - INFO("*** TensorConcat \n"); + INFO("*** TensorConcat \n"); profileEvent("tensorConcat"); - Tensor** tensors = (Tensor**) tensors_ptr; + Tensor **tensors = (Tensor **)tensors_ptr; - for(int i = 0; i < num_splits; i++){ + for (int i = 0; i < num_splits; i++) { deviceToHostCopy(tensors[i]); // Concatenation done on the host } - + // The no of dimensions of concatenated tensor are the same - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensors[0]->dims.num_dims); - for(unsigned int i = 0; i < tensors[0]->dims.num_dims; i++){ + size_t *dim_sizes = + (size_t *)malloc(sizeof(size_t) * tensors[0]->dims.num_dims); + for (unsigned int i = 0; i < tensors[0]->dims.num_dims; i++) { dim_sizes[i] = tensors[0]->dims.dim_sizes[i]; } - + size_t copy_size = getTypeSize(tensors[0]->data_type); - for(unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++){ + for (unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++) { copy_size = copy_size * dim_sizes[i]; } dim_sizes[split_dim] = dim_sizes[split_dim] * num_splits; - if(dim_sizes[split_dim] < 1) + if (dim_sizes[split_dim] < 1) ERROR("Split Dimension < 1 after concat"); - Tensor* output = (Tensor*) create4DTensor(tensors[0]->data_type, tensors[0]->data_format, - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); - - DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n", - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); + Tensor *output = (Tensor *)create4DTensor( + tensors[0]->data_type, 
tensors[0]->data_format, dim_sizes[0], + dim_sizes[1], dim_sizes[2], dim_sizes[3]); + DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] " + "= %d \n", + dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); int num_copies = 1; - for(unsigned int i = 0; i < split_dim; i++){ + for (unsigned int i = 0; i < split_dim; i++) { num_copies = num_copies * dim_sizes[i]; } - + size_t copy_stride = num_splits * copy_size; - DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, output->size_in_bytes = %d \n", - copy_size, num_copies, copy_stride, output->size_in_bytes); + DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, " + "output->size_in_bytes = %d \n", + copy_size, num_copies, copy_stride, output->size_in_bytes); - for(unsigned int i = 0; i < num_copies; i++){ + for (unsigned int i = 0; i < num_copies; i++) { // FIXIT: Don't be specific to 4D tensors size_t copy_start = i * copy_stride; - - for(int j = 0; j < num_splits; j++){ - struct Tensor* split = tensors[j]; - memcpy(((char*) output->host_data + copy_start + (j * copy_size)), - ((char*) split->host_data + (i * copy_size)), - copy_size); - } + + for (int j = 0; j < num_splits; j++) { + struct Tensor *split = tensors[j]; + memcpy(((char *)output->host_data + copy_start + (j * copy_size)), + ((char *)split->host_data + (i * copy_size)), copy_size); + } } profileEvent("tensorConcat_end", true); @@ -788,15 +592,13 @@ void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){ return output; } +void *tensorLRN(void *input_ptr, unsigned int LRN_window, double LRN_alpha, + double LRN_beta, double LRN_k) { - -void* tensorLRN(void* input_ptr, unsigned int LRN_window, - double LRN_alpha, double LRN_beta, double LRN_k){ - - INFO("*** TensorLRN \n"); + INFO("*** TensorLRN \n"); profileEvent("tensorLRN"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; hostToDeviceCopy(input); @@ -804,29 +606,28 @@ void* tensorLRN(void* input_ptr, unsigned int 
LRN_window, cudnnLRNDescriptor_t LRNDesc; checkCUDNN(cudnnCreateLRNDescriptor(&LRNDesc)); - DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n", - LRN_window, LRN_alpha, LRN_beta, LRN_k); - - - checkCUDNN(cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k)); + DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n", LRN_window, + LRN_alpha, LRN_beta, LRN_k); + + checkCUDNN( + cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k)); - size_t* dim_sizes = input->dims.dim_sizes; - Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); + size_t *dim_sizes = input->dims.dim_sizes; + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, dim_sizes[0], + dim_sizes[1], dim_sizes[2], dim_sizes[3]); - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); printTensorDescInfo(input); printTensorDescInfo(output); - - checkCUDNN(cudnnLRNCrossChannelForward(cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, - &alpha, input->tensor_desc, input->gpu_data, - &beta, output->tensor_desc, output->gpu_data)); + + checkCUDNN(cudnnLRNCrossChannelForward( + cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, + input->tensor_desc, input->gpu_data, &beta, output->tensor_desc, + output->gpu_data)); profileEvent("tensorLRN_end", true); - + return output; } - - diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu index 079a9898294b01ba8dfcb575f11998790f24abfa..f6bfe700b44c88fea06c6a76267b49af4a523716 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu @@ -1,13 +1,12 @@ //===--------------------------- tensor_utils.cu --------------------------===// // 
//===----------------------------------------------------------------------===// -// +// // This file consists of the custom implementation of utility functions // useful for approximated and non-approximated versions of tensor operations. // //===----------------------------------------------------------------------===// - #include <stdio.h> #include <stdlib.h> #include <stdarg.h> @@ -42,18 +41,15 @@ #include "global_data.h" #include "fp16_gemm.h" +extern "C" { - -extern "C"{ - - -void freeTensor(void* tensor_ptr){ - Tensor* tensor = (Tensor*) tensor_ptr; +void freeTensor(void *tensor_ptr) { + Tensor *tensor = (Tensor *)tensor_ptr; tensors_ptr.erase(tensor->gpu_data); tensors_ptr.erase(tensor->gpu_half_data); host_ptr.erase(tensor->host_data); - + cudaFree(tensor->gpu_data); tensor->gpu_data = nullptr; cudaFree(tensor->gpu_half_data); @@ -62,43 +58,42 @@ void freeTensor(void* tensor_ptr){ tensor->host_data = nullptr; } - // Returns the size of the target datatype -int getTypeSize(int data_type){ +int getTypeSize(int data_type) { // TODO: Add support for more data types switch (data_type) { - case float_type: - return 4; - case double_type: - return 8; - case half_type: - return 2; - case int_type: - return 1; - case float2_type: - return 8; - case half2_type: - return 4; - default: - ERROR("Unknown type %d\n", data_type); + case float_type: + return 4; + case double_type: + return 8; + case half_type: + return 2; + case int_type: + return 1; + case float2_type: + return 8; + case half2_type: + return 4; + default: + ERROR("Unknown type %d\n", data_type); } return 0; } -static int getFullPrecTypeSize(int data_type){ +static int getFullPrecTypeSize(int data_type) { switch (data_type) { - case float_type: - case half_type: - return 4; - case double_type: - return 8; - case int_type: - return 1; - case float2_type: - case half2_type: - return 8; - default: - ERROR("Unknown type %d\n", data_type); + case float_type: + case half_type: + return 4; + case double_type: + 
return 8; + case int_type: + return 1; + case float2_type: + case half2_type: + return 8; + default: + ERROR("Unknown type %d\n", data_type); } return 0; } @@ -107,7 +102,7 @@ static bool isFP16Compound(int data_type) { return data_type == half_type || data_type == half2_type; } -void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){ +void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) { int type_size = getTypeSize(data_type); size_t size_in_bytes = type_size * num_elems; tensor->size_in_bytes = size_in_bytes; @@ -115,18 +110,20 @@ void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){ DEBUG("***--- size_in_bytes = %d \n", size_in_bytes); } - // NOTE: Always allocates FP32 on Host, FP32/FP16 for Device (GPU) -void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){ +void allocateMem(struct Tensor *tensor, int data_type, size_t num_elems) { setSizeInBytes(tensor, data_type, num_elems); tensor->data_type = data_type; - tensor->cur_type = data_type; // type maintained for hanlding FP32 <-> FP16 conversions + tensor->cur_type = + data_type; // type maintained for hanlding FP32 <-> FP16 conversions tensor->num_elems = num_elems; - - size_t size_on_host = num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32 - tensor->host_data = (void*) malloc(size_on_host); // Allocate memory on the host - tensor->data_placement = HOST; // By defaut data is on the host - + + size_t size_on_host = + num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32 + tensor->host_data = + (void *)malloc(size_on_host); // Allocate memory on the host + tensor->data_placement = HOST; // By defaut data is on the host + DEBUG("Attempting to Allocate = %lu \n\n\n", tensor->size_in_bytes); if (isFP16Compound(data_type)) { @@ -142,23 +139,25 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){ } tracked_tensors[tensor] = 1; // For FP16-FP32 data handling - + 
host_ptr.insert(tensor->host_data); obj_ptr.insert(tensor); - //host_ptr.push_back(tensor->host_data); + // host_ptr.push_back(tensor->host_data); } /// Two tensor formats are supported: NCHW and NHWC. /// TODO: Make this more general in the future. /// -void setCudnnDataFormat(struct Tensor* tensor, int data_format){ +void setCudnnDataFormat(struct Tensor *tensor, int data_format) { - switch(data_format){ + switch (data_format) { case 0: - data_format = CUDNN_TENSOR_NCHW; break; + data_format = CUDNN_TENSOR_NCHW; + break; case 1: - data_format = CUDNN_TENSOR_NHWC; break; - + data_format = CUDNN_TENSOR_NHWC; + break; + default: break; } @@ -167,39 +166,31 @@ void setCudnnDataFormat(struct Tensor* tensor, int data_format){ DEBUG("tensor->data_format = %d \n", tensor->data_format); } - -void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size){ +void set4DFilterDescriptor(struct Tensor *tensor, int data_format, + size_t dim1_size, size_t dim2_size, size_t dim3_size, + size_t dim4_size) { setCudnnDataFormat(tensor, data_format); - + checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_desc)); checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_half_desc)); - - checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_desc, - (cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type, - (cudnnTensorFormat_t) tensor->data_format, - dim1_size, - dim2_size, - dim3_size, - dim4_size)); - - checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_half_desc, - (cudnnDataType_t) CUDNN_DATA_HALF, - (cudnnTensorFormat_t) tensor->data_format, - dim1_size, - dim2_size, - dim3_size, - dim4_size)); + checkCUDNN(cudnnSetFilter4dDescriptor( + tensor->filter_desc, + (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type, + (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size, + dim4_size)); + checkCUDNN(cudnnSetFilter4dDescriptor( + tensor->filter_half_desc, 
(cudnnDataType_t)CUDNN_DATA_HALF, + (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size, + dim4_size)); } - - -void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size){ +void set4DTensorDescriptor(struct Tensor *tensor, int data_format, + size_t dim1_size, size_t dim2_size, size_t dim3_size, + size_t dim4_size) { setCudnnDataFormat(tensor, data_format); @@ -207,292 +198,270 @@ void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_half_desc)); - // For certain operations, the strides may need to change - in which case the descriptor - // needs to be reinitialized - cudnnSetTensor4dDescriptor(tensor->tensor_desc, - (cudnnTensorFormat_t) tensor->data_format, // Data format - (cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type, // Data type - dim1_size, dim2_size, - dim3_size, dim4_size); - + // For certain operations, the strides may need to change - in which case the + // descriptor needs to be reinitialized + cudnnSetTensor4dDescriptor( + tensor->tensor_desc, + (cudnnTensorFormat_t)tensor->data_format, // Data format + (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type, // Data type + dim1_size, dim2_size, dim3_size, dim4_size); - cudnnSetTensor4dDescriptor(tensor->tensor_half_desc, - (cudnnTensorFormat_t) tensor->data_format, // Data format - (cudnnDataType_t) CUDNN_DATA_HALF, // Data type - dim1_size, dim2_size, - dim3_size, dim4_size); + cudnnSetTensor4dDescriptor( + tensor->tensor_half_desc, + (cudnnTensorFormat_t)tensor->data_format, // Data format + (cudnnDataType_t)CUDNN_DATA_HALF, // Data type + dim1_size, dim2_size, dim3_size, dim4_size); - cudnnDataType_t dType; int nStride, cStride, hStride, wStride; int size1, size2, size3, size4; - cudnnGetTensor4dDescriptor(tensor->tensor_desc, - &dType, - &size1, &size2, &size3, &size4, - &nStride, &cStride, &hStride, 
&wStride); - - DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", - nStride, cStride, hStride, wStride); -} + cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2, + &size3, &size4, &nStride, &cStride, &hStride, + &wStride); + DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride, + cStride, hStride, wStride); +} // FIXIT: Striding still not working - hence 2D and 3D tensor support is missing -void setTensorDescriptor(struct Tensor* tensor, int num_dims, - size_t* dim_sizes){ +void setTensorDescriptor(struct Tensor *tensor, int num_dims, + size_t *dim_sizes) { checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_desc)); - int* strides = (int*) malloc(sizeof(int) * num_dims); + int *strides = (int *)malloc(sizeof(int) * num_dims); strides[num_dims - 1] = 1; - for(int i = num_dims - 2; i >= 0; i--){ - strides[i] = strides[i+1] * dim_sizes[i+1]; + for (int i = num_dims - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * dim_sizes[i + 1]; } - for(int i = 0; i < num_dims; i++){ + for (int i = 0; i < num_dims; i++) { DEBUG("strides[%d] = %d \n", i, strides[i]); } - int* const_dims = (int*) malloc(sizeof(int) * num_dims); - for(int j = 0 ; j < num_dims; j++){ - const_dims[j] = (int) dim_sizes[j]; + int *const_dims = (int *)malloc(sizeof(int) * num_dims); + for (int j = 0; j < num_dims; j++) { + const_dims[j] = (int)dim_sizes[j]; DEBUG("const_dim = %d \n", const_dims[j]); } - - DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type, CUDNN_DATA_FLOAT); - // For certain operations, the strides may need to change - in which case the descriptor - // needs to be reinitialized - checkCUDNN(cudnnSetTensorNdDescriptor(tensor->tensor_desc, - (cudnnDataType_t) tensor->data_type, // Data type - num_dims, - (const int*) const_dims, - (const int*) strides)); + + DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type, + CUDNN_DATA_FLOAT); + // For certain operations, the strides may need to change - in which case 
the + // descriptor needs to be reinitialized + checkCUDNN(cudnnSetTensorNdDescriptor( + tensor->tensor_desc, + (cudnnDataType_t)tensor->data_type, // Data type + num_dims, (const int *)const_dims, (const int *)strides)); } +/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors. -/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors. +void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size; + allocateMem(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 2); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 2; + return tensor; +} - void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size){ - struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size; - allocateMem(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 2); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 2; - - return tensor; - } +void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size, + size_t dim3_size) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size; + allocateMem(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 3); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 3; + + return tensor; +} +void *create4DTensor(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, size_t dim4_size) { + struct Tensor *tensor = (struct 
Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + allocateMem(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; + // Done setting tensor dimensions + // setTensorDescriptor(tensor, 4, dim_sizes); + set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, + dim4_size); + // FIXIT: filter descriptor should be invoked only for filters + set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, + dim4_size); + + return tensor; +} - void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size, - size_t dim3_size){ - struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size; - allocateMem(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 3); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 3; - - return tensor; - } +void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { + Tensor *tensor = (Tensor *)tensor_ptr; - void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size, - size_t dim3_size, size_t dim4_size){ - struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - allocateMem(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - 
tensor->dims.num_dims = 4; - // Done setting tensor dimensions - //setTensorDescriptor(tensor, 4, dim_sizes); - set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size); - // FIXIT: filter descriptor should be invoked only for filters - set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size); - - return tensor; + size_t host_size_in_bytes = tensor->num_elems * 4; + // if(tensor->size_in_bytes != size_in_bytes){ + if (host_size_in_bytes != size_in_bytes) { + ERROR("The destination and source sizes don't match"); } + std::memcpy(tensor->host_data, data_ptr, size_in_bytes); - void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){ + changeTensorPlacement(tensor, HOST); + + tensor->cur_type = float_type; +} - Tensor* tensor = (Tensor*) tensor_ptr; +void hostToDeviceCopy(struct Tensor *tensor) { - size_t host_size_in_bytes = tensor->num_elems * 4; - //if(tensor->size_in_bytes != size_in_bytes){ - if(host_size_in_bytes != size_in_bytes){ - ERROR("The destination and source sizes don't match"); - } - - std::memcpy(tensor->host_data, data_ptr, size_in_bytes); + if (tensor->data_placement != DEVICE) { + cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, + cudaMemcpyHostToDevice); + DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); + tensor->data_placement = DEVICE; + } else { + DEBUG("No data movement required - Data on Device \n"); + } +} - changeTensorPlacement(tensor, HOST); +void deviceToHostCopy(struct Tensor *tensor) { - tensor->cur_type = float_type; + if (tensor->data_placement != HOST) { + cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes, + cudaMemcpyDeviceToHost); + DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes); + tensor->data_placement = HOST; + } else { + DEBUG("No data movement required - Data on Host \n"); } +} - +// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){ - void 
hostToDeviceCopy(struct Tensor* tensor){ +void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) { - if(tensor->data_placement != DEVICE){ - cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, - cudaMemcpyHostToDevice); - DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); - tensor->data_placement = DEVICE; - } - else{ - DEBUG("No data movement required - Data on Device \n"); - } - - } + struct Tensor *srcTensor = (struct Tensor *)srcTensor_ptr; + struct Tensor *dstTensor = (struct Tensor *)dstTensor_ptr; + if (srcTensor->data_placement == HOST) { + memcpy(dstTensor->host_data, srcTensor->host_data, + srcTensor->size_in_bytes); + DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes); + dstTensor->data_placement = HOST; + } else if (srcTensor->data_placement == DEVICE) { + cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data, + srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice); + DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes); + dstTensor->data_placement = DEVICE; + } +} - void deviceToHostCopy(struct Tensor* tensor){ +void hpvm_request_tensor(void *tensor_ptr, int destination) { - if(tensor->data_placement != HOST){ + Tensor *tensor = (Tensor *)tensor_ptr; + // If destination is the host + if (destination == 0) { + if (tensor->data_placement != HOST) { cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes, - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost); DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes); tensor->data_placement = HOST; + } else { + DEBUG("No data movement required - Data on Host \n"); } - else{ - DEBUG("No data movement required - Data on Host \n"); - } - - } - - - //void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){ - - void tensorCopy(void* srcTensor_ptr, void* dstTensor_ptr){ - - struct Tensor* srcTensor = (struct Tensor*) srcTensor_ptr; - struct Tensor* dstTensor = (struct Tensor*) dstTensor_ptr; - - - 
if(srcTensor->data_placement == HOST){ - memcpy(dstTensor->host_data, srcTensor->host_data, srcTensor->size_in_bytes); - DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes); - dstTensor->data_placement = HOST; - } - else if (srcTensor->data_placement == DEVICE){ - cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data, srcTensor->size_in_bytes, - cudaMemcpyDeviceToDevice); - DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes); - dstTensor->data_placement = DEVICE; - } - } + // If destination is the GPU + else if (destination == 1) { - - void hpvm_request_tensor(void* tensor_ptr, int destination){ - - Tensor* tensor = (Tensor*) tensor_ptr; - // If destination is the host - if(destination == 0){ - if(tensor->data_placement != HOST){ - cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes, - cudaMemcpyDeviceToHost); - DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes); - tensor->data_placement = HOST; - } - else{ - DEBUG("No data movement required - Data on Host \n"); - } - } - // If destination is the GPU - else if(destination == 1){ - - if(tensor->data_placement != DEVICE){ - cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, - cudaMemcpyHostToDevice); - DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); - tensor->data_placement = DEVICE; - } - else{ - DEBUG("No data movement required - Data on Device \n"); - } + if (tensor->data_placement != DEVICE) { + cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, + cudaMemcpyHostToDevice); + DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); + tensor->data_placement = DEVICE; + } else { + DEBUG("No data movement required - Data on Device \n"); } - } +} +void convertToFP16(struct Tensor *tensor) { - - void convertToFP16(struct Tensor* tensor){ - - if(tensor == NULL) + if (tensor == NULL) return; - + if (tensor->cur_type == half_type) return; - + DEBUG("ConvertoFP16 \n"); 
setSizeInBytes(tensor, half_type, tensor->num_elems); size_t size_in_bytes = tensor->size_in_bytes; DEBUG("size_in_bytes = %d \n", size_in_bytes); - - if(tensor->gpu_half_data == NULL) - checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU - // If Tensor is one of Tracked (has to free per batch) then track all data types - if(tracked_tensors.find(tensor) != tracked_tensors.end()) + + if (tensor->gpu_half_data == NULL) + checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, + size_in_bytes)); // Allocate memory on GPU + // If Tensor is one of Tracked (has to free per batch) then track all data + // types + if (tracked_tensors.find(tensor) != tracked_tensors.end()) tensors_ptr.insert(tensor->gpu_half_data); - f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data); + f2h((float *)tensor->gpu_data, tensor->num_elems, + (half *)tensor->gpu_half_data); - tensor->cur_type = half_type; + tensor->cur_type = half_type; } +void convertToFP32(struct Tensor *tensor) { - -void convertToFP32(struct Tensor* tensor){ - - if(tensor == NULL) + if (tensor == NULL) return; - + // Need this check for both offline and online profiling path if (tensor->cur_type == float_type) return; - + DEBUG("ConvertoFP32 \n"); - + setSizeInBytes(tensor, float_type, tensor->num_elems); size_t size_in_bytes = tensor->size_in_bytes; - + // If FP32 data array doesn't exist, allocate - if(tensor->gpu_data == NULL){ - checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU + if (tensor->gpu_data == NULL) { + checkCudaErrors( + cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes); } - // If Tensor is one of Tracked (has to free per batch) then track all data types - if(tracked_tensors.find(tensor) != tracked_tensors.end()) + // If Tensor is one of Tracked (has to free per batch) then track all data + // types + if 
(tracked_tensors.find(tensor) != tracked_tensors.end()) tensors_ptr.insert(tensor->gpu_data); - h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data); + h2f((half *)tensor->gpu_half_data, tensor->num_elems, + (float *)tensor->gpu_data); tensor->cur_type = float_type; - } +void convertToFP32_offline(struct Tensor *tensor) { - -void convertToFP32_offline(struct Tensor* tensor){ - - if(tensor == NULL) + if (tensor == NULL) return; if (tensor->cur_type == half_type) @@ -504,36 +473,36 @@ void convertToFP32_offline(struct Tensor* tensor){ size_t size_in_bytes = tensor->size_in_bytes; // If FP32 data array doesn't exist, allocate - if(tensor->gpu_data == NULL){ - checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU + if (tensor->gpu_data == NULL) { + checkCudaErrors( + cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes); } - // If Tensor is one of Tracked (has to free per batch) then track all data types - if(tracked_tensors.find(tensor) != tracked_tensors.end()) + // If Tensor is one of Tracked (has to free per batch) then track all data + // types + if (tracked_tensors.find(tensor) != tracked_tensors.end()) tensors_ptr.insert(tensor->gpu_data); - h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data); + h2f((half *)tensor->gpu_half_data, tensor->num_elems, + (float *)tensor->gpu_data); tensor->cur_type = float_type; - + cudaFree(tensor->gpu_half_data); tensors_ptr.erase(tensor->gpu_half_data); tensor->gpu_half_data = NULL; } - - - - // Called from within the runtime to change the data placement -// This routine is required to change the output data placements from host to device -void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement){ +// This routine is required to change the output data placements from host to +// device +void changeTensorPlacement(struct 
Tensor *tensor, + data_location_t data_placement) { - if(tensor == NULL) + if (tensor == NULL) ERROR("Tensor == NULL"); tensor->data_placement = data_placement; } - } // end of Extern"C" diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu index 5cdfdf5a55109fac66a89f544306fbe7b4b9562a..8c77234e2432bd5fe1cde144b031d42273140d42 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu @@ -1,13 +1,13 @@ //===--------------------------- wrapper_runtime.cu -----------------------===// // //===----------------------------------------------------------------------===// -// -// This file contains the implementation of some of the core API to tensor runtime -// so that runtime tuning of approximations can be done on different targets. +// +// This file contains the implementation of some of the core API to tensor +// runtime so that runtime tuning of approximations can be done on different +// targets. 
// //===----------------------------------------------------------------------===// - #include <stdio.h> #include <cstdio> #include <cstdlib> @@ -24,7 +24,6 @@ #include <cuda_fp16.h> #include <driver_types.h> - // Tensor runtime header files #include "tensor_utils.h" #include "debug.h" @@ -37,641 +36,580 @@ #include "half_precision_api.h" #include "hpvm-rt-controller.h" -#include "approxhpvm_runtime_utils.h" +#include "approxhpvm_runtime_utils.h" #include "approx_api.h" - -extern "C"{ - - /**** Wrapper Runtime API ***/ - - - void* wrapper_ConvLayer(const char* hpvm_node_id, - void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, - // NOTE: out_min, out_max are only relevant for ClippedRelu - float out_min, float out_max){ - - NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); - - if (NodeConf->isGPUNodeConfiguration()) { - DEBUG("GPU Configuration for ConvLayer\n"); - // Mapped to GPU - get a GPU node configuration - GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; - - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Check for convolution as first operation - CUSTOM_ASSERT((ApproxChoices.size() >= 1) && - (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && - "Incorrect number/type of operations in provided Conv layer configuration"); - - void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second, - input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w); - void* add_out; - if (bias != NULL) { - // Check for add as second operation - CUSTOM_ASSERT((ApproxChoices.size() >= 2) && - (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && - "Incorrect number/type of operations in provided Conv layer configuration"); - add_out = 
handleTensorAddApproximationTuples(ApproxChoices[1].second, - conv_out, bias); - } else { - add_out = conv_out; - } - - void* activation_out; - switch (activation_id) { - case -1: - { // No activation - //INFO("No activation Function\n"); - activation_out = add_out; - } - break; - case 0: - { // TanH activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 1: - { // ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 2: - { // Clipped ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = - handleTensorClippedReluApproximationTuples(ApproxChoices[2].second, - add_out, out_min, out_max); - } - break; - default: - { - ERROR("Activation id %d NOT supported \n", activation_id); - } - break; - } - - void* pool_out; - - if (pool_size > 0) { - switch (pool_id) { - case 0: - { - // If we remove the asserts, we can have all cases handled by a single call - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && - "Expected POOL_MAX in provided Conv layer configuration"); - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size, pool_size, 0, 0, - pool_size, pool_size); - } - break; - case 1: - { - CUSTOM_ASSERT((ApproxChoices.back().first == 
GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && - "Expected POOL_MEAN in provided Conv layer configuration"); - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size, pool_size, 0, 0, - pool_size, pool_size); - } - break; - case 2: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && - "Expected POOL_MIN in provided Conv layer configuration"); - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size, pool_size, 0, 0, - pool_size, pool_size); - } - break; - default: - { - ERROR("Pool id %d NOT supported \n", pool_id); - } - break; - } - } else { - pool_out = activation_out; - } - return pool_out; +extern "C" { + +/**** Wrapper Runtime API ***/ + +void * +wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter, + void *bias, int conv_pad_h, int conv_pad_w, int conv_stride_h, + int conv_stride_w, int pool_id, int pool_size, + int activation_id, + // NOTE: out_min, out_max are only relevant for ClippedRelu + float out_min, float out_max) { + + NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); + + if (NodeConf->isGPUNodeConfiguration()) { + DEBUG("GPU Configuration for ConvLayer\n"); + // Mapped to GPU - get a GPU node configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; + + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // Check for convolution as first operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 1) && + (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + + void *conv_out = handleTensorConvApproximationTuples( + ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, 
conv_stride_w); + void *add_out; + if (bias != NULL) { + // Check for add as second operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 2) && + (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, + conv_out, bias); + } else { + add_out = conv_out; + } + + void *activation_out; + switch (activation_id) { + case -1: { // No activation + // INFO("No activation Function\n"); + activation_out = add_out; + } break; + case 0: { // TanH activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 1: { // ReLU activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 2: { // Clipped ReLU activation + CUSTOM_ASSERT((ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && + "Incorrect number/type of operations in provided Conv " + "layer configuration"); + activation_out = handleTensorClippedReluApproximationTuples( + ApproxChoices[2].second, add_out, out_min, out_max); + } break; + default: { + ERROR("Activation id %d NOT supported \n", activation_id); + } break; + } + + void *pool_out; + + if (pool_size > 0) { + switch (pool_id) { + case 0: { + // If we remove the asserts, we can have all cases handled by a single + // call + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && + 
"Expected POOL_MAX in provided Conv layer configuration"); + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size, + pool_size, 0, 0, pool_size, pool_size); + } break; + case 1: { + CUSTOM_ASSERT( + (ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && + "Expected POOL_MEAN in provided Conv layer configuration"); + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size, + pool_size, 0, 0, pool_size, pool_size); + } break; + case 2: { + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && + "Expected POOL_MIN in provided Conv layer configuration"); + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size, + pool_size, 0, 0, pool_size, pool_size); + } break; + default: { + ERROR("Pool id %d NOT supported \n", pool_id); + } break; } - else { - ERROR("Unsupported Configuration"); - abort(); - } - - return NULL; + } else { + pool_out = activation_out; + } + return pool_out; + } else { + ERROR("Unsupported Configuration"); + abort(); } + return NULL; +} - - - - void* wrapper_ConvLayer2(const char* hpvm_node_id, - void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, - int pool_size_v, int pool_size_h, - int pool_pad_v, int pool_pad_h, - int pool_stride_v, int pool_stride_h, - int activation_id, - // NOTE: out_min, out_max are only relevant for ClippedRelu - float out_min, float out_max){ - - INFO ("*** Conv Layer \n"); - - NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); - if (NodeConf->isGPUNodeConfiguration()) { - DEBUG("GPU Configuration for ConvLayer\n"); - // Mapped to GPU - get a GPU node configuration - GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; - - std::vector< std::pair< 
GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - - //printf("*** Convolution \n ApproxChoice = %d \n BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first, - // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, - // GPUNodeConfiguration::TENSOR_OP::CONV); - - // Check for convolution as first operation - CUSTOM_ASSERT((ApproxChoices.size() >= 1) && - (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && - "Incorrect number/type of operations in provided Conv layer configuration"); - - - - void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second, - input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w); - void* add_out; - if (bias != NULL) { - // Check for add as second operation - CUSTOM_ASSERT((ApproxChoices.size() >= 2) && - (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && - "Incorrect number/type of operations in provided Conv layer configuration"); - add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, - conv_out, bias); - } else { - add_out = conv_out; - } - - void* activation_out; - switch (activation_id) { - case -1: - { // No activation - //INFO("No activation Function\n"); - activation_out = add_out; - } - break; - case 0: - { // TanH activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 1: - { // ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - 
case 2: - { // Clipped ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = - handleTensorClippedReluApproximationTuples(ApproxChoices[2].second, - add_out, out_min, out_max); - } - break; - default: - { - ERROR("Activation id %d NOT supported \n", activation_id); - } - break; - } - - void* pool_out; - - if (pool_size_v > 0) { - switch (pool_id) { - case 0: - { - // If we remove the asserts, we can have all cases handled by a single call - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && - "Expected POOL_MAX in provided Conv layer configuration"); - - pool_out = handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size_v, pool_size_h, - pool_pad_v, pool_pad_h, - pool_stride_v, pool_stride_h); - - - } - break; - case 1: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && - "Expected POOL_MEAN in provided Conv layer configuration"); - - // FIXIT: POOL_MEAN still needs fixing - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size_v, pool_size_h, - 0, 0, - pool_size_v, pool_size_h); - - } - break; - case 2: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && - "Expected POOL_MIN in provided Conv layer configuration"); - - // FIXIT: Pool_MEAN needs fixing - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size_v, pool_size_h, 0, 0, - pool_size_v, pool_size_h); - } - break; - default: - { - ERROR("Pool id %d NOT supported \n", pool_id); - } - break; - } - } else { - pool_out = activation_out; - } - return pool_out; - } - else { - ERROR("Unsupported Configuration"); - abort(); 
+void *wrapper_ConvLayer2( + const char *hpvm_node_id, void *input, void *filter, void *bias, + int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v, + int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id, + // NOTE: out_min, out_max are only relevant for ClippedRelu + float out_min, float out_max) { + + INFO("*** Conv Layer \n"); + + NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); + if (NodeConf->isGPUNodeConfiguration()) { + DEBUG("GPU Configuration for ConvLayer\n"); + // Mapped to GPU - get a GPU node configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; + + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // printf("*** Convolution \n ApproxChoice = %d \n BatchNorm = %d \n CONV = + // %d \n", ApproxChoices[0].first, + // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, + // GPUNodeConfiguration::TENSOR_OP::CONV); + + // Check for convolution as first operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 1) && + (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + + void *conv_out = handleTensorConvApproximationTuples( + ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w); + void *add_out; + if (bias != NULL) { + // Check for add as second operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 2) && + (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, + conv_out, bias); + } else { + add_out = conv_out; + } + + void *activation_out; + switch (activation_id) { + case -1: { // No 
activation + // INFO("No activation Function\n"); + activation_out = add_out; + } break; + case 0: { // TanH activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 1: { // ReLU activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 2: { // Clipped ReLU activation + CUSTOM_ASSERT((ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && + "Incorrect number/type of operations in provided Conv " + "layer configuration"); + activation_out = handleTensorClippedReluApproximationTuples( + ApproxChoices[2].second, add_out, out_min, out_max); + } break; + default: { + ERROR("Activation id %d NOT supported \n", activation_id); + } break; + } + + void *pool_out; + + if (pool_size_v > 0) { + switch (pool_id) { + case 0: { + // If we remove the asserts, we can have all cases handled by a single + // call + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && + "Expected POOL_MAX in provided Conv layer configuration"); + + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size_v, + pool_size_h, pool_pad_v, pool_pad_h, pool_stride_v, pool_stride_h); + + } break; + case 1: { + CUSTOM_ASSERT( + (ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && + "Expected POOL_MEAN in provided Conv layer configuration"); + + // FIXIT: POOL_MEAN still needs fixing + pool_out = 
handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size_v, + pool_size_h, 0, 0, pool_size_v, pool_size_h); + + } break; + case 2: { + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && + "Expected POOL_MIN in provided Conv layer configuration"); + + // FIXIT: Pool_MEAN needs fixing + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size_v, + pool_size_h, 0, 0, pool_size_v, pool_size_h); + } break; + default: { + ERROR("Pool id %d NOT supported \n", pool_id); + } break; } - - return NULL; + } else { + pool_out = activation_out; + } + return pool_out; + } else { + ERROR("Unsupported Configuration"); + abort(); } + return NULL; +} - - - - - void* wrapper_FCLayer(const char* hpvm_node_id, - void* input, - void* weights, - void* bias, - int activation_id, - // NOTE: out_min and out_max are only relevant for ClippedRelu - float out_min, float out_max){ - - INFO ("*** Dense Layer \n"); - - NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); - if (NodeConf->isGPUNodeConfiguration()) { - DEBUG("GPU Configuration for FCLayer\n"); - // Mapped to GPU - get a GPU node configuration - GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; - - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a FC wrapper operation - CUSTOM_ASSERT((ApproxChoices.size() == 2 || ApproxChoices.size() == 3) && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL && - ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD && - "Invalid configuration generated for FC layer wrapper operation"); - - void* gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second, - input, weights); - void* add_out = 
handleTensorAddApproximationTuples(ApproxChoices[1].second, - gemm_out, bias); - - void* activation_out; - switch (activation_id) { - case -1: - { // No activation - CUSTOM_ASSERT((ApproxChoices.size() == 2) && - "Incorrect number of operations in provided FC layer configuration"); - //INFO("No activation Function\n"); - activation_out = add_out; - } - break; - case 0: - { // TanH activation - CUSTOM_ASSERT((ApproxChoices.size() == 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && - "Incorrect number/type of operations in provided FC layer configuration"); - activation_out = handleTensorTanhApproximationTuples(ApproxChoices[1].second, - add_out); - } - break; - case 1: - { // ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() == 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && - "Incorrect number/type of operations in provided FC layer configuration"); - activation_out = handleTensorReluApproximationTuples(ApproxChoices[1].second, - add_out); - } - break; - case 2: - { // Clipped ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() == 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && - "Incorrect number/type of operations in provided FC layer configuration"); - activation_out = - handleTensorClippedReluApproximationTuples(ApproxChoices[1].second, - add_out, out_min, out_max); - } - break; - default: - { - ERROR("Activation id %d NOT supported \n", activation_id); - } - break; - } - return activation_out; - } - else { - ERROR("Unsupported Configuration"); - abort(); - } - - return NULL; +void * +wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights, + void *bias, int activation_id, + // NOTE: out_min and out_max are only relevant for ClippedRelu + float out_min, float out_max) { + + INFO("*** Dense Layer \n"); + + NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); + if (NodeConf->isGPUNodeConfiguration()) { + DEBUG("GPU Configuration for 
FCLayer\n"); + // Mapped to GPU - get a GPU node configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; + + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // Approximation choices must be for a FC wrapper operation + CUSTOM_ASSERT( + (ApproxChoices.size() == 2 || ApproxChoices.size() == 3) && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL && + ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD && + "Invalid configuration generated for FC layer wrapper operation"); + + void *gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second, + input, weights); + void *add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, + gemm_out, bias); + + void *activation_out; + switch (activation_id) { + case -1: { // No activation + CUSTOM_ASSERT( + (ApproxChoices.size() == 2) && + "Incorrect number of operations in provided FC layer configuration"); + // INFO("No activation Function\n"); + activation_out = add_out; + } break; + case 0: { // TanH activation + CUSTOM_ASSERT( + (ApproxChoices.size() == 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && + "Incorrect number/type of operations in provided FC layer " + "configuration"); + activation_out = + handleTensorTanhApproximationTuples(ApproxChoices[1].second, add_out); + } break; + case 1: { // ReLU activation + CUSTOM_ASSERT( + (ApproxChoices.size() == 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && + "Incorrect number/type of operations in provided FC layer " + "configuration"); + activation_out = + handleTensorReluApproximationTuples(ApproxChoices[1].second, add_out); + } break; + case 2: { // Clipped ReLU activation + CUSTOM_ASSERT((ApproxChoices.size() == 3) && + (ApproxChoices[2].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && + "Incorrect number/type of 
operations in provided FC layer " + "configuration"); + activation_out = handleTensorClippedReluApproximationTuples( + ApproxChoices[1].second, add_out, out_min, out_max); + } break; + default: { + ERROR("Activation id %d NOT supported \n", activation_id); + } break; + } + return activation_out; + } else { + ERROR("Unsupported Configuration"); + abort(); } + return NULL; +} +void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr) { + INFO("*** Relu Operation \n"); - void* wrapper_tensorRelu(const char* hpvm_node_id, void* input_ptr){ - - INFO("*** Relu Operation \n"); - - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a relu operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU && - "Invalid configuration generated for tensor relu wrapper operation"); + // Approximation choices must be for a relu operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU && + "Invalid configuration generated for tensor relu wrapper operation"); - return handleTensorReluApproximationTuples(ApproxChoices[0].second, - input_ptr); - - } + return handleTensorReluApproximationTuples(ApproxChoices[0].second, + input_ptr); +} - void* wrapper_tensorClippedRelu(const char* hpvm_node_id, - void* input_ptr, - float out_min, float out_max){ - // Only mapped to GPU - get a GPU configuration - 
GPUNodeConfiguration *GPUConf = +void *wrapper_tensorClippedRelu(const char *hpvm_node_id, void *input_ptr, + float out_min, float out_max) { + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a relu operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU && - "Invalid configuration generated for tensor clipped relu wrapper operation"); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); - return handleTensorClippedReluApproximationTuples(ApproxChoices[0].second, - input_ptr, out_min, out_max); + // Approximation choices must be for a relu operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU && + "Invalid configuration generated for tensor clipped relu " + "wrapper operation"); - } + return handleTensorClippedReluApproximationTuples( + ApproxChoices[0].second, input_ptr, out_min, out_max); +} - void* wrapper_tensorTanh(const char* hpvm_node_id, void* input_ptr){ - // return tensorTanh(input_ptr); +void *wrapper_tensorTanh(const char *hpvm_node_id, void *input_ptr) { + // return tensorTanh(input_ptr); - GPUNodeConfiguration *GPUConf = + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + 
std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a tanh operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH && - "Invalid configuration generated for tensor tanh wrapper operation"); - - return handleTensorTanhApproximationTuples(ApproxChoices[0].second, - input_ptr); - - } + // Approximation choices must be for a tanh operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH && + "Invalid configuration generated for tensor tanh wrapper operation"); + return handleTensorTanhApproximationTuples(ApproxChoices[0].second, + input_ptr); +} - void* wrapper_tensorBatchNorm(const char* hpvm_node_id, - void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon){ +void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr, + void *gamma_ptr, void *beta_ptr, void *mean_ptr, + void *variance_ptr, double epsilon) { - INFO("*** BatchNorm Operation \n"); + INFO("*** BatchNorm Operation \n"); - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = - GPUConf->getApproxChoices(); + GPUConf->getApproxChoices(); - // printf("*** BatchNorm \n ApproxChoice = %d \n BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first, - // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, - // GPUNodeConfiguration::TENSOR_OP::CONV); + // printf("*** BatchNorm \n 
ApproxChoice = %d \n BatchNorm = %d \n CONV = %d + // \n", ApproxChoices[0].first, + // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, + // GPUNodeConfiguration::TENSOR_OP::CONV); - // Approximation choices must be for a batchnorm operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM && - "Invalid configuration generated for tensor batchnorm wrapper operation"); - - return handleTensorBatchNormApproximationTuples(ApproxChoices[0].second, - input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - - } + // Approximation choices must be for a batchnorm operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM && + "Invalid configuration generated for tensor batchnorm wrapper operation"); + return handleTensorBatchNormApproximationTuples( + ApproxChoices[0].second, input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); +} - void* wrapper_tensorAdd(const char* hpvm_node_id, void* input_ptr, void* bias_ptr){ +void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr, + void *bias_ptr) { - - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = - GPUConf->getApproxChoices(); + GPUConf->getApproxChoices(); - // Approximation choices must be for an add operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD && - "Invalid configuration generated for tensor add wrapper operation"); + // 
Approximation choices must be for an add operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD && + "Invalid configuration generated for tensor add wrapper operation"); - return handleTensorAddApproximationTuples(ApproxChoices[0].second, - input_ptr, bias_ptr); - - } + return handleTensorAddApproximationTuples(ApproxChoices[0].second, input_ptr, + bias_ptr); +} +void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr, + int poolFunction, int window_height, + int window_width, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride) { - void* wrapper_tensorPooling(const char* hpvm_node_id, - void* input_ptr, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride){ + INFO("*** TensorPooling Operation \n"); - INFO("*** TensorPooling Operation \n"); - - // return tensorPooling(input_ptr, poolFunction, window_height, window_width, - // vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + // return tensorPooling(input_ptr, poolFunction, window_height, window_width, + // vertical_pad, horizontal_pad, vertical_stride, + // horizontal_stride); - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - - GPUConf->getApproxChoices(); - - // Approximation choices must be for a single operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - "Invalid configuration generated for tensor pool wrapper operation"); - enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first; - // Approximation choices must be for a pool operation - CUSTOM_ASSERT((top 
== GPUNodeConfiguration::TENSOR_OP::POOL_MAX || - top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN || - top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && - "Invalid configuration generated for tensor pool wrapper operation"); - - return handleTensorPoolingApproximationTuples(ApproxChoices[0].second, - input_ptr, poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - - } - + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = + + GPUConf->getApproxChoices(); + + // Approximation choices must be for a single operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + "Invalid configuration generated for tensor pool wrapper operation"); + enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first; + // Approximation choices must be for a pool operation + CUSTOM_ASSERT( + (top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX || + top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN || + top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && + "Invalid configuration generated for tensor pool wrapper operation"); + + return handleTensorPoolingApproximationTuples( + ApproxChoices[0].second, input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride); +} - void* wrapper_tensorGroupConvolution(const char* hpvm_node_id, - void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = +void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input, + void *filter, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = 
(GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a group_conv operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::GROUP_CONV && - "Invalid configuration generated for tensor group_conv wrapper operation"); - - return handleTensorGroupConvApproximationTuples(ApproxChoices[0].second, - input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - - } - - + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // Approximation choices must be for a group_conv operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == + GPUNodeConfiguration::TENSOR_OP::GROUP_CONV && + "Invalid configuration generated for tensor group_conv wrapper " + "operation"); + + return handleTensorGroupConvApproximationTuples( + ApproxChoices[0].second, input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, conv_mode, conv_groups); +} - void* wrapper_tensorSoftmax(const char* hpvm_node_id, void* input_ptr){ - // return tensorSoftmax(input_ptr); +void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr) { + // return tensorSoftmax(input_ptr); - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be 
for a softmax operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX && - "Invalid configuration generated for tensor softmax wrapper operation"); - - return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second, input_ptr); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + // Approximation choices must be for a softmax operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX && + "Invalid configuration generated for tensor softmax wrapper operation"); - } - - - - void* tensor_set_node_id(unsigned int node_id){ + return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second, + input_ptr); +} - currentTensorID = node_id; +void *tensor_set_node_id(unsigned int node_id) { - return NULL; - } + currentTensorID = node_id; + return NULL; +} } diff --git a/hpvm/projects/keras/README.md b/hpvm/projects/keras/README.md index 4abb5563fb1e0c0749c9bc67c9d7debe5adce93e..e3cd3b8b19f0df790867f403fbf9a2770c0fee89 100644 --- a/hpvm/projects/keras/README.md +++ b/hpvm/projects/keras/README.md @@ -43,6 +43,18 @@ python setup.py install ``` **NOTE:** This step must be performed each time (for each shell process) the frontend is to be used. + +## Download CNN Model Files + +The weight (model) and data files to use with the CNN benchmarks are hosted on Git LFS and need to separately downloaded. 
This can be done using: + +``` +git lfs fetch +git lfs checkout +``` + +**NOTE:** Data download is necessary before running benchmarks + ## Running Benchmaks Benchmarks under `./src/` diff --git a/hpvm/projects/keras/docs/Support.md b/hpvm/projects/keras/docs/Support.md index a31d012d0bbed679445cacd0760fd7295a8e7088..e5e7b1a1a2125940cd0749e9c957c43bf2205aa3 100644 --- a/hpvm/projects/keras/docs/Support.md +++ b/hpvm/projects/keras/docs/Support.md @@ -1,5 +1,4 @@ - ## Supported Keras Operators The Keras frontend supports `Sequential()` Keras models. @@ -23,7 +22,19 @@ The list of supported operations is as follows: ## Limitations -* We support convolutional neural networks that include the supported operators above - RNNs/LSTMS are not supported +* Currently, we support Convolutional Neural Networks (CNNs) that include the supported operators (above) - RNNs/LSTMs are not supported * We currently only support models in NCHW format (NHWC is not supported) * Softmax operator should be the last operation in the CNN pipeline -* Softmax operation must be a separate operator (not specified as activation to another type of Keras operator) +* Softmax operation must be a separate operator (not specified as activation to another type of Keras operator).
Example of what works: + +``` +Activation ("softmax") +``` + +Example of what is NOT supported: + +``` +Dense(num_classes, activation="softmax") +``` + + diff --git a/hpvm/projects/pred_tuner/.gitignore b/hpvm/projects/pred_tuner/.gitignore deleted file mode 100644 index 23e6d258015162d516c02fecb0a4f87acf4fb73d..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/.gitignore +++ /dev/null @@ -1,28 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# Jupyter Notebook -.ipynb_checkpoints - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Opentuner -opentuner.db/ -opentuner.log - -# Custom -.idea/ -.vscode/ -/data/ -results/ -tuner_results -tuner_results/ -*.sh -*.ipynb -logistics/ -autotuner/ diff --git a/hpvm/projects/pred_tuner/LICENSE b/hpvm/projects/pred_tuner/LICENSE deleted file mode 100644 index 2e229faa39851c4ddf71b0284c7e56a02dfd577a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2017 liukuang - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/hpvm/projects/pred_tuner/README.md b/hpvm/projects/pred_tuner/README.md deleted file mode 100644 index 8d7a6db2bdc622e6cac73c56e443e8d3e797133c..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/README.md +++ /dev/null @@ -1,93 +0,0 @@ -# Autotuning with Error-predictive Proxy - -Performs autotuning on program approximation knobs using an error-predictive proxy in place of the original -program, to greatly speedup autotuning while getting results comparable in quality. - -Work in progress. - -## Getting Started - -After finishing this readme, go to [./proxy_tuner.py](./proxy_tuner.py) to try tuning one -model. Use this set of arguments for a start: - -```bash -python proxy_tuner.py --test-limit 1000 --accuracy-drop 1.5 --accuracy-slack 2.1 \ --o tuner_output alexnet2 autotuner/data/alexnet2 -``` - -## Supported Programs & Approximations - -### Programs - -Currently DNN only. Support for several image processing benchmarks are in progress. - -Supported DNNs: - -- `LeNet @ MNIST` - -- `AlexNet @ CIFAR-10` - -- `AlexNet2 @ CIFAR-10` - -- `VGG16 @ CIFAR-10` - -- `ResNet18 @ CIFAR-10` - -- `MobileNet @ CIFAR-10` - -- `VGG16 @ CIFAR-100` - -- `VGG16 @ ImageNet` - -- `ResNet50 @ ImageNet` - -### Approximations - -Currently _hardware-independent_ approximations only. Hardware-reliant approximations are in progress. - -Approximations: (output) perforation for convolution, kernel sampling for convolution. - -## Proxy Model - -TODO: add working principle of proxy modeling. - -## Autotuner - -We use [opentuner](http://opentuner.org/) for autontuning tasks. 
- -## Project Structure - -### Library - -- `models`: PyTorch definition for DNN models - - - `models/dataset`: Dataset loaders for both HPVM and PyTorch-standard DNN models - - - `models/hpvm`: Definition for HPVM-ported models, with customized convolution layers - -- `toolkit`: core code of project, including DNN indexing / transformations / approximations. See - the code for details. - -### Entry Point - -- `./proxy_tuner.py`: perform autotuning for a given model, accuracy threshold, and a number of iterations, - using a proxy model that predicts the accuracy of approximated DNN (instead of running an inference, which - can be slow). - -- `./run_proxy_tuner.py`: run autotuning for all models defined in `utils/tuner_postprocess/benchmarks.py` on - a set of 3 accuracy thresholds, and perform postprocessing such as computing pareto curve. - - This is the right end-to-end script to use for obtaining a comprehensive set of autotuner results. - -### Other Code - -- `tests`: runnable scripts that can be used as tests (and other actual functionalities) - -- `utils`: helper functions for library and autotuner that are generally standalone, except - - - `utils/utils.py` contains some convenient wrapper for model training, etc. that depends on the library. - -### Data - -- `autotuner/data`: descriptions for each DNN model, such as listing of layers, tunable - knobs, etc. 
diff --git a/hpvm/projects/pred_tuner/bin/benchmark.py b/hpvm/projects/pred_tuner/bin/benchmark.py deleted file mode 100644 index 92c8b2de5262469d9b752b5a2acd28db55e464a5..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/benchmark.py +++ /dev/null @@ -1,111 +0,0 @@ -import gc -from time import time -from typing import Dict, Iterator, List - -import numpy -from tqdm import tqdm - -from exp import Benchmark, bench_tuner_data -from toolkit import ConfigT, LinearCombEstimator, LinearEstimator, LinearQoSEstimator, ModuleIndexer, \ - NetApproxSelector -from utils import gpu_mem_mb, init_by_name, nn_to_output, tensor_to_accuracy - - -def generate_random_configs(layer_approxes: Dict[int, List[int]], n_configs: int) -> Iterator[ConfigT]: - from numpy.random import choice - from random import randrange - all_layers = [k for k, ns in layer_approxes.items() if ns] - for _ in range(n_configs): - config = {} - n_approx_layers_ = randrange(len(all_layers) + 1) - approx_layers = choice(all_layers, n_approx_layers_, replace=False) - for layer_idx in approx_layers: - config[layer_idx] = choice(layer_approxes[layer_idx], 1)[0] - yield config - - -def time_action(action): - tt0 = time() - action() - tt1 = time() - return tt1 - tt0 - - -def mean_std_str(np_array): - return f"{np_array.mean():.7f} +- {np_array.std():.7f}" - - -def main_loop(bench, baseline_dag, testloader): - _t_baseline_inf = time() - baseline_output = nn_to_output(baseline_dag.module, testloader) - baseline_acc = tensor_to_accuracy(baseline_output, testloader) - print(f"Model accuracy: {baseline_acc}; test set size: {baseline_output.size(0)}") - t_baseline_inf = time() - _t_baseline_inf - nas = NetApproxSelector(baseline_dag) - - def acc_crit(inputs_): - return tensor_to_accuracy(inputs_, testloader) - - def threshold_eval(inputs_): - import numpy as np - accs = np.array([acc_crit(x) for x in inputs_]) - return baseline_acc - accs.mean() < 3.0 - - def run_model(net): - return 
nn_to_output(net, testloader) - - _t_profile = time() - pickle_path = bench.result_dir / 'proxy.pkl' - f1 = LinearCombEstimator( - nas, run_model, acc_crit, threshold_eval, 0.95, independent_init=False - ) - f2 = LinearQoSEstimator( - nas, run_model, acc_crit, threshold_eval, 0.95, independent_init=False - ) - LinearEstimator.coinit_estimators(nas, run_model, threshold_eval, f1, f2, storage=pickle_path) - t_profile = time() - _t_profile - print( - f"Baseline inference time: {t_baseline_inf:.3f} sec, predictor init time: {t_profile:.3f} sec; " - f"Predictor init time is {t_profile / t_baseline_inf:.3f} times of inference time" - ) - configs = generate_random_configs(nas.net_approxes, 30) - pbar = tqdm(configs) - times = [] - for config in pbar: - pbar.set_postfix(mem=gpu_mem_mb()) - approx = nas.apply_approx_by_config(config).module - t_inf = time_action(lambda: nn_to_output(approx, testloader)) - t_f1 = time_action(lambda: f1.estimate(config)) - t_f2 = time_action(lambda: f2.estimate(config)) - pbar.write( - f"Inference time: {t_inf:.3f} sec, predictors time: {t_f1:.3f} | {t_f2:.3f} sec" - ) - times.append([t_inf, t_f1, t_f2]) - gc.collect() - times = numpy.array(times) - s_inf, s0, s1 = numpy.apply_along_axis(mean_std_str, 0, times) - print(f"Result: inference time {s_inf}, predictor time: {s0} | {s1}") - print("Timing raw data:", times) - - -def main(): - for network in ( - 'alexnet_hpvm', 'alexnet2_hpvm', - 'vgg16_cifar10_hpvm', 'vgg16_cifar100_hpvm', - 'mobilenet_hpvm', - 'resnet18_hpvm', - 'lenet_hpvm', - 'vgg16_imagenet_hpvm', - 'alexnet_imagenet_hpvm', - # 'resnet50_imagenet_hpvm', - ): - bench: Benchmark = bench_tuner_data[network] - print(f"{network}: ") - baseline, testloader, _, shapes = init_by_name(network) - baseline_dag = ModuleIndexer(baseline) - main_loop(bench, baseline_dag, testloader) - gc.collect() - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/bin/discrepancy.py b/hpvm/projects/pred_tuner/bin/discrepancy.py 
deleted file mode 100644 index 8be92df66ae3a2bcb2d33088bb20064404d37913..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/discrepancy.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from pathlib import Path -from typing import Optional - -import matplotlib.pyplot as plt -import seaborn -import torch -from tqdm import tqdm - -from toolkit import ModuleIndexer, NetApproxSelector, StateCapturer -from utils import device, init_by_name - - -def run_concat_output_at(net_index: ModuleIndexer, testloader, layer: int) -> Optional[torch.Tensor]: - snet = StateCapturer(net_index, lambda i, x: x.clone().detach() if i == layer else None) - for inputs, targets in testloader: - inputs, targets = inputs.to(device), targets.to(device) - snet(inputs) - outputs = snet.net_state[layer] - return torch.cat(outputs) if outputs else None - - -def get_discrepancy_for(baseline, approxed, testloader, changed_layer): - baseline_output = run_concat_output_at(baseline, testloader, changed_layer) - approxed_output = run_concat_output_at(approxed, testloader, changed_layer) - assert baseline_output.shape == approxed_output.shape - tqdm.write(f"{baseline_output.size()}") - diff = baseline_output - approxed_output - diff_rel = torch.abs(diff / baseline_output).cpu() - diff_rel[torch.isnan(diff_rel)] = 0 - diff_rel[diff_rel > 10] = 10 - return diff_rel - - -def main(): - prefix = Path('results/discrepancy/resnet50_imagenet_hpvm') - os.makedirs(prefix, exist_ok=True) - baseline, testloader, _, shapes = init_by_name('resnet50_imagenet_hpvm') - net_index = ModuleIndexer(baseline) - nas = NetApproxSelector(net_index) - total = sum(len(ns) for ns in nas.net_approxes.values()) - for layer, approx, approxed_net_dag in tqdm(nas.apply_indep_approx(), total=total): - if approx == 11: - continue - diff_rel = get_discrepancy_for(net_index, approxed_net_dag, testloader, layer) - fig, ax = plt.subplots() - seaborn.heatmap(diff_rel.mean(0).mean(0).numpy(), ax=ax) - fig.savefig((prefix / 
f'{layer}_{approx}.png').open('wb'), dpi=200) - plt.close(fig) - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/bin/filter_configs.py b/hpvm/projects/pred_tuner/bin/filter_configs.py deleted file mode 100644 index bf23668b81ff0bdf071d27d9e010932ab07e6eea..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/filter_configs.py +++ /dev/null @@ -1,54 +0,0 @@ -from typing import List, Tuple - -from exp import Benchmark, ExpState, bench_tuner_data -from utils.config import Config - - -def filter_configs( - validation: List[Config], test: List[Config], - vali_threshold: float, test_threshold: float = 3.0 -) -> Tuple[List[Config], List[Config]]: - # Filter validation and test set by their respective thresholds - filtered_validation = [ - c for c in validation if c.avg_loss <= vali_threshold - ] - filtered_test = [ - c for c in test if c.avg_loss <= test_threshold - ] - # Test configs also need to be a subset of validation configs. - name_to_filtered = {x.fname: x for x in filtered_test} - intersect_names = set(list(name_to_filtered.keys())).intersection( - set((x.fname for x in filtered_validation)) - ) - filtered_test_ = [name_to_filtered[fname] for fname in intersect_names] - assert set([id(x) for x in filtered_test_]).issubset(set([id(x) for x in filtered_test])) - return filtered_validation, filtered_test_ - - -def process_configs(bench: Benchmark, calib_slack: float, states: ExpState): - validated_configs = states.validated_configs.configs - tested_configs = states.tested_configs.configs - old_len = len(validated_configs) - valid_configs, test_configs = filter_configs( - validated_configs, tested_configs, calib_slack - ) - states.valid_configs.finalize_dump(valid_configs) - states.test_configs.finalize_dump(test_configs) - print(f"{bench.model_name}: {old_len} -> {len(validated_configs)}, {len(tested_configs)}") - # Finalize data input and plot everything. 
- states.finalize_plot() - - -def main(): - for bench in bench_tuner_data.values(): - bench: Benchmark - try: - states = ExpState(bench) - except ValueError: - print(f"Model {bench.model_name} has incomplete experiment data; skipping") - continue - process_configs(bench, 2.1, states) - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/bin/inferences.py b/hpvm/projects/pred_tuner/bin/inferences.py deleted file mode 100644 index 065abfd223f0a5c234dd36cc8aca7324415ac96f..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/inferences.py +++ /dev/null @@ -1,9 +0,0 @@ -from tqdm import tqdm - -from models import BaselineInfo, networks -from utils import device - -if __name__ == '__main__': - for net_name in networks: - baseline_info = BaselineInfo.init_by_name(net_name, device) - tqdm.write(f"{net_name}: {baseline_info.val_qos} (validation) {baseline_info.test_qos} (test") diff --git a/hpvm/projects/pred_tuner/bin/mock_autotuner.py b/hpvm/projects/pred_tuner/bin/mock_autotuner.py deleted file mode 100644 index ec12e1643ab319e0120f2e95c7801825f04484bb..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/mock_autotuner.py +++ /dev/null @@ -1,230 +0,0 @@ -import gc -import json -import os -from pathlib import Path -from sys import argv -from typing import Dict, Iterable, Iterator, List, Optional, Tuple - -import matplotlib.pyplot as plt -import numpy as np -from tqdm import tqdm, trange - -from exp import Benchmark, bench_tuner_data -from toolkit import ConfigT, LinearCombEstimator, LinearEstimator, \ - LinearQoSEstimator, ModuleIndexer, NetApproxSelector, WeightedLinearCombEstimator -from toolkit.estimators import WeightedLinearQoSEstimator -from utils import config_pylogger, gpu_mem_mb, init_by_name, nn_to_accuracy, nn_to_output, qos_stats, tensor_to_accuracy - -msg_logger = config_pylogger(output_dir=Path('tuner_results/logs'), verbose=True) - - -class Evaluator: - def __init__( - self, nas: 
NetApproxSelector, n_approx_layers: Optional[int], - n_configs: int, testloader, threshold: Optional[float] - ): - self.nas = nas - self.layer_approxes = nas.net_approxes - self.n_approx_layers = n_approx_layers - self.n_configs = n_configs - self.testloader = testloader - self.threshold = threshold - self.config_accs = None - - def generate_random_configs(self) -> Iterator[ConfigT]: - from numpy.random import choice - from random import randrange - all_layers = [k for k, ns in self.layer_approxes.items() if ns] - for _ in range(self.n_configs): - config = {} - if self.n_approx_layers is None: - n_approx_layers_ = randrange(len(all_layers) + 1) - else: - n_approx_layers_ = min(self.n_approx_layers, len(all_layers)) - approx_layers = choice(all_layers, n_approx_layers_, replace=False) - for layer_idx in approx_layers: - config[layer_idx] = choice(self.layer_approxes[layer_idx], 1)[0] - yield config - - def evaluate_config(self, config: ConfigT) -> Tuple[float, float]: - deterministic = self.nas.is_deterministic(config) - n_runs = 1 if deterministic else 30 - approxed = self.nas.apply_approx_by_config(config).module - accs = [] - for _ in trange(n_runs, leave=None): - acc = nn_to_accuracy(approxed, self.testloader) - accs.append(acc) - mean, confident_acc, _ = qos_stats(accs, 0.95) - return mean, confident_acc - - def sort_configs_by_mean_acc(self): - sorted_ = sorted(self.config_accs, key=lambda p: p[1], reverse=True) - from itertools import takewhile - if self.threshold is not None: - sorted_ = list(takewhile(lambda p: p[1] > self.threshold, sorted_)) - self.config_accs = np.array(sorted_) - - @staticmethod - def calculate_perm_dist(pred_order): - n = len(pred_order) - actual_order = np.arange(n) - return np.linalg.norm(actual_order - pred_order, ord=1) / ((n ** 2 - 1) / 3) - - def use_predictors(self, predictors: Iterable[LinearEstimator]) -> \ - Optional[List[Tuple[np.ndarray, np.ndarray]]]: - self.sort_configs_by_mean_acc() - if len(self.config_accs) == 0: - 
return None - configs = self.config_accs[:, 0] - raw_prediction = [] - for predictor in predictors: - # N * 2 array: avg acc, 95% confidence acc - pred_accs = np.array([ - predictor.estimate(config) for config in configs - ]) - pred_order = (-pred_accs[:, 0]).argsort(kind='stable') - raw_prediction.append((pred_accs, pred_order)) - return raw_prediction - - def run_configs(self): - configs = self.generate_random_configs() - pbar = tqdm(configs) - config_accs = [] - for config in pbar: - pbar.set_postfix(mem=gpu_mem_mb()) - mean_acc, confident_acc = self.evaluate_config(config) - config_accs.append([config, mean_acc, confident_acc]) - gc.collect() - self.config_accs = np.array(config_accs) - - -class NumpyEncoder(json.JSONEncoder): - def default(self, obj): - if isinstance(obj, np.ndarray): - return obj.tolist() - return json.JSONEncoder.default(self, obj) - - -class DataPlotStorage: - def __init__(self, save_to_prefix: Path): - self.save_to = save_to_prefix - os.makedirs(self.save_to.parent, exist_ok=True) - self.args = [] - self.fig, self.axes = plt.subplots() - - def plot(self, *args, **kwargs): - self.args.append({'args': args, 'kwargs': kwargs}) - self.axes.plot(*args, **kwargs) - - def errorbar(self, *args, **kwargs): - self.args.append({'args': args, 'kwargs': kwargs}) - self.axes.errorbar(*args, **kwargs) - - def save_and_close(self): - self.fig.savefig(self.save_to.with_suffix('.png'), dpi=200) - with self.save_to.with_suffix('.json').open('w') as f: - json.dump(self.args, f, cls=NumpyEncoder) - plt.close(self.fig) - - -def compare_estimators( - eva: Evaluator, predictors: Dict[str, LinearEstimator], n_runs: int, st: DataPlotStorage -): - all_dists = [] - for _ in trange(n_runs): - eva.run_configs() - raw_predictions = eva.use_predictors(predictors.values()) - dists = [eva.calculate_perm_dist(order) for _, order in raw_predictions] - all_dists.append(dists) - dists_t = zip(*all_dists) - for vs, label in zip(dists_t, predictors.keys()): - st.plot(sorted(vs), 
label=label) - st.axes.set_ylim(bottom=0) - st.fig.legend() - st.save_and_close() - - -def plot_acc_estm_discrepancy( - eva: Evaluator, predictors: Dict[str, LinearEstimator], st: DataPlotStorage -): - eva.run_configs() - raw_predictions = eva.use_predictors(predictors.values()) - if not raw_predictions: - return - measured_mean_accs = eva.config_accs[:, 1] - yerr = measured_mean_accs - eva.config_accs[:, 2] - st.errorbar( - measured_mean_accs, measured_mean_accs, fmt='.', yerr=yerr, uplims=True, label='baseline' - ) - for (pred_accs, _), label in zip(raw_predictions, predictors.keys()): - pred_accs = pred_accs - yerr = pred_accs[:, 0] - pred_accs[:, 1] - st.errorbar( - measured_mean_accs, pred_accs[:, 0], - fmt='.', yerr=yerr, uplims=True, label=label - ) - min_x, max_x = np.min(measured_mean_accs), np.max(measured_mean_accs) - diag_x = np.linspace(min_x, max_x, 500) - st.errorbar(diag_x, diag_x, linewidth=1) - st.axes.set_xlabel('Measured accuracy (%)') - st.axes.set_ylabel('Predicted accuracy (%)') - st.fig.legend() - st.save_and_close() - - -def train_predictors(eva: Evaluator, *predictors: LinearEstimator): - for conf in eva.generate_random_configs(): - for p in predictors: - p.estimate(conf) - - -def main(): - base_path = Path(argv[1]) if len(argv) > 1 else Path('results/mock_autotuner') - - for network in ( - 'alexnet2_hpvm', 'vgg16_cifar10_hpvm', 'vgg16_cifar100_hpvm', - 'mobilenet_hpvm', - 'resnet18_hpvm', - 'vgg16_imagenet_hpvm', 'resnet50_imagenet_hpvm' - ): - bench: Benchmark = bench_tuner_data[network] - print(f"{bench.model_name}: ") - baseline, testloader, _, shapes = init_by_name(bench.model_name) - baseline_dag = ModuleIndexer(baseline) - baseline_acc = nn_to_accuracy(baseline_dag.module, testloader) - nas = NetApproxSelector(baseline_dag) - - def acc_crit(inputs_): - return tensor_to_accuracy(inputs_, testloader) - - def threshold_eval(inputs_): - accs = np.array([acc_crit(x) for x in inputs_]) - return baseline_acc - accs.mean() < 3.0 - - def 
run_model(net): - return nn_to_output(net, testloader) - - f1 = LinearCombEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False) - f2 = LinearQoSEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False) - f3 = WeightedLinearCombEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False) - f4 = WeightedLinearQoSEstimator(nas, run_model, acc_crit, threshold_eval, 0.95, False) - LinearEstimator.coinit_estimators( - nas, run_model, threshold_eval, f1, f2, f3, f4, - storage=Path('model_params/pickles') / Path(bench.base_dir).name / 'proxy_dev.pkl' - ) - train_predictors(Evaluator(nas, None, 700, testloader, baseline_acc), f3, f4) - st = DataPlotStorage(base_path / "cmp_acc_diff" / f"{bench.model_name}") - plot_acc_estm_discrepancy( - Evaluator(nas, None, 200, testloader, baseline_acc - 10), - {'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4}, st - ) - st = DataPlotStorage(base_path / 'cmp_ordering' / f"{bench.model_name}" / "n_none") - compare_estimators( - Evaluator(nas, None, 20, testloader, None), - {'f1': f1, 'f2': f2, 'f3': f3, 'f4': f4}, 10, st - ) - gc.collect() - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/bin/print_approxes.py b/hpvm/projects/pred_tuner/bin/print_approxes.py deleted file mode 100644 index c95d080326ad2e806d772454c15bed68c573ca17..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/print_approxes.py +++ /dev/null @@ -1,35 +0,0 @@ -from collections import defaultdict - -import matplotlib.pyplot as plt -import pandas as pd -import seaborn -from tqdm import tqdm - -from models.domains import Accuracy -from models import BaselineInfo -from toolkit import NetApproxSelector -from utils import device - - -def main(): - baseline_info = BaselineInfo.init_by_name('mobilenet_hpvm', device) - nas = NetApproxSelector(baseline_info.baseline_net, dev_time_only=True, ignore_fp32=False) - table = defaultdict(dict) - pbar = tqdm(nas.list_single_approxes()) - for layer, approx, _ in pbar: - 
pbar.set_postfix(k=layer, i=approx) - approxed_net = nas.apply_approx_by_config({layer: approx}).module - acc: Accuracy = baseline_info.get_qos(approxed_net, baseline_info.val_loader) - table[layer][approx] = acc.to_scalar() - df = pd.DataFrame( - [pd.Series(list(d.values()), index=d.keys()) for d in table.values()], - index=list(table.keys()) - ) - with open('accuracy.json', 'w') as f: - df.to_json(f) - seaborn.heatmap(df.to_numpy()) - plt.savefig('accuracy.png', dpi=200) - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/bin/progress_graph.py b/hpvm/projects/pred_tuner/bin/progress_graph.py deleted file mode 100644 index 0d7d0d5526f708e8049e3f185ebceebe68f4b778..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/progress_graph.py +++ /dev/null @@ -1,61 +0,0 @@ -from itertools import groupby -from operator import itemgetter -from pathlib import Path -from typing import Tuple - -import matplotlib.pyplot as plt - -from exp import Benchmark, ExpState, batch_id, bench_tuner_data -from utils import Config - - -def finalize_figs(filename, ax, fig): - ax.legend() - ax.set_ylim(bottom=1.0) - fig.savefig(filename, dpi=200) - plt.close(fig) - - -def process_configs(bench: Benchmark, states: ExpState, shared_ax): - def get_features(c: Config) -> Tuple[int, int, float]: - *_, run_s, iter_s = c.fname.split('_') - return int(run_s), int(iter_s), c.speedup - - def get_max_speedup(group): - group = sorted(list(group), key=itemgetter(1)) - iter_max_speedup = [] - max_speedup = 0 - for _, i, speedup in group: - max_speedup = max(max_speedup, speedup) - iter_max_speedup.append((i, max_speedup)) - return iter_max_speedup - - run_iter_speedup = sorted( - [get_features(c) for c in states.all_configs.configs], key=itemgetter(0) - ) - run_groups = groupby(run_iter_speedup, key=itemgetter(0)) - fig, ax = plt.subplots() - for run, run_group in run_groups: - iter_max_speedup = get_max_speedup(run_group) - iters, max_speedups = 
zip(*iter_max_speedup) - ax.plot(iters, max_speedups, label=f"loss={run + 1}%") - if run + 1 == 3: - shared_ax.plot(iters, max_speedups, label=f"{bench.model_name.replace('_hpvm', '')}") - finalize_figs(bench.result_dir / f"tuner_progress.png", ax, fig) - - -def main(): - fig, ax = plt.subplots() - for bench in bench_tuner_data.values(): - bench: Benchmark - try: - states = ExpState(bench) - except ValueError: - print(f"Model {bench.model_name} has incomplete experiment data; skipping") - continue - process_configs(bench, states, ax) - finalize_figs(Path("results") / f"{batch_id}_tuner_progress.png", ax, fig) - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/bin/train_model.py b/hpvm/projects/pred_tuner/bin/train_model.py deleted file mode 100644 index d3d0d80725f5784c42ec8f6a26b65ff183df1649..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/bin/train_model.py +++ /dev/null @@ -1,186 +0,0 @@ -"""Train CIFAR10 with PyTorch.""" -import argparse -import os -from typing import List - -import numpy as np -import torch -from torch import optim -from torch.nn import CrossEntropyLoss, Module -from torch.optim.lr_scheduler import ReduceLROnPlateau -from tqdm import tqdm - -from models.torch import ResNet18 -from models.datasets import get_cifar10_train_dataloader, get_cifar10_test_dataloader -from utils import device - - -class RunningStats: - def __init__(self, criterion): - self.criterion = criterion - self.all_outputs = None - self.all_targets = np.zeros([0]) - self.avg_loss, self.correct, self.total = 0, 0, 0 - self.conf_mat = None - self.n_batches = 0 - - @property - def n_classes(self): - if self.all_outputs is None: - raise RuntimeError("Num of classes is unknown before seeing first input") - return self.all_outputs.shape[1] - - def setup_for_first_output(self, outputs): - n_classes = outputs.shape[1] - self.all_outputs = np.zeros([0, n_classes]) - self.conf_mat = np.zeros([n_classes, n_classes]) - - def 
add_output(self, outputs, targets): - if self.all_outputs is None: - self.setup_for_first_output(outputs) - loss = self.criterion(outputs, targets) - _, predicted = outputs.max(1) - self.avg_loss = (self.avg_loss * self.n_batches + loss.item()) / (self.n_batches + 1) - self.total += targets.size(0) - self.correct += predicted.eq(targets).sum().item() - for t, p in zip(targets, predicted): - self.conf_mat[int(t), p] += 1 - self.n_batches += 1 - outputs = outputs.clone().cpu().detach() - targets = targets.clone().cpu().detach() - self.all_outputs = np.vstack([self.all_outputs, outputs]) - self.all_targets = np.hstack([self.all_targets, targets]) - return loss - - def classwise_outputs(self) -> List[np.ndarray]: - class_outputs = [np.zeros([0, self.n_classes]) for _ in range(self.n_classes)] - for output, label_class in zip(self.all_outputs, self.all_targets): - co = class_outputs[int(label_class)] - class_outputs[int(label_class)] = np.vstack([co, output]) - return class_outputs - - @property - def acc(self): - return 100. 
* self.correct / self.total - - @property - def classwise_acc(self) -> List[float]: - return [self.conf_mat[i, i] / self.conf_mat[i].sum() for i in range(self.n_classes)] - - -def test(net, testloader, criterion): - net.eval() - rs = RunningStats(criterion) - with torch.no_grad(): - pbar = tqdm(enumerate(testloader), total=len(testloader)) - for batch_idx, (inputs, targets) in pbar: - inputs, targets = inputs.to(device), targets.to(device) - outputs = net(inputs) - rs.add_output(outputs, targets) - pbar.set_postfix_str( - f"Loss: {rs.avg_loss:.3f} | Acc: {rs.acc:.3f}% ({rs.correct}/{rs.total})" - ) - return rs - - -def load_torch_checkpoint(net: Module, chpt_path: str): - print('==> Loading checkpoint..') - checkpoint = torch.load(chpt_path) - net.load_state_dict(checkpoint['net']) - start_epoch = checkpoint['epoch'] - return start_epoch - - -def get_optimizer(net, lr): - return optim.SGD(net.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4) - - -class EarlyStopping: - """Early stops the training if validation loss doesn't improve after a given patience.""" - - def __init__(self, path, patience=7, delta=0): - """ - Args: - patience (int): How long to wait after last time validation loss improved. - Default: 7 - delta (float): Minimum change in the monitored quantity to qualify as an improvement. - Default: 0 - path (str): Path for the checkpoint to be saved to. 
- Default: 'checkpoint.pt' - """ - self.patience = patience - self.counter = 0 - self.min_loss = None - self.delta = delta - self.path = path - - def __call__(self, val_loss, model, epoch): - if self.min_loss is None or val_loss < self.min_loss - self.delta: - # Improved - self.min_loss = val_loss - self.save_checkpoint(model, epoch) - self.counter = 0 - else: - self.counter += 1 - if self.counter >= self.patience: - return True - return False - - def save_checkpoint(self, model, epoch): - tqdm.write('Saving..') - state = { - 'net': model.state_dict(), - 'epoch': epoch, - } - if not os.path.isdir(os.path.dirname(self.path)): - os.makedirs(os.path.dirname(self.path)) - torch.save(state, self.path) - - -def train_one_epoch(net, trainloader, optimizer, criterion): - net.train() - rs = RunningStats(criterion) - pbar = tqdm(trainloader) - for inputs, targets in pbar: - optimizer.zero_grad() - inputs, targets = inputs.to(device), targets.to(device) - outputs = net(inputs) - loss = rs.add_output(outputs, targets) - loss.backward() - optimizer.step() - pbar.set_postfix_str( - f"Loss: {rs.avg_loss:.3f} | Acc: {rs.acc:.3f}% ({rs.correct}/{rs.total})" - ) - - -def train(net, checkpoint, output, lr): - start_epoch = load_torch_checkpoint(net, checkpoint) if checkpoint else 0 - trainloader = get_cifar10_train_dataloader('./data', 128) - testloader = get_cifar10_test_dataloader('./data', 100) - criterion = CrossEntropyLoss() - optimizer = get_optimizer(net, lr) - es = EarlyStopping(output, patience=5) - reduce_lr = ReduceLROnPlateau(optimizer, factor=0.2, patience=3, verbose=True) - for epoch in range(start_epoch + 1, start_epoch + 200): - print('\nEpoch: %d' % epoch) - train_one_epoch(net, trainloader, optimizer, criterion) - rs = test(net, testloader, criterion) - if es(rs.avg_loss, net, epoch): - print(f"Early stopped at {epoch}") - break - reduce_lr.step(rs.avg_loss) - - -def main(): - parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') - 
parser.add_argument('--lr', default=0.1, type=float, help='learning rate') - parser.add_argument('--resume', '-r', type=str, help='resume from checkpoint') - parser.add_argument( - '--output', '-o', type=str, required=True, help='path to save checkpoint to' - ) - args = parser.parse_args() - train(ResNet18().to(device), args.resume, args.output, args.lr) - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/exp.py b/hpvm/projects/pred_tuner/exp.py deleted file mode 100644 index e7457d5b475d53f7a6c05fcea28f8b1cc4507c93..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/exp.py +++ /dev/null @@ -1,438 +0,0 @@ -import abc -import json -import os -from pathlib import Path -from typing import Dict, Iterable, List, Optional, Tuple, Type - -from torch.nn import Linear, Module -from torch.utils.data import DataLoader - -from models.domains import QoS, qos_stats -from models.hpvm import HPVMConvBundle -from models import BaselineInfo -from toolkit import LinearEstimator, NetApproxSelector -from utils import config_pylogger, get_knob_config_file, get_tensorrt_dir, device -from utils.config import Config, dump_rt_format_to, load_configs_from_dir, plot_configs - -batch_id = "batch405" -is_dev_time = False -ConfigT = Dict[int, int] -msg_logger = config_pylogger(output_dir=Path('tuner_results/logs'), verbose=True) - - -def get_layer_desc(path: Path) -> List[List[str]]: - with path.open() as f: - return [x.split() for x in f] - - -def get_layer_desc_in_pytorch(layer_desc: List[List[str]]) -> \ - Tuple[List[Optional[Module]], Dict[int, int]]: - desc = [] - remapping = {} - for ext_i, vals in enumerate(layer_desc): - if vals and 'conv' == vals[0]: - remapping[ext_i] = len(remapping) - desc.append(HPVMConvBundle) - elif vals and 'dense' == vals[0]: - remapping[ext_i] = len(remapping) - desc.append(Linear) - else: - desc.append(None) - return desc, remapping - - -def read_cost_file(layer_desc: List[List[str]], path: Path) -> List[float]: 
- with path.open() as f: - raw_costs = [float(x.strip()) for x in f] - costs = [] - raw_cost_it = 0 - for layer in layer_desc: - if 'conv' in layer or 'dense' in layer: - costs.append(raw_costs[raw_cost_it]) - raw_cost_it += 1 - else: - costs.append(0) - assert len(layer_desc) == len(costs) - return costs - - -def read_global_knobs_speedup(path: Path): - knobs_speedup = {} - with path.open() as f: - for x in f: - toks = x.split("\t") - ID = int(toks[0].split(",")[1]) - speedup = float(toks[2]) - knobs_speedup[ID] = speedup - return knobs_speedup - - -class Benchmark: - def __init__(self, json_data: dict): - self.json_data = json_data - self.model_name: str = self.model_name # RHS from json data - # Use baseline configuration as seed to aid the autotuner - # TODO: put this as a field in benchmarks.json - self.use_seed = self.model_name == 'resnet50_imagenet_hpvm' - tensorrt = get_tensorrt_dir() - self.cost_file = tensorrt / self.cost_file - self.layer_file = tensorrt / self.layer_file - self.knobs_config_file = tensorrt / "autotuner/data/global_knobs.txt" - self.batch_dir = tensorrt / self.base_dir / "loss_123" / batch_id - self.result_dir = self.batch_dir / ("dev_tuner" if is_dev_time else "inst_tuner") - - self.layer_desc = get_layer_desc(self.layer_file) - self.pytorch_layer_desc, self.layer_remap = get_layer_desc_in_pytorch(self.layer_desc) - msg_logger.debug(f"HPVM order to neutral order remapping, model {self.model_name}: {self.layer_remap}") - self.layer_costs = read_cost_file(self.layer_desc, self.cost_file) - self.knobs_speedup = read_global_knobs_speedup(get_knob_config_file()) - - def set_batch_id(self, batch_id_: str = batch_id, is_dev_time_: bool = is_dev_time): - tensorrt = get_tensorrt_dir() - self.batch_dir = tensorrt / self.base_dir / "loss_123" / batch_id_ - self.result_dir = self.batch_dir / ("dev_tuner" if is_dev_time_ else "inst_tuner") - - def __getattr__(self, item: str): - return self.json_data[item] - - def translate_config(self, autotuner: 
ConfigT) -> ConfigT: - ret = {} - for x, v in autotuner.items(): - if x not in self.layer_remap: - assert v == 11 - continue - ret[self.layer_remap[x]] = v - return ret - - def get_baseline_config(self, is_fp16: bool) -> ConfigT: - conf = {} - for layer_id, layer in enumerate(self.pytorch_layer_desc): - knob = 12 if layer is not None and is_fp16 else 11 - conf[layer_id] = knob - return conf - - def pattern_match_layer_knobs(self, module_to_knobs: Dict[Module, List[int]]) -> Dict[int, List[int]]: - conv_knobs = [knobs for m, knobs in module_to_knobs.items() if isinstance(m, HPVMConvBundle)] - linear_knobs = [knobs for m, knobs in module_to_knobs.items() if isinstance(m, Linear)] - assert len(conv_knobs) + len(linear_knobs) == len(module_to_knobs) - conv_knobs_idx, linear_knobs_idx = 0, 0 - ret = {} - for layer_id, module_ty in enumerate(self.pytorch_layer_desc): - if module_ty is HPVMConvBundle: - # PROMISE does not apply to first layer of LeNet. - if self.model_name == "lenet_hpvm" and layer_id == 0: - this_conv_knobs = [x for x in conv_knobs[conv_knobs_idx] if x >= 11] - else: - this_conv_knobs = conv_knobs[conv_knobs_idx] - ret[layer_id] = this_conv_knobs + [11] - conv_knobs_idx += 1 - elif module_ty is Linear: - ret[layer_id] = linear_knobs[linear_knobs_idx] + [11] - linear_knobs_idx += 1 - else: - ret[layer_id] = [11] - assert conv_knobs_idx == len(conv_knobs) - return ret - - def compute_config_cost(self, cfg: ConfigT) -> Tuple[float, float]: - orig_cost = 0.0 - total_cost = 0.0 - for layer, knob in cfg.items(): - op_cost = self.layer_costs[layer] - speedup = self.knobs_speedup[knob] - total_cost += (op_cost * 1.0 / speedup * 1.0) - orig_cost += op_cost - speedup = (orig_cost * 1.0) / (total_cost * 1.0) - return total_cost, speedup - - def get_n_layers(self) -> int: - return len(self.layer_desc) - - -class ConfigMeasurer(BaselineInfo): - def __init__( - self, net: Module, val_loader: DataLoader, test_loader: DataLoader, - non_tensor_output: bool, qos_class: 
Type[QoS], - nas: NetApproxSelector, bench: Benchmark - ): - super().__init__(net, val_loader, test_loader, non_tensor_output, qos_class) - self.nas = nas - self.bench_translate_config = bench.translate_config - self.layer_remap = {k: v for k, v in enumerate(list(self.nas.net_approxes.keys()))} - msg_logger.debug(f"Neutral order to module scanning order remapping: {self.layer_remap}") - self.bench = bench - msg_logger.info( - f"Model {bench.model_name} baseline accuracy = " - f"{self.val_qos} ({self.test_qos} test)" - ) - - def translate_config(self, autotuner_cfg: ConfigT): - autotuner_cfg = self.bench_translate_config(autotuner_cfg) - # Translate layer index from autotuner format (0, 1, 2...) - # to proxy format (actual layer index) - cfg = {self.layer_remap[k]: v for k, v in autotuner_cfg.items() if v != 11} - return cfg - - @classmethod - def init_from_bench(cls, bench: Benchmark) -> 'ConfigMeasurer': - bi = BaselineInfo.init_by_name(bench.model_name, device) - nas = NetApproxSelector(bi.baseline_net, dev_time_only=is_dev_time, ignore_fp32=not is_dev_time) - return cls( - bi.baseline_net, bi.val_loader, bi.test_loader, - bi.non_tensor_output, bi.qos_class, nas, bench - ) - - def proxy_estimate(self, cfg: ConfigT, proxy: LinearEstimator) -> Tuple[QoS, QoS]: - cfg = self.translate_config(cfg) - mean_acc, confident_acc = proxy.estimate(cfg) - return mean_acc, confident_acc - - def actual_measure( - self, cfg: ConfigT, n_runs: int, is_test_set: bool, threshold: QoS = None - ) -> Tuple[QoS, Optional[float]]: - cfg = self.translate_config(cfg) - approx = self.nas.apply_approx_by_config(cfg).module - dataloader = self.test_loader if is_test_set else self.val_loader - from tqdm import trange - qoses = [] - for _ in trange(n_runs, leave=None): - qoses.append(self.get_qos(approx, dataloader)) - mean, _, confidence = qos_stats(qoses, threshold=threshold) - return mean, confidence - - def get_knobs(self): - # Delaying computing knobs because nas can be modified externally 
(knobs filtered) - ext_layer_to_knobs = self.bench.pattern_match_layer_knobs(self.nas.get_layer_approxes()) - msg_logger.debug(f"Getting knobs:") - for layer, knobs in ext_layer_to_knobs.items(): - msg_logger.debug(f" {layer}: {knobs}") - return ext_layer_to_knobs - - -class PersistentState(abc.ABC): - def __init__(self): - self._substates: Dict[str, PersistentState] = {} - - def __setattr__(self, name, value): - if isinstance(value, PersistentState): - self._substates[name] = value - super().__setattr__(name, value) - - def dump(self): - self._dump_self() - for v in self._substates.values(): - v.dump() - - def load(self): - if self.filled(): - return - try: - self._load_self() - except (ValueError, RuntimeError, FileNotFoundError) as e: - msg_logger.info(f"Exception {e} when loading state") - for k, v in self._substates.items(): - v.load() - - def filled(self): - return self._self_is_initialized() and all((v.filled() for v in self._substates.values())) - - @abc.abstractmethod - def _dump_self(self): - pass - - @abc.abstractmethod - def _load_self(self): - pass - - @abc.abstractmethod - def _self_is_initialized(self) -> bool: - pass - - -class PersistentConfigs(PersistentState): - def __init__(self, bench: Benchmark, prefix: str, baseline_acc: QoS, rt_cpu: bool, rt_gpu: bool): - super().__init__() - self._data = [] - self._filled = False - self.bench = bench - self.prefix = prefix - self.baseline_qos = baseline_acc - self.rt_cpu_path = self.bench.result_dir / f"{prefix}_cpu.txt" if rt_cpu else None - self.rt_gpu_path = self.bench.result_dir / f"{prefix}_fp16.txt" if rt_gpu else None - - @property - def config_folder(self) -> Path: - return self.bench.result_dir / self.prefix - - @property - def configs(self) -> List[Config]: - return self._data - - def _load_self(self): - # Try reading autotuner configs and hpvm-rt configs - self._data = load_configs_from_dir(self.config_folder, self.baseline_qos) - # If hpvm-rt is not present, dump it. 
- # TODO: check rt format integrity - if ( - (self.rt_cpu_path and not self.rt_cpu_path.is_file()) or - (self.rt_cpu_path and not self.rt_cpu_path.is_file()) - ): - self.finalize_dump() - self._filled = True - - def _dump_self(self): - for conf in self._data: - self._dump_one(conf) - self.finalize_dump() - - def _self_is_initialized(self) -> bool: - return self._filled - - def _dump_one(self, config: Config): - if not self.config_folder.is_dir(): - os.mkdir(self.config_folder.as_posix()) - config_path = self.config_folder / config.fname - with config_path.open('w') as f: - f.write(config.to_tuner_format()) - - def append(self, config: Config): - self._data.append(config) - self._dump_one(config) - - def extend(self, configs: Iterable[Config]): - confs = [] - for conf in configs: - self._dump_one(conf) - confs.append(conf) - self._data.extend(confs) - - def finalize_dump(self, with_configs: Iterable[Config] = None): - if with_configs is not None: - self.extend(with_configs) - self._filled = True - dump_rt_format_to( - self.bench.layer_desc, self._data, self.baseline_qos, - self.rt_cpu_path, self.rt_gpu_path - ) - - -class TuningTime(PersistentState): - def __init__(self, path: Path): - super().__init__() - self.timers = {} - self.path = path - - def _load_self(self): - import re - with self.path.open() as f: - lines = f.readlines() - for line in lines: - line = line.strip() - if not line: - continue - match = re.match(r'Timer ([^=]+) = ([0-9.]+) hours', line) - if not match: - raise RuntimeError(f"File {self.path} malformed") - self.timers[match.group(1)] = float(match.group(2)) - - def _dump_self(self): - for k, v in self.timers.items(): - self._dump_one(k, v) - - def _self_is_initialized(self) -> bool: - return bool(self.timers) - - def _dump_one(self, key: str, value: float): - time_hrs = value / (60 * 60) - msg_logger.info(f"Timer {key} = {time_hrs:.3f} hours") - with self.path.open('a') as f: - f.write(f"Timer {key} = {time_hrs} hours\n") - - def 
add_timer(self, key: str, value: float): - self.timers[key] = value - self._dump_one(key, value) - - -class AccPair(PersistentState): - def __init__(self, path: Path, qos_class: Type[QoS]): - super().__init__() - self.path = path - self.qos_class = qos_class - self._data = None - - @property - def accs(self) -> Tuple[QoS, QoS]: - if self._data is None: - raise AttributeError("Accuracy not init'ed yet") - return self._data - - @accs.setter - def accs(self, value: Tuple[QoS, QoS]): - self._data = value - self._dump_self() - - def _load_self(self): - with self.path.open() as f: - acc_val, acc_test = [self.qos_class.parse(s) for s in f.read().split('\n')] - self._data = acc_val, acc_test - - def _dump_self(self): - with self.path.open('w') as f: - f.write(f"{self._data[0]}\n{self._data[1]}") - - def _self_is_initialized(self) -> bool: - return self._data is not None - - -class ExpState(PersistentState): - def __init__(self, bench: Benchmark, qos_class: Type[QoS], accs: Tuple[QoS, QoS] = None): - super().__init__() - self.bench = bench - self.baseline_accs = AccPair(bench.result_dir / 'baseline_acc.txt', qos_class) - self.baseline_accs.load() - if not self.baseline_accs.filled(): - if accs is None: - raise ValueError("Provide model baseline accuracy") - self.baseline_accs.accs = accs - acc_val, acc_test = self.baseline_accs.accs - self.all_configs = PersistentConfigs(bench, 'all', acc_val, False, False) - self.filtered_configs = PersistentConfigs(bench, 'filtered', acc_val, False, False) - self.validated_configs = PersistentConfigs(bench, 'validated', acc_val, False, False) - self.tested_configs = PersistentConfigs(bench, 'tested', acc_test, False, False) - self.valid_configs = PersistentConfigs(bench, 'valid', acc_val, True, True) - self.test_configs = PersistentConfigs(bench, 'test', acc_test, True, True) - self.timers = TuningTime(bench.result_dir / 'tuning_time.txt') - super().load() - - def _load_self(self): - pass - - def _dump_self(self): - pass - - def 
_self_is_initialized(self) -> bool: - return True - - def finalize_plot(self): - if not self.filled(): - raise RuntimeError("Cannot finalize before data slots are all filled") - plot_configs( - self.bench.result_dir / "all_plot.png", - all=self.all_configs.configs - ) - plot_configs( - self.bench.result_dir / "validated_tested_plot.png", - filtered=self.filtered_configs.configs, - validated=self.validated_configs.configs, - tested=self.tested_configs.configs - ) - plot_configs( - self.bench.result_dir / "filtered_plot.png", - valid=self.valid_configs.configs, - test=self.test_configs.configs - ) - - -with (Path(__file__).parent / 'utils/benchmarks.json').open() as f_: - benchmark_data = json.load(f_) -bench_tuner_data = {k: Benchmark(v) for k, v in benchmark_data.items()} diff --git a/hpvm/projects/pred_tuner/model_params b/hpvm/projects/pred_tuner/model_params deleted file mode 120000 index 90aaa403fdbec5110e1c02431a7df3f31fed0dbf..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/model_params +++ /dev/null @@ -1 +0,0 @@ -../hpvm-tensor-rt/model_params \ No newline at end of file diff --git a/hpvm/projects/pred_tuner/models/__init__.py b/hpvm/projects/pred_tuner/models/__init__.py deleted file mode 100644 index 192f4b5bea17503603ba8f1208a22cea78af2897..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .networks import networks -from .inference import get_all_output, move_to_device_recursively, BaselineInfo -from .domains import QoS diff --git a/hpvm/projects/pred_tuner/models/datasets/__init__.py b/hpvm/projects/pred_tuner/models/datasets/__init__.py deleted file mode 100644 index 1a1e35fcea0e29482abbace082f825aac6c8d608..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/datasets/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .hpvm import CIFAR, CIFARImage, HPVMDataset, ImageNet, MNIST -from .torch import get_cifar10_test_dataset, 
get_cifar10_test_dataloader, get_cifar10_train_dataloader diff --git a/hpvm/projects/pred_tuner/models/datasets/hpvm.py b/hpvm/projects/pred_tuner/models/datasets/hpvm.py deleted file mode 100644 index aa871d89d85493a0c8ad1237ed9e5e8b0b34ac49..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/datasets/hpvm.py +++ /dev/null @@ -1,163 +0,0 @@ -import logging -from pathlib import Path -from typing import Iterator, List, Tuple, TypeVar - -import numpy as np -import torch -from torch.utils.data.dataset import IterableDataset - -from models.hpvm import read_tensor_from_file - -RetT = Tuple[torch.Tensor, torch.Tensor] -T = TypeVar('T', bound='HPVMDataset') -msg_logger = logging.getLogger() - - -class HPVMDataset(IterableDataset): - def __init__(self, inputs: torch.Tensor, outputs: torch.Tensor): - self.inputs, self.outputs = inputs, outputs - - @classmethod - def from_file(cls, *args, **kwargs): - pass - - @property - def sample_input(self): - inputs, outputs = next(iter(self)) - return inputs - - def __len__(self) -> int: - return len(self.inputs) - - def __getitem__(self, idx) -> RetT: - if idx >= len(self): - raise IndexError("Dataset index out of range") - return self.inputs[idx], self.outputs[idx] - - def __iter__(self) -> Iterator[RetT]: - for i in range(len(self)): - yield self[i] - - -class HPVMDNNDataset(HPVMDataset): - @classmethod - def _from_file( - cls, input_file: Path, labels_file: Path, is_uint8_label: bool, - count: int, offset: int, *item_shapes: int - ): - # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC) - channel_size = np.prod(np.array(item_shapes)) - if count != -1: - count *= channel_size - offset *= channel_size - inputs = read_tensor_from_file( - input_file, -1, *item_shapes, count=count, offset=offset, - use_progress_bar=True - ) - label_read_ty = np.int8 if is_uint8_label else np.int32 - labels = read_tensor_from_file( - labels_file, -1, read_ty=label_read_ty, cast_ty=np.long, - count=count, 
offset=offset - ) - if inputs.size(0) != labels.size(0): - raise ValueError("Input and output have different number of data points") - msg_logger.info(f"{inputs.shape[0]} entries loaded from dataset.") - return cls(inputs, labels) - - @classmethod - def from_default_file(cls, prefix: str): - prefix = Path(prefix) - return cls.from_file( - Path(prefix) / 'input.bin', Path(prefix) / 'labels.bin' - ) - - -class MNIST(HPVMDNNDataset): - @classmethod - def from_file( - cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0 - ): - return cls._from_file( - input_file, labels_file, True, count, offset, 1, 28, 28 - ) - - -class CIFAR(HPVMDNNDataset): - @classmethod - def from_file( - cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0 - ): - return cls._from_file( - input_file, labels_file, True, count, offset, 3, 32, 32 - ) - - -class ImageNet(HPVMDNNDataset): - @classmethod - def from_file( - cls, input_file: Path, labels_file: Path, count: int = -1, offset: int = 0 - ): - return cls._from_file( - input_file, labels_file, False, count, offset, 3, 224, 224 - ) - - -class HPVMImageDataset(HPVMDataset): - @classmethod - def _from_file( - cls, input_file: Path, output_file: Path, - count: int, offset: int, input_shape: List[int], output_shape: List[int] - ): - # NOTE: assuming (N, *) ordering of inputs (such as NCHW, NHWC) - channel_size = np.prod(np.array(input_shape)) - if count != -1: - count *= channel_size - offset *= channel_size - inputs = read_tensor_from_file( - input_file, -1, *input_shape, count=count, offset=offset, - use_progress_bar=True - ) - outputs = read_tensor_from_file( - output_file, -1, *output_shape, count=count, offset=offset, - use_progress_bar=True - ) - print(f"(input={inputs.shape[0]}, output={outputs.shape[0]}) entries loaded from dataset.") - return cls(inputs, outputs) - - @classmethod - def from_default_file(cls, prefix: str): - prefix = Path(prefix) - return cls.from_file( - Path(prefix) / 
'input.bin', Path(prefix) / 'canny_input.bin', - Path(prefix) / 'labels.bin', Path(prefix) / 'output.bin' - ) - - -class CIFARImage(HPVMImageDataset): - def __init__( - self, inputs: torch.Tensor, outputs: torch.Tensor, cifar: CIFAR - ): - super().__init__(inputs, outputs) - self.cifar = cifar - - @classmethod - def from_file( - cls, dnn_input_file: Path, image_input_file: Path, - labels_file: Path, output_file: Path, - batch_size: int = 100, count: int = -1, offset: int = 0 - ): - classifier = CIFAR.from_file(dnn_input_file, labels_file) - dataset = HPVMImageDataset._from_file( - image_input_file, output_file, count, offset, - [3, 128, 128], [1, 128, 128] - ) - return cls(dataset.inputs, dataset.outputs, classifier) - - def sample(self: 'CIFARImage', ratio: float) -> 'CIFARImage': - raise NotImplementedError() - - def __getitem__(self, idx): - if idx >= len(self): - raise IndexError("Dataset index out of range") - cifar_in, cifar_out = self.cifar[idx] - return (cifar_in, self.inputs[idx]), (cifar_out, self.outputs[idx]) diff --git a/hpvm/projects/pred_tuner/models/datasets/torch.py b/hpvm/projects/pred_tuner/models/datasets/torch.py deleted file mode 100644 index 1b07bd17c744df733158dc5d84da3f1934e7cd3c..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/datasets/torch.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging - -from torch.utils.data import DataLoader -from torchvision.datasets import CIFAR10 -from torchvision.transforms import transforms - -msg_logger = logging.getLogger() - - -def get_cifar10_train_dataloader(root: str, batchsize: int) -> DataLoader: - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) - dl = DataLoader( - CIFAR10(root=root, train=True, download=True, transform=transform_train), - batch_size=batchsize, shuffle=True - ) - 
msg_logger.info(f"{len(dl)} entries loaded from training dataset.") - return dl - - -def get_cifar10_test_dataset(root: str) -> CIFAR10: - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) - dataset = CIFAR10(root=root, train=False, download=True, transform=transform_test) - msg_logger.info(f"{len(dataset)} entries loaded from training dataset.") - return dataset - - -def get_cifar10_test_dataloader(root: str, batchsize: int) -> DataLoader: - dl = DataLoader(get_cifar10_test_dataset(root), batch_size=batchsize) - return dl diff --git a/hpvm/projects/pred_tuner/models/domains/__init__.py b/hpvm/projects/pred_tuner/models/domains/__init__.py deleted file mode 100644 index abe6c13a378fe61f9dee7b1c7a60950c1a58226a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/domains/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .qoses import QoS, Accuracy, qos_stats diff --git a/hpvm/projects/pred_tuner/models/domains/qoses.py b/hpvm/projects/pred_tuner/models/domains/qoses.py deleted file mode 100644 index 0a1e7f2eb1050f5adcc4e25d7b65100e3141ae8a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/domains/qoses.py +++ /dev/null @@ -1,317 +0,0 @@ -import abc -from typing import Iterable, List, Optional, Tuple - -import numpy as np -import torch -from torch.utils.data import DataLoader - - -class QoS(abc.ABC): - @abc.abstractmethod - def __sub__(self, other: 'QoS') -> 'QoS': - pass - - @abc.abstractmethod - def __add__(self, other: 'QoS') -> 'QoS': - pass - - @abc.abstractmethod - def __truediv__(self, other: float) -> 'QoS': - pass - - @abc.abstractmethod - def __lt__(self, other: 'QoS') -> bool: - pass - - @abc.abstractmethod - def __eq__(self, other: 'QoS') -> bool: - pass - - def __gt__(self, other: 'QoS') -> bool: - return not self <= other - - def __le__(self, other: 'QoS') -> bool: - return self < other or self == other - - 
def __ge__(self, other: 'QoS') -> bool: - return not self < other - - @abc.abstractmethod - def __hash__(self): - pass - - @abc.abstractmethod - def __repr__(self) -> str: - pass - - @abc.abstractmethod - def to_scalar(self, relative_to=None) -> float: - pass - - @abc.abstractmethod - def numpy(self) -> np.ndarray: - pass - - @abc.abstractmethod - def null(self) -> 'QoS': - pass - - @staticmethod - @abc.abstractmethod - def parse(string: str) -> 'QoS': - pass - - @abc.abstractmethod - def min_positive_loss(self) -> 'QoS': - pass - - @staticmethod - @abc.abstractmethod - def suggested_tuner_thresholds(baseline: 'QoS') -> List['QoS']: - pass - - @staticmethod - @abc.abstractmethod - def suggested_val_threshold(baseline: 'QoS') -> 'QoS': - pass - - @staticmethod - @abc.abstractmethod - def suggested_test_threshold(baseline: 'QoS') -> 'QoS': - pass - - @staticmethod - @abc.abstractmethod - def from_output(output, ground_truth) -> 'QoS': - pass - - @classmethod - def combine_qoses(cls, qoses: Iterable['QoS']) -> 'QoS': - qoses = np.array(qoses) - return qoses.mean() - - @classmethod - def from_all_output(cls, outputs: List, dataloader: DataLoader) -> 'QoS': - if not outputs: - raise ValueError("Empty output has no QoS value") # Probably can result cls.null() - qoses = [] - for (_, gt_output), output in zip(dataloader, outputs): - qoses.append(cls.from_output(output, gt_output)) - return cls.combine_qoses(qoses) - - -class ScalarQoS(QoS, abc.ABC): - def __init__(self, value: float): - self.value = value - - def __sub__(self, other: 'ScalarQoS') -> 'ScalarQoS': - return self.__class__(self.value - other.value) - - def __add__(self, other: 'ScalarQoS') -> 'ScalarQoS': - return self.__class__(self.value + other.value) - - def __truediv__(self, other: float): - return self.__class__(self.value / other) - - def __lt__(self, other: 'ScalarQoS') -> bool: - return self.value < other.value - - def __eq__(self, other: 'ScalarQoS') -> bool: - return self.value == other.value - - 
def __hash__(self): - return hash(self.value) - - def __repr__(self) -> str: - return repr(self.value) - - def null(self) -> 'ScalarQoS': - return self.__class__(0.0) - - def to_scalar(self, relative_to=None) -> float: - return self.value - - def numpy(self) -> np.ndarray: - return np.array([self.value]) - - @classmethod - def parse(cls, string: str) -> 'ScalarQoS': - return cls(float(string)) - - -class Accuracy(ScalarQoS): - def __init__(self, accuracy: float): - super().__init__(accuracy) - - def min_positive_loss(self) -> 'Accuracy': - return Accuracy(0.05) if self.value < 0 else self - - @staticmethod - def suggested_tuner_thresholds(baseline: 'Accuracy') -> List['Accuracy']: - return [baseline - Accuracy(0.8), baseline - Accuracy(1.5), baseline - Accuracy(2.1)] - - @staticmethod - def suggested_val_threshold(baseline: 'Accuracy') -> 'Accuracy': - return baseline - Accuracy(2.1) - - @staticmethod - def suggested_test_threshold(baseline: 'Accuracy') -> 'Accuracy': - return baseline - Accuracy(3.0) - - @staticmethod - def from_output(output: torch.Tensor, ground_truth: torch.Tensor) -> 'Accuracy': - ground_truth = ground_truth.to(output.device) - correct = output.argmax(dim=1).eq(ground_truth).sum().item() - acc = correct / ground_truth.shape[0] - return Accuracy(acc * 100) - - -class PSNR(ScalarQoS): - artificial_max = 100 - - def __init__(self, psnr: float): - super().__init__(psnr) - - def min_positive_loss(self) -> 'PSNR': - return PSNR(1) if self.value < 0 else self - - @staticmethod - def suggested_tuner_thresholds(baseline: 'PSNR') -> List['PSNR']: - return [PSNR(30), PSNR(25), PSNR(20)] - - @staticmethod - def suggested_val_threshold(baseline: 'PSNR') -> 'PSNR': - return PSNR(20) - - @staticmethod - def suggested_test_threshold(baseline: 'PSNR') -> 'PSNR': - return PSNR(20) - - @staticmethod - def from_output(output: torch.Tensor, ground_truth: torch.Tensor) -> 'PSNR': - ground_truth = ground_truth.to(output.device) - if ground_truth.shape[0] != 0: - 
max_i = ground_truth.max() - mse = torch.sum((output - ground_truth) ** 2) / output.nelement() - psnr = (20 * torch.log10(max_i) - 10 * torch.log10(mse)).item() - else: - psnr = PSNR.artificial_max - return PSNR(psnr) - - -class MultiQoS(QoS, abc.ABC): - def __init__(self, *qoses: ScalarQoS): - self.qoses = qoses - - def __sub__(self, other: 'MultiQoS') -> 'MultiQoS': - assert type(self) == type(other) - return self.__class__(*(x - y for x, y in zip(self.qoses, other.qoses))) - - def __add__(self, other: 'MultiQoS') -> 'MultiQoS': - assert type(self) == type(other) - return self.__class__(*(x + y for x, y in zip(self.qoses, other.qoses))) - - def __truediv__(self, other: int): - return self.__class__(*(x / other for x in self.qoses)) - - def __lt__(self, other: 'MultiQoS') -> bool: - assert type(self) == type(other) - return all((x < y for x, y in zip(self.qoses, other.qoses))) - - def __eq__(self, other: 'MultiQoS') -> bool: - assert type(self) == type(other) - return all((x == y for x, y in zip(self.qoses, other.qoses))) - - def __hash__(self): - return hash(self.qoses) - - def __repr__(self) -> str: - return ','.join(repr(q) for q in self.qoses) - - def null(self) -> 'MultiQoS': - return MultiQoS(*(q.null() for q in self.qoses)) - - def numpy(self) -> np.ndarray: - return np.array([q.to_scalar() for q in self.qoses]) - - def min_positive_loss(self) -> 'MultiQoS': - return self.__class__(*(q.min_positive_loss() for q in self.qoses)) - - -PairT = Tuple[torch.Tensor, torch.Tensor] -TripleT = Tuple[torch.Tensor, torch.Tensor, torch.Tensor] - - -class AccuracyPSNR(MultiQoS): - def __init__(self, acc: Accuracy, psnr: PSNR): - super().__init__(acc, psnr) - - def to_scalar(self, relative_to: 'AccuracyPSNR' = None) -> float: - acc, psnr = self.qoses - if relative_to is not None: - thres_acc, thres_psnr = relative_to.qoses - punishment = (-1 if acc < thres_acc else 0) + (-1 if psnr < thres_psnr else 0) - else: - punishment = 0 - max_psnr = PSNR.artificial_max - 
normed_psnr = min(psnr.value, max_psnr) / max_psnr # [0, 1], higher better - acc = acc.value / 100 # [0, 1], higher better - combined = (acc + normed_psnr) / 2 # [0, 1], higher better - assert 0 <= combined <= 1 - return combined + punishment - - @staticmethod - def parse(string: str) -> 'AccuracyPSNR': - acc, psnr = string.split(',') - return AccuracyPSNR(Accuracy.parse(acc), PSNR.parse(psnr)) - - # noinspection PyTypeChecker - @staticmethod - def suggested_tuner_thresholds(baseline: 'AccuracyPSNR') -> List['AccuracyPSNR']: - ret = [] - for acc in Accuracy.suggested_tuner_thresholds(baseline.qoses[0]): - for psnr in PSNR.suggested_tuner_thresholds(baseline.qoses[1]): - ret.append(AccuracyPSNR(acc, psnr)) - return ret - - # noinspection PyTypeChecker - @staticmethod - def suggested_val_threshold(baseline: 'AccuracyPSNR') -> 'AccuracyPSNR': - return AccuracyPSNR( - Accuracy.suggested_val_threshold(baseline.qoses[0]), - PSNR.suggested_val_threshold(baseline.qoses[1]) - ) - - # noinspection PyTypeChecker - @staticmethod - def suggested_test_threshold(baseline: 'AccuracyPSNR') -> 'AccuracyPSNR': - return AccuracyPSNR( - Accuracy.suggested_test_threshold(baseline.qoses[0]), - PSNR.suggested_test_threshold(baseline.qoses[1]) - ) - - @staticmethod - def from_output(output: TripleT, ground_truth: PairT) -> 'AccuracyPSNR': - gt_labels, gt_images = ground_truth - labels, image_selection, images = output - gt_labels = gt_labels.to(labels.device) - gt_images = gt_images.to(images.device) - acc = Accuracy.from_output(labels, gt_labels) - gt_images = gt_images[image_selection] - psnr = PSNR.from_output(images, gt_images) - return AccuracyPSNR(acc, psnr) - - -def qos_stats(qoses: List[QoS], confidence: float = None, threshold: QoS = None) -> \ - Tuple[QoS, Optional[QoS], Optional[float]]: - qoses = np.array(qoses) - n_runs = len(qoses) - confidence_at_thres = np.count_nonzero(qoses > threshold) / n_runs if threshold else None - if confidence is None: - qos_at_confidence = None - 
else: - index = int((1 - confidence) * n_runs) - # Otherwise it's np.float64 and causes trouble with opentuner - qos_at_confidence = qoses[index] - mean_acc = qoses.mean() - return mean_acc, qos_at_confidence, confidence_at_thres diff --git a/hpvm/projects/pred_tuner/models/hpvm/__init__.py b/hpvm/projects/pred_tuner/models/hpvm/__init__.py deleted file mode 100644 index 337738c0bf41002f910acfb98b9e8073ebc10052..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from .alexnet import AlexNet, AlexNet2, AlexNetImageNet -from .alexnet_canny import AlexNet2Canny -from .layers import HPVMConvBundle, HPVMDNN, HPVMDefaultModule, read_tensor_from_file -from .lenet import LeNet -from .mobilenet import MobileNet -from .resnet import ResNet18, ResNet50 -from .vgg16 import VGG16Cifar10, VGG16Cifar100, VGG16ImageNet diff --git a/hpvm/projects/pred_tuner/models/hpvm/alexnet.py b/hpvm/projects/pred_tuner/models/hpvm/alexnet.py deleted file mode 100644 index b7c9b6c3cae1e86ac699913b3f1d09af28c52705..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/alexnet.py +++ /dev/null @@ -1,49 +0,0 @@ -from torch.nn import Linear, ReLU, Sequential, Tanh - -from .layers import HPVMConvBundle, HPVMDNN - - -class AlexNet(HPVMDNN): - def __init__(self): - convs = Sequential( - HPVMConvBundle(3, 64, 11, Tanh, pool_size=2, padding=5), - HPVMConvBundle(64, 192, 5, Tanh, pool_size=2, padding=2), - HPVMConvBundle(192, 384, 3, Tanh, padding=1), - HPVMConvBundle(384, 256, 3, Tanh, padding=1), - HPVMConvBundle(256, 256, 3, Tanh, pool_size=2, padding=1) - ) - linears = Sequential(Linear(4096, 10)) - super().__init__(convs, linears) - - -class AlexNet2(HPVMDNN): - def __init__(self): - convs = Sequential( - HPVMConvBundle(3, 32, 3, Tanh, padding=1), - HPVMConvBundle(32, 32, 3, Tanh, pool_size=2, padding=1), - HPVMConvBundle(32, 64, 3, Tanh, padding=1), - HPVMConvBundle(64, 64, 3, Tanh, pool_size=2, 
padding=1), - HPVMConvBundle(64, 128, 3, Tanh, padding=1), - HPVMConvBundle(128, 128, 3, Tanh, pool_size=2, padding=1) - ) - linears = Sequential(Linear(2048, 10)) - super().__init__(convs, linears) - - -class AlexNetImageNet(HPVMDNN): - def __init__(self): - convs = Sequential( - HPVMConvBundle(3, 64, 11, ReLU, padding=2, stride=4, pool_size=3, pool_stride=2), - HPVMConvBundle(64, 192, 5, ReLU, padding=2, pool_size=3, pool_stride=2), - HPVMConvBundle(192, 384, 3, ReLU, padding=1), - HPVMConvBundle(384, 256, 3, ReLU, padding=1), - HPVMConvBundle(256, 256, 3, ReLU, padding=1, pool_size=3, pool_stride=2) - ) - linears = Sequential( - Linear(9216, 4096), - ReLU(), - Linear(4096, 4096), - ReLU(), - Linear(4096, 1000), - ) - super().__init__(convs, linears) diff --git a/hpvm/projects/pred_tuner/models/hpvm/alexnet_canny.py b/hpvm/projects/pred_tuner/models/hpvm/alexnet_canny.py deleted file mode 100644 index 5e610279121a5b368f4cdf64b72e0a2d6fe9289a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/alexnet_canny.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import Iterable, Tuple - -import torch -from torch.nn import Softmax - -from .alexnet import AlexNet2 -from .layers import HPVMConvBundle, HPVMDefaultModule, ReduceKind, TensorReduce - - -class AlexNet2Canny(HPVMDefaultModule): - def __init__(self, on_classes: Iterable[int]): - super().__init__() - prototype = AlexNet2() - self.on_classes = list(on_classes) - self.convs = prototype.convs - self.linears = prototype.linears - self.softmax = Softmax(1) - self.reduce_1 = TensorReduce(1, ReduceKind.sum) - self.gaussian = HPVMConvBundle(1, 1, 5, padding=2, bias=False) - self.sobel_x = HPVMConvBundle(1, 1, 3, padding=1, bias=False) - self.sobel_y = HPVMConvBundle(1, 1, 3, padding=1, bias=False) - self.reduce_2 = TensorReduce(2, ReduceKind.max) - self.reduce_3 = TensorReduce(2, ReduceKind.max) - - def canny(self, images: torch.Tensor) -> torch.Tensor: - assert len(images.shape) == 4 # 
Assuming NCHW - grayscale = self.reduce_1(images) - grayscale = grayscale.unsqueeze(1) - denoised = self.gaussian(grayscale) - grad_x = self.sobel_x(denoised) - grad_y = self.sobel_y(denoised) - grad_mag = torch.sqrt(grad_x ** 2 + grad_y ** 2) - grad_max_1D = self.reduce_2(grad_mag) - grad_max = self.reduce_3(grad_max_1D) - grad_max = grad_max.unsqueeze(2).unsqueeze(3) - grad_mag_norm = grad_mag / grad_max - return grad_mag_norm - - def forward(self, inputs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - from functools import reduce - from operator import ior - dnn_input, canny_input = inputs - conv_outputs = self.convs(dnn_input) - dnn_outputs = self.softmax(self.linears(conv_outputs.view(conv_outputs.shape[0], -1))) - classes = dnn_outputs.argmax(dim=1) - selection = reduce(ior, (classes == i for i in self.on_classes)) - selected_inputs = canny_input[selection] - return dnn_outputs, selection, self.canny(selected_inputs) diff --git a/hpvm/projects/pred_tuner/models/hpvm/layers.py b/hpvm/projects/pred_tuner/models/hpvm/layers.py deleted file mode 100644 index fed66e7b1507ac4ca309de0dc0599dde9a926a8a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/layers.py +++ /dev/null @@ -1,223 +0,0 @@ -from enum import Enum -from pathlib import Path -from typing import Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from torch.nn import AvgPool2d, BatchNorm2d, Conv2d, Linear, MaxPool2d, Module, Parameter, ReLU, Sequential, Softmax, \ - Tanh - - -def rsetattr(obj, attr, val): - pre, _, post = attr.rpartition('.') - return setattr(rgetattr(obj, pre) if pre else obj, post, val) - - -def rgetattr(obj, attr, *args): - def _getattr(obj_, attr_): - return getattr(obj_, attr_, *args) - - import functools - return functools.reduce(_getattr, attr.split('.'), obj) - - -def read_tensor_from_file( - filename: Union[str, Path], *shape: int, - read_ty=np.float32, cast_ty=np.float32, - count: int = -1, offset: int = 
0, - use_progress_bar: bool = False -) -> torch.Tensor: - from tqdm import trange - block_size = 102400 - offset = offset * read_ty().itemsize - mmap = np.memmap(filename, dtype=read_ty, mode='r', offset=offset) - raw = np.empty_like(mmap) - n_entries = min(mmap.shape[0], count) if count != -1 else mmap.shape[0] - n_blocks = int(np.ceil(n_entries / block_size)) - iterable = trange(n_blocks) if use_progress_bar else range(n_blocks) - for block in iterable: - l, r = block * block_size, min(n_entries, (block + 1) * block_size) - raw[l:r] = mmap[l:r] - del mmap - if cast_ty != read_ty: - raw = raw.astype(cast_ty) - loaded_np = raw.reshape(shape) - return torch.from_numpy(loaded_np) - - -ActivT = Optional[Callable[[], Module]] -ArgsT = Union[List, Dict] -RangeT = Tuple[float, float] -RangeOT = Optional[RangeT] - - -class HPVMConvBundle(Module): - def __init__( - self, in_channels: int, out_channels: int, kernel_size: int, - activation: ActivT = None, - pool_size: Optional[int] = None, pool_stride: Optional[int] = None, - **conv_kwargs - ): - super().__init__() - self.conv = Conv2d(in_channels, out_channels, kernel_size, **conv_kwargs) - if pool_size is None: - self.pooling = Sequential() - else: - pool_stride = pool_stride or pool_size - self.pooling = MaxPool2d(pool_size, stride=pool_stride) - self.activation = Sequential() if activation is None else activation() - self.conv_ranges_ = None - - def forward(self, input_: torch.Tensor) -> torch.Tensor: - return self.activation(self.pooling(self.conv(input_))) - - def input_to_conv(self, input_: torch.Tensor) -> torch.Tensor: - bias = self.conv.bias - self.conv.bias = None - conv_out = self.conv(input_) - self.conv.bias = bias - return conv_out - - def conv_to_output(self, conv_output: torch.Tensor) -> torch.Tensor: - if self.conv.bias is not None: - broadcast_bias = self.conv.bias.reshape(1, -1, 1, 1) - return self.activation(self.pooling(conv_output + broadcast_bias)) - else: - return 
self.activation(self.pooling(conv_output)) - - def __getattr__(self, item): - if item in ('weight', 'bias'): - return getattr(self.conv, item) - return super(HPVMConvBundle, self).__getattr__(item) - - def __setattr__(self, key, value): - if key in ('weight', 'bias'): - setattr(self.conv, key, value) - else: - super(HPVMConvBundle, self).__setattr__(key, value) - - -class ReduceKind(Enum): - sum = 1 - max = 2 - - -class TensorReduce(Module): - def __init__(self, dim: int, kind: ReduceKind, skip_ratio: float = 0.0): - super().__init__() - self.dim = dim - self.skip_ratio = skip_ratio - if kind == ReduceKind.sum: - self.reducer = lambda x: x.sum(dim=0) # Because we transpose the input - self.normalizer = lambda x: x / (1 - self.skip_ratio) - elif kind == ReduceKind.max: - self.reducer = lambda x: x.max(dim=0)[0] - self.normalizer = lambda x: x - - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - from math import ceil - inputs_t = inputs.transpose(0, self.dim) - if len(inputs) == 0: - dim_reduced = torch.zeros_like(inputs_t)[0] - else: - reduce_dim_size = inputs_t.size(0) - approxed_dim_size = int(ceil((1 - self.skip_ratio) * reduce_dim_size)) - # Take a contiguous chunk and reduce over it, ignore the rest - dim_reduced: torch.Tensor = self.normalizer(self.reducer(inputs_t[:approxed_dim_size])) - return dim_reduced.unsqueeze(0).transpose(0, self.dim).squeeze(self.dim) - - def change_skip_ratio(self, skip_ratio: float) -> 'TensorReduce': - return TensorReduce(self.dim, self.kind, skip_ratio) - - -def read_quant_ranges(prefix: Path): - range_file = prefix / 'quant_ranges.txt' - if not range_file.is_file(): - return None - with range_file.open() as f: - return [[float(field) for field in line.strip().split()] for line in f.readlines()] - - -class HPVMDefaultModule(Module): - @staticmethod - def load_into_layer( - layer: Module, attr_name: str, filename: str, prefix: Path, - is_linear_weight: bool = False - ): - tensor = rgetattr(layer, attr_name) - if 
is_linear_weight: - n_out, n_in = tensor.shape - loaded = read_tensor_from_file(prefix / filename, n_in, n_out).T - else: - loaded = read_tensor_from_file(prefix / filename, *tensor.shape) - if type(tensor) is Parameter: - loaded = Parameter(loaded, requires_grad=True) - rsetattr(layer, attr_name, loaded) - - @staticmethod - def install_quant_range(module: Module, values: List[float]): - in_min, in_max, w_min, w_max, b_min, b_max, out_min, out_max = values - module.conv_ranges = (in_min, in_max), (w_min, w_max), (b_min, b_max), (out_min, out_max) - - def default_load_hpvm_weights(self, prefix: str): - # TODO: this is probably better done with help of ModuleDAG - prefix = Path(prefix) - convs, group_convs, linears, bns = [], [], [], [] - weightless_types = AvgPool2d, MaxPool2d, ReLU, Tanh, Softmax, TensorReduce - container_types = (Sequential,) - for module in self.modules(): - if isinstance(module, HPVMConvBundle): - convs.append(module) - elif isinstance(module, Conv2d): - if module.groups != 1: - group_convs.append(module) - elif isinstance(module, Linear): - linears.append(module) - elif isinstance(module, BatchNorm2d): - bns.append(module) - elif type(module) in weightless_types: - pass - elif type(module) in container_types or len(list(module.children())) != 0: - continue - else: - raise RuntimeError(f"Layer type {type(module)} not understood") - load = self.load_into_layer - quant_ranges = read_quant_ranges(prefix) - quant_ranges_idx = 0 - for i, conv in enumerate(convs): - conv: HPVMConvBundle - load(conv, 'weight', f"conv2d_{i + 1}_w.bin", prefix) - if conv.bias is not None: - load(conv, 'bias', f"conv2d_{i + 1}_b.bin", prefix) - if quant_ranges is not None: - self.install_quant_range(conv, quant_ranges[quant_ranges_idx]) - quant_ranges_idx += 1 - for i, gconv in enumerate(group_convs): - load(gconv, 'weight', f"depthwise_conv2d_{i + 1}_w.bin", prefix) - if gconv.bias is not None: - load(gconv, 'bias', f"depthwise_conv2d_{i + 1}_b.bin", prefix) - for i, bn 
in enumerate(bns): - bn: BatchNorm2d - load(bn, 'weight', f"batch_normalization_{i + 1}_gamma.bin", prefix) - load(bn, 'bias', f"batch_normalization_{i + 1}_beta.bin", prefix) - load(bn, 'running_mean', f"batch_normalization_{i + 1}_mean.bin", prefix) - load(bn, 'running_var', f"batch_normalization_{i + 1}_variance.bin", prefix) - for i, linear in enumerate(linears): - load(linear, 'weight', f"dense_{i + 1}_w.bin", prefix, True) - load(linear, 'bias', f"dense_{i + 1}_b.bin", prefix) - if quant_ranges is not None: - self.install_quant_range(linear, quant_ranges[quant_ranges_idx]) - quant_ranges_idx += 1 - assert quant_ranges is None or len(quant_ranges) == quant_ranges_idx - - -class HPVMDNN(HPVMDefaultModule): - def __init__(self, convs: Sequential, linears: Sequential): - super().__init__() - self.convs = convs - self.linears = linears - self.softmax = Softmax(1) - - def forward(self, inputs: torch.Tensor) -> torch.Tensor: - outputs = self.convs(inputs) - return self.softmax(self.linears(outputs.view(outputs.shape[0], -1))) diff --git a/hpvm/projects/pred_tuner/models/hpvm/lenet.py b/hpvm/projects/pred_tuner/models/hpvm/lenet.py deleted file mode 100644 index 0802b5f78d2c73d352afe68b16df74689e9aec68..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/lenet.py +++ /dev/null @@ -1,16 +0,0 @@ -from torch.nn import Linear, Sequential, Tanh - -from .layers import HPVMConvBundle, HPVMDNN - - -class LeNet(HPVMDNN): - def __init__(self): - convs = Sequential( - HPVMConvBundle(1, 32, 5, Tanh, 2, padding=2), - HPVMConvBundle(32, 64, 5, Tanh, 2, padding=2) - ) - linears = Sequential( - Linear(7 * 7 * 64, 1024), Tanh(), - Linear(1024, 10), Tanh() - ) - super().__init__(convs, linears) diff --git a/hpvm/projects/pred_tuner/models/hpvm/mobilenet.py b/hpvm/projects/pred_tuner/models/hpvm/mobilenet.py deleted file mode 100644 index f48a214fc9c1d7ec52cd5a24ec0e8d82d38aaa6e..0000000000000000000000000000000000000000 --- 
a/hpvm/projects/pred_tuner/models/hpvm/mobilenet.py +++ /dev/null @@ -1,45 +0,0 @@ -from torch.nn import AvgPool2d, BatchNorm2d, Conv2d, Linear, ReLU, Sequential - -from .layers import HPVMDNN, HPVMConvBundle - - -def _make_seq(in_channels, out_channels, c_kernel_size, gc_stride, gc_kernel_size=3): - return Sequential( - HPVMConvBundle( - in_channels, out_channels, c_kernel_size, - bias=False, padding=(c_kernel_size - 1) // 2 - ), - BatchNorm2d(out_channels, eps=0.001), - ReLU(), - Conv2d( - out_channels, out_channels, gc_kernel_size, - bias=False, stride=gc_stride, padding=(gc_kernel_size - 1) // 2, groups=out_channels - ), - BatchNorm2d(out_channels, eps=0.001), - ReLU() - ) - - -class MobileNet(HPVMDNN): - def __init__(self): - convs = Sequential( - _make_seq(3, 32, 3, 1), - _make_seq(32, 64, 1, 2), - _make_seq(64, 128, 1, 1), - _make_seq(128, 128, 1, 2), - _make_seq(128, 256, 1, 1), - _make_seq(256, 256, 1, 2), - _make_seq(256, 512, 1, 1), - _make_seq(512, 512, 1, 1), - _make_seq(512, 512, 1, 1), - _make_seq(512, 512, 1, 1), - _make_seq(512, 512, 1, 1), - _make_seq(512, 512, 1, 2), - _make_seq(512, 1024, 1, 1), - HPVMConvBundle(1024, 1024, 1, padding=0, bias=False), - BatchNorm2d(1024, eps=0.001), - ReLU(), - AvgPool2d(2) - ) - linears = Sequential(Linear(1024, 10)) - super().__init__(convs, linears) diff --git a/hpvm/projects/pred_tuner/models/hpvm/resnet.py b/hpvm/projects/pred_tuner/models/hpvm/resnet.py deleted file mode 100644 index fc42a00001792b59b593b668f6cf4e8a5a230d9d..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/resnet.py +++ /dev/null @@ -1,96 +0,0 @@ -from torch.nn import AvgPool2d, BatchNorm2d, Linear, Module, ReLU, Sequential - -from .layers import HPVMConvBundle, HPVMDNN - - -class BasicBlock(Module): - def __init__(self, ins, outs, shortcut=False): - super().__init__() - stride = 2 if shortcut else 1 - self.mainline = Sequential( - HPVMConvBundle(ins, outs, 3, ReLU, padding=1, stride=stride), - 
HPVMConvBundle(outs, outs, 3, padding=1) - ) - self.relu1 = ReLU() - self.shortcut = HPVMConvBundle(ins, outs, 1, stride=stride) \ - if shortcut else Sequential() - - def forward(self, input_): - return self.relu1(self.mainline(input_) + self.shortcut(input_)) - - -class ResNet18(HPVMDNN): - def __init__(self): - convs = Sequential( - HPVMConvBundle(3, 16, 3, ReLU, padding=1), - BasicBlock(16, 16), - BasicBlock(16, 16), - BasicBlock(16, 16), - BasicBlock(16, 32, True), - BasicBlock(32, 32), - BasicBlock(32, 32), - BasicBlock(32, 64, True), - BasicBlock(64, 64), - BasicBlock(64, 64), - AvgPool2d(8) - ) - linears = Sequential(Linear(64, 10)) - super().__init__(convs, linears) - - -class Bottleneck(Module): - expansion = 4 - - def __init__(self, in_planes, planes, stride=1): - super(Bottleneck, self).__init__() - self.mainline = Sequential( - HPVMConvBundle(in_planes, planes, 1, stride=stride), - BatchNorm2d(planes, eps=0.001), - ReLU(), - HPVMConvBundle(planes, planes, 3, padding=1), - BatchNorm2d(planes, eps=0.001), - ReLU(), - HPVMConvBundle(planes, self.expansion * planes, 1), - BatchNorm2d(self.expansion * planes, eps=0.001) - ) - self.relu1 = ReLU() - if stride != 1 or in_planes != self.expansion * planes: - self.shortcut = Sequential( - HPVMConvBundle(in_planes, self.expansion * planes, 1, stride=stride), - BatchNorm2d(self.expansion * planes, eps=0.001) - ) - else: - self.shortcut = Sequential() - - def forward(self, input_): - return self.relu1(self.mainline(input_) + self.shortcut(input_)) - - -class ResNet50(HPVMDNN): - def __init__(self): - convs = Sequential( - HPVMConvBundle(3, 64, 7, ReLU, pool_size=3, pool_stride=2, padding=3, stride=2), - BatchNorm2d(64, eps=0.001), - Bottleneck(64, 64), - Bottleneck(256, 64), - Bottleneck(256, 64), - - Bottleneck(256, 128, stride=2), - Bottleneck(512, 128), - Bottleneck(512, 128), - Bottleneck(512, 128), - - Bottleneck(512, 256, stride=2), - Bottleneck(1024, 256), - Bottleneck(1024, 256), - Bottleneck(1024, 256), - 
Bottleneck(1024, 256), - Bottleneck(1024, 256), - - Bottleneck(1024, 512, stride=2), - Bottleneck(2048, 512), - Bottleneck(2048, 512), - AvgPool2d(7) - ) - linears = Sequential(Linear(2048, 1000)) - super().__init__(convs, linears) diff --git a/hpvm/projects/pred_tuner/models/hpvm/vgg16.py b/hpvm/projects/pred_tuner/models/hpvm/vgg16.py deleted file mode 100644 index b31c0d47ca43118cbc1f7ad43b517d6dc02dd223..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/hpvm/vgg16.py +++ /dev/null @@ -1,44 +0,0 @@ -from typing import Iterable - -from torch.nn import Linear, ReLU, Sequential - -from .layers import HPVMConvBundle, HPVMDNN - - -class _VGG16(HPVMDNN): - def __init__(self, linear_inouts: Iterable[int]): - convs = Sequential( - HPVMConvBundle(3, 64, 3, ReLU, padding=1), - HPVMConvBundle(64, 64, 3, ReLU, 2, padding=1), - HPVMConvBundle(64, 128, 3, ReLU, padding=1), - HPVMConvBundle(128, 128, 3, ReLU, 2, padding=1), - HPVMConvBundle(128, 256, 3, ReLU, padding=1), - HPVMConvBundle(256, 256, 3, ReLU, padding=1), - HPVMConvBundle(256, 256, 3, ReLU, 2, padding=1), - HPVMConvBundle(256, 512, 3, ReLU, padding=1), - HPVMConvBundle(512, 512, 3, ReLU, padding=1), - HPVMConvBundle(512, 512, 3, ReLU, 2, padding=1), - HPVMConvBundle(512, 512, 3, ReLU, padding=1), - HPVMConvBundle(512, 512, 3, ReLU, padding=1), - HPVMConvBundle(512, 512, 3, ReLU, 2, padding=1) - ) - linear_layers = [Linear(in_, out) for in_, out in zip(linear_inouts, linear_inouts[1:])] - linear_relus = [ReLU() for _ in range(2 * len(linear_layers) - 1)] - linear_relus[::2] = linear_layers - linears = Sequential(*linear_relus) - super().__init__(convs, linears) - - -class VGG16Cifar10(_VGG16): - def __init__(self): - super().__init__([512, 512, 10]) - - -class VGG16Cifar100(_VGG16): - def __init__(self): - super().__init__([512, 512, 100]) - - -class VGG16ImageNet(_VGG16): - def __init__(self): - super().__init__([25088, 4096, 4096, 1000]) diff --git 
a/hpvm/projects/pred_tuner/models/inference.py b/hpvm/projects/pred_tuner/models/inference.py deleted file mode 100644 index d797e9e605d8c3363d20f09fb52eb4a78195a9ac..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/inference.py +++ /dev/null @@ -1,99 +0,0 @@ -import logging -from typing import Type, Union - -import torch -from torch.nn import Module -from torch.utils.data import DataLoader, IterableDataset, Subset - -from .domains import QoS -from .hpvm import HPVMDNN, HPVMDefaultModule -from .networks import networks - -msg_logger = logging.getLogger(__name__) - - -def move_to_device_recursively(data: object, device_: Union[torch.device, str]): - if isinstance(data, torch.Tensor): - return data.to(device_) - if not hasattr(data, '__dict__'): - if isinstance(data, list): - return [move_to_device_recursively(x, device_) for x in data] - elif isinstance(data, tuple): - return tuple([move_to_device_recursively(x, device_) for x in data]) - else: - raise RuntimeError(f"Don't know how to manipulate {type(data)}") - for key, value in data.__dict__.items(): - data.__dict__[key] = move_to_device_recursively(value, device_) - return data - - -def _infer_net_device(net: Module): - return next(iter(net.parameters())).device - - -def get_all_output(net: Module, dataloader: DataLoader): - outputs = [] - device = _infer_net_device(net) - with torch.no_grad(): - for inputs, targets in dataloader: - inputs = move_to_device_recursively(inputs, device) - outputs.append(net(inputs)) - return outputs - - -def load_torch_checkpoint(net: Module, chpt_path: str): - msg_logger.info('==> Loading checkpoint..') - checkpoint = torch.load(chpt_path) - net.load_state_dict(checkpoint.pop('net')) - return checkpoint - - -class BaselineInfo: - def __init__( - self, net: Module, val_loader: DataLoader, test_loader: DataLoader, - non_tensor_output: bool, qos_class: Type[QoS] - ): - self.baseline_net = net - self.val_loader = val_loader - self.test_loader = test_loader 
- self.non_tensor_output = non_tensor_output - self.qos_class = qos_class - self.val_qos = self.get_qos(net, val_loader) - self.test_qos = self.get_qos(net, test_loader) - - def get_qos(self, net: Module, dataloader: DataLoader): - return self.qos_class.from_all_output(get_all_output(net, dataloader), dataloader) - - @staticmethod - def _split_dataset(dataset: IterableDataset, split_at: int): - return Subset(dataset, torch.arange(0, split_at)), \ - Subset(dataset, torch.arange(split_at, len(dataset))) - - @classmethod - def init_by_name(cls, model_name: str, device) -> 'BaselineInfo': - msg_logger.info('==> Building model..') - network_factory, dataset_factory, batchsize, prefix, qos_class = networks[model_name] - net = network_factory() - # 1. Load network weights - msg_logger.info('==> Loading checkpoint..') - if isinstance(net, HPVMDefaultModule): - net.default_load_hpvm_weights(prefix) - else: - load_torch_checkpoint(net, prefix) - net = net.eval().to(device) - # 2. Load dataset - msg_logger.info('==> Loading dataset...') - if isinstance(net, HPVMDNN): - dataset = dataset_factory(prefix) - non_tensor_output = False - elif isinstance(net, HPVMDefaultModule): # Is image benchmark - dataset = dataset_factory(prefix) - non_tensor_output = True - else: - dataset = dataset_factory('./data') - non_tensor_output = False - # 3. Split dataset - test_set, val_set = cls._split_dataset(dataset, 5000) - test_loader = DataLoader(test_set, batch_size=batchsize) - val_loader = DataLoader(val_set, batch_size=batchsize) - return cls(net, val_loader, test_loader, non_tensor_output, qos_class) diff --git a/hpvm/projects/pred_tuner/models/networks.py b/hpvm/projects/pred_tuner/models/networks.py deleted file mode 100644 index a5611bcb3e681c618cc5f8d8d188e9afc2fb5687..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/networks.py +++ /dev/null @@ -1,54 +0,0 @@ -from . 
import hpvm -from .datasets import CIFAR, CIFARImage, MNIST, get_cifar10_test_dataset -from .domains import Accuracy -from .domains.qoses import AccuracyPSNR -from .torch import ResNet18, VGG - - -networks = { - 'lenet_hpvm': ( - hpvm.LeNet, MNIST.from_default_file, 5000, - 'model_params/lenet_mnist', Accuracy - ), - 'alexnet_hpvm': ( - hpvm.AlexNet, CIFAR.from_default_file, 2000, - 'model_params/alexnet_cifar10', Accuracy - ), - 'alexnet2_hpvm': ( - hpvm.AlexNet2, CIFAR.from_default_file, 2000, - 'model_params/alexnet2_cifar10', Accuracy - ), - 'vgg16_cifar10_hpvm': ( - hpvm.VGG16Cifar10, CIFAR.from_default_file, 500, - 'model_params/vgg16_cifar10', Accuracy - ), - 'vgg16_cifar100_hpvm': ( - hpvm.VGG16Cifar100, CIFAR.from_default_file, 500, - 'model_params/vgg16_cifar100', Accuracy - ), - 'mobilenet_hpvm': ( - hpvm.MobileNet, CIFAR.from_default_file, 1000, - 'model_params/mobilenet', Accuracy - ), - 'resnet18_hpvm': ( - hpvm.ResNet18, CIFAR.from_default_file, 1000, - 'model_params/resnet18_cifar10', Accuracy - ), - 'alexnet_imagenet_hpvm': ( - hpvm.AlexNetImageNet, CIFAR.from_default_file, 100, - 'model_params/alexnet_imagenet', Accuracy - ), - 'vgg16_imagenet_hpvm': ( - hpvm.VGG16ImageNet, CIFAR.from_default_file, 50, - 'model_params/vgg16_imagenet', Accuracy - ), - 'resnet50_imagenet_hpvm': ( - hpvm.ResNet50, CIFAR.from_default_file, 25, - 'model_params/resnet50_imagenet', Accuracy - ), - 'alexnet2_canny_hpvm': ( - lambda: hpvm.AlexNet2Canny(on_classes=[1, 2, 3, 4, 5]), - CIFARImage.from_default_file, 50, - 'model_params/alexnet2_canny', AccuracyPSNR - ) -} diff --git a/hpvm/projects/pred_tuner/models/torch/__init__.py b/hpvm/projects/pred_tuner/models/torch/__init__.py deleted file mode 100644 index aff98ce114a9f0797ed08e74db1184d727f94f2e..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from .vgg import * -from .dpn import * -from .lenet import * -from .senet import * -from 
.pnasnet import * -from .densenet import * -from .googlenet import * -from .shufflenet import * -from .shufflenetv2 import * -from .resnet import * -from .resnext import * -from .preact_resnet import * -from .mobilenet import * -from .mobilenetv2 import * -from .efficientnet import * diff --git a/hpvm/projects/pred_tuner/models/torch/densenet.py b/hpvm/projects/pred_tuner/models/torch/densenet.py deleted file mode 100644 index 47ebbbe08e40503d6785711acd8bd7dd2cdba768..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/densenet.py +++ /dev/null @@ -1,107 +0,0 @@ -'''DenseNet in PyTorch.''' -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Bottleneck(nn.Module): - def __init__(self, in_planes, growth_rate): - super(Bottleneck, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(4*growth_rate) - self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False) - - def forward(self, x): - out = self.conv1(F.relu(self.bn1(x))) - out = self.conv2(F.relu(self.bn2(out))) - out = torch.cat([out,x], 1) - return out - - -class Transition(nn.Module): - def __init__(self, in_planes, out_planes): - super(Transition, self).__init__() - self.bn = nn.BatchNorm2d(in_planes) - self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False) - - def forward(self, x): - out = self.conv(F.relu(self.bn(x))) - out = F.avg_pool2d(out, 2) - return out - - -class DenseNet(nn.Module): - def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): - super(DenseNet, self).__init__() - self.growth_rate = growth_rate - - num_planes = 2*growth_rate - self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False) - - self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) - num_planes += nblocks[0]*growth_rate - out_planes = 
int(math.floor(num_planes*reduction)) - self.trans1 = Transition(num_planes, out_planes) - num_planes = out_planes - - self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) - num_planes += nblocks[1]*growth_rate - out_planes = int(math.floor(num_planes*reduction)) - self.trans2 = Transition(num_planes, out_planes) - num_planes = out_planes - - self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) - num_planes += nblocks[2]*growth_rate - out_planes = int(math.floor(num_planes*reduction)) - self.trans3 = Transition(num_planes, out_planes) - num_planes = out_planes - - self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) - num_planes += nblocks[3]*growth_rate - - self.bn = nn.BatchNorm2d(num_planes) - self.linear = nn.Linear(num_planes, num_classes) - - def _make_dense_layers(self, block, in_planes, nblock): - layers = [] - for i in range(nblock): - layers.append(block(in_planes, self.growth_rate)) - in_planes += self.growth_rate - return nn.Sequential(*layers) - - def forward(self, x): - out = self.conv1(x) - out = self.trans1(self.dense1(out)) - out = self.trans2(self.dense2(out)) - out = self.trans3(self.dense3(out)) - out = self.dense4(out) - out = F.avg_pool2d(F.relu(self.bn(out)), 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - -def DenseNet121(): - return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) - -def DenseNet169(): - return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) - -def DenseNet201(): - return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) - -def DenseNet161(): - return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) - -def densenet_cifar(): - return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12) - -def test(): - net = densenet_cifar() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/dpn.py b/hpvm/projects/pred_tuner/models/torch/dpn.py deleted file mode 100644 index 
d334367fcc9876b104a94b7ae333362ea0a64469..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/dpn.py +++ /dev/null @@ -1,98 +0,0 @@ -'''Dual Path Networks in PyTorch.''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Bottleneck(nn.Module): - def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer): - super(Bottleneck, self).__init__() - self.out_planes = out_planes - self.dense_depth = dense_depth - - self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False) - self.bn2 = nn.BatchNorm2d(in_planes) - self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes+dense_depth) - - self.shortcut = nn.Sequential() - if first_layer: - self.shortcut = nn.Sequential( - nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(out_planes+dense_depth) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - x = self.shortcut(x) - d = self.out_planes - out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1) - out = F.relu(out) - return out - - -class DPN(nn.Module): - def __init__(self, cfg): - super(DPN, self).__init__() - in_planes, out_planes = cfg['in_planes'], cfg['out_planes'] - num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth'] - - self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.last_planes = 64 - self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1) - self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2) - self.layer3 = 
self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2) - self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2) - self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10) - - def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for i,stride in enumerate(strides): - layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0)) - self.last_planes = out_planes + (i+2) * dense_depth - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def DPN26(): - cfg = { - 'in_planes': (96,192,384,768), - 'out_planes': (256,512,1024,2048), - 'num_blocks': (2,2,2,2), - 'dense_depth': (16,32,24,128) - } - return DPN(cfg) - -def DPN92(): - cfg = { - 'in_planes': (96,192,384,768), - 'out_planes': (256,512,1024,2048), - 'num_blocks': (3,4,20,3), - 'dense_depth': (16,32,24,128) - } - return DPN(cfg) - - -def test(): - net = DPN92() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/efficientnet.py b/hpvm/projects/pred_tuner/models/torch/efficientnet.py deleted file mode 100644 index 6a10a97468b5a505d5ea4bf1b5b53859dacef233..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/efficientnet.py +++ /dev/null @@ -1,99 +0,0 @@ -'''EfficientNet in PyTorch. - -Paper: "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks". 
-''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''expand + depthwise + pointwise + squeeze-excitation''' - - def __init__(self, in_planes, out_planes, expansion, stride): - super(Block, self).__init__() - self.stride = stride - - planes = expansion * in_planes - self.conv1 = nn.Conv2d( - in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, - stride=stride, padding=1, groups=planes, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d( - planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes) - - self.shortcut = nn.Sequential() - if stride == 1 and in_planes != out_planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, out_planes, kernel_size=1, - stride=1, padding=0, bias=False), - nn.BatchNorm2d(out_planes), - ) - - # SE layers - self.fc1 = nn.Conv2d(out_planes, out_planes//16, kernel_size=1) - self.fc2 = nn.Conv2d(out_planes//16, out_planes, kernel_size=1) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - shortcut = self.shortcut(x) if self.stride == 1 else out - # Squeeze-Excitation - w = F.avg_pool2d(out, out.size(2)) - w = F.relu(self.fc1(w)) - w = self.fc2(w).sigmoid() - out = out * w + shortcut - return out - - -class EfficientNet(nn.Module): - def __init__(self, cfg, num_classes=10): - super(EfficientNet, self).__init__() - self.cfg = cfg - self.conv1 = nn.Conv2d(3, 32, kernel_size=3, - stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(32) - self.layers = self._make_layers(in_planes=32) - self.linear = nn.Linear(cfg[-1][1], num_classes) - - def _make_layers(self, in_planes): - layers = [] - for expansion, out_planes, num_blocks, stride in self.cfg: - strides = [stride] + [1]*(num_blocks-1) - for stride in 
strides: - layers.append(Block(in_planes, out_planes, expansion, stride)) - in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layers(out) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def EfficientNetB0(): - # (expansion, out_planes, num_blocks, stride) - cfg = [(1, 16, 1, 2), - (6, 24, 2, 1), - (6, 40, 2, 2), - (6, 80, 3, 2), - (6, 112, 3, 1), - (6, 192, 4, 2), - (6, 320, 1, 2)] - return EfficientNet(cfg) - - -def test(): - net = EfficientNetB0() - x = torch.randn(2, 3, 32, 32) - y = net(x) - print(y.shape) - - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/googlenet.py b/hpvm/projects/pred_tuner/models/torch/googlenet.py deleted file mode 100644 index 8ed8f6eb236d966f206f457e1637e11fecd44408..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/googlenet.py +++ /dev/null @@ -1,106 +0,0 @@ -"""GoogLeNet with PyTorch.""" -import torch -import torch.nn as nn - - -class Inception(nn.Module): - def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes): - super(Inception, self).__init__() - # 1x1 conv branch - self.b1 = nn.Sequential( - nn.Conv2d(in_planes, n1x1, kernel_size=1), - nn.BatchNorm2d(n1x1), - nn.ReLU(True), - ) - - # 1x1 conv -> 3x3 conv branch - self.b2 = nn.Sequential( - nn.Conv2d(in_planes, n3x3red, kernel_size=1), - nn.BatchNorm2d(n3x3red), - nn.ReLU(True), - nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1), - nn.BatchNorm2d(n3x3), - nn.ReLU(True), - ) - - # 1x1 conv -> 5x5 conv branch - self.b3 = nn.Sequential( - nn.Conv2d(in_planes, n5x5red, kernel_size=1), - nn.BatchNorm2d(n5x5red), - nn.ReLU(True), - nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1), - nn.BatchNorm2d(n5x5), - nn.ReLU(True), - nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), - nn.BatchNorm2d(n5x5), - nn.ReLU(True), - ) - - # 3x3 pool -> 1x1 conv branch - self.b4 = nn.Sequential( - nn.MaxPool2d(3, 
stride=1, padding=1), - nn.Conv2d(in_planes, pool_planes, kernel_size=1), - nn.BatchNorm2d(pool_planes), - nn.ReLU(True), - ) - - def forward(self, x): - y1 = self.b1(x) - y2 = self.b2(x) - y3 = self.b3(x) - y4 = self.b4(x) - return torch.cat([y1, y2, y3, y4], 1) - - -class GoogLeNet(nn.Module): - def __init__(self): - super(GoogLeNet, self).__init__() - self.pre_layers = nn.Sequential( - nn.Conv2d(3, 192, kernel_size=3, padding=1), - nn.BatchNorm2d(192), - nn.ReLU(True), - ) - - self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) - self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) - - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - - self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) - self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) - self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) - self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) - self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) - - self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) - self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) - - self.avgpool = nn.AvgPool2d(8, stride=1) - self.linear = nn.Linear(1024, 10) - - def forward(self, x): - out = self.pre_layers(x) - out = self.a3(out) - out = self.b3(out) - out = self.maxpool(out) - out = self.a4(out) - out = self.b4(out) - out = self.c4(out) - out = self.d4(out) - out = self.e4(out) - out = self.maxpool(out) - out = self.a5(out) - out = self.b5(out) - out = self.avgpool(out) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def test(): - net = GoogLeNet() - x = torch.randn(1, 3, 32, 32) - y = net(x) - print(y.size()) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/lenet.py b/hpvm/projects/pred_tuner/models/torch/lenet.py deleted file mode 100644 index d657b7482a75a3058e5795f367dfbb32e948b9d5..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/lenet.py +++ /dev/null @@ -1,23 +0,0 @@ -'''LeNet in PyTorch.''' -import torch.nn as nn -import torch.nn.functional 
as F - -class LeNet(nn.Module): - def __init__(self): - super(LeNet, self).__init__() - self.conv1 = nn.Conv2d(3, 6, 5) - self.conv2 = nn.Conv2d(6, 16, 5) - self.fc1 = nn.Linear(16*5*5, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - out = F.relu(self.conv1(x)) - out = F.max_pool2d(out, 2) - out = F.relu(self.conv2(out)) - out = F.max_pool2d(out, 2) - out = out.view(out.size(0), -1) - out = F.relu(self.fc1(out)) - out = F.relu(self.fc2(out)) - out = self.fc3(out) - return out diff --git a/hpvm/projects/pred_tuner/models/torch/mobilenet.py b/hpvm/projects/pred_tuner/models/torch/mobilenet.py deleted file mode 100644 index 497ef1e867d2a597b9b444ebc7a6f30cd5219777..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/mobilenet.py +++ /dev/null @@ -1,61 +0,0 @@ -'''MobileNet in PyTorch. - -See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" -for more details. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''Depthwise conv + Pointwise conv''' - def __init__(self, in_planes, out_planes, stride=1): - super(Block, self).__init__() - self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False) - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - return out - - -class MobileNet(nn.Module): - # (128,2) means conv planes=128, conv stride=2, by default conv stride=1 - cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024] - - def __init__(self, num_classes=10): - super(MobileNet, self).__init__() - self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) - 
self.bn1 = nn.BatchNorm2d(32) - self.layers = self._make_layers(in_planes=32) - self.linear = nn.Linear(1024, num_classes) - - def _make_layers(self, in_planes): - layers = [] - for x in self.cfg: - out_planes = x if isinstance(x, int) else x[0] - stride = 1 if isinstance(x, int) else x[1] - layers.append(Block(in_planes, out_planes, stride)) - in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layers(out) - out = F.avg_pool2d(out, 2) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def test(): - net = MobileNet() - x = torch.randn(1,3,32,32) - y = net(x) - print(y.size()) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/mobilenetv2.py b/hpvm/projects/pred_tuner/models/torch/mobilenetv2.py deleted file mode 100644 index 17e5823ef4426ceceae462782a267f89b1ecbc76..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/mobilenetv2.py +++ /dev/null @@ -1,86 +0,0 @@ -'''MobileNetV2 in PyTorch. - -See the paper "Inverted Residuals and Linear Bottlenecks: -Mobile Networks for Classification, Detection and Segmentation" for more details. 
-''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''expand + depthwise + pointwise''' - def __init__(self, in_planes, out_planes, expansion, stride): - super(Block, self).__init__() - self.stride = stride - - planes = expansion * in_planes - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes) - - self.shortcut = nn.Sequential() - if stride == 1 and in_planes != out_planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(out_planes), - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - out = out + self.shortcut(x) if self.stride==1 else out - return out - - -class MobileNetV2(nn.Module): - # (expansion, out_planes, num_blocks, stride) - cfg = [(1, 16, 1, 1), - (6, 24, 2, 1), # NOTE: change stride 2 -> 1 for CIFAR10 - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1)] - - def __init__(self, num_classes=10): - super(MobileNetV2, self).__init__() - # NOTE: change conv1 stride 2 -> 1 for CIFAR10 - self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(32) - self.layers = self._make_layers(in_planes=32) - self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(1280) - self.linear = nn.Linear(1280, num_classes) - - def _make_layers(self, in_planes): - layers = [] - for expansion, out_planes, num_blocks, stride in self.cfg: - strides = [stride] + 
[1]*(num_blocks-1) - for stride in strides: - layers.append(Block(in_planes, out_planes, expansion, stride)) - in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layers(out) - out = F.relu(self.bn2(self.conv2(out))) - # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10 - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def test(): - net = MobileNetV2() - x = torch.randn(2,3,32,32) - y = net(x) - print(y.size()) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/pnasnet.py b/hpvm/projects/pred_tuner/models/torch/pnasnet.py deleted file mode 100644 index de8c4d51f2667f84eab86f29be9a00ea7d0ad1c3..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/pnasnet.py +++ /dev/null @@ -1,125 +0,0 @@ -'''PNASNet in PyTorch. - -Paper: Progressive Neural Architecture Search -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class SepConv(nn.Module): - '''Separable Convolution.''' - def __init__(self, in_planes, out_planes, kernel_size, stride): - super(SepConv, self).__init__() - self.conv1 = nn.Conv2d(in_planes, out_planes, - kernel_size, stride, - padding=(kernel_size-1)//2, - bias=False, groups=in_planes) - self.bn1 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - return self.bn1(self.conv1(x)) - - -class CellA(nn.Module): - def __init__(self, in_planes, out_planes, stride=1): - super(CellA, self).__init__() - self.stride = stride - self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) - if stride==2: - self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - y1 = self.sep_conv1(x) - y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) - if self.stride==2: - y2 = self.bn1(self.conv1(y2)) - return F.relu(y1+y2) - -class 
CellB(nn.Module): - def __init__(self, in_planes, out_planes, stride=1): - super(CellB, self).__init__() - self.stride = stride - # Left branch - self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) - self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride) - # Right branch - self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride) - if stride==2: - self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(out_planes) - # Reduce channels - self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - # Left branch - y1 = self.sep_conv1(x) - y2 = self.sep_conv2(x) - # Right branch - y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) - if self.stride==2: - y3 = self.bn1(self.conv1(y3)) - y4 = self.sep_conv3(x) - # Concat & reduce channels - b1 = F.relu(y1+y2) - b2 = F.relu(y3+y4) - y = torch.cat([b1,b2], 1) - return F.relu(self.bn2(self.conv2(y))) - -class PNASNet(nn.Module): - def __init__(self, cell_type, num_cells, num_planes): - super(PNASNet, self).__init__() - self.in_planes = num_planes - self.cell_type = cell_type - - self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(num_planes) - - self.layer1 = self._make_layer(num_planes, num_cells=6) - self.layer2 = self._downsample(num_planes*2) - self.layer3 = self._make_layer(num_planes*2, num_cells=6) - self.layer4 = self._downsample(num_planes*4) - self.layer5 = self._make_layer(num_planes*4, num_cells=6) - - self.linear = nn.Linear(num_planes*4, 10) - - def _make_layer(self, planes, num_cells): - layers = [] - for _ in range(num_cells): - layers.append(self.cell_type(self.in_planes, planes, stride=1)) - self.in_planes = planes - return nn.Sequential(*layers) - - def _downsample(self, planes): - layer = 
self.cell_type(self.in_planes, planes, stride=2) - self.in_planes = planes - return layer - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = self.layer5(out) - out = F.avg_pool2d(out, 8) - out = self.linear(out.view(out.size(0), -1)) - return out - - -def PNASNetA(): - return PNASNet(CellA, num_cells=6, num_planes=44) - -def PNASNetB(): - return PNASNet(CellB, num_cells=6, num_planes=32) - - -def test(): - net = PNASNetB() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/preact_resnet.py b/hpvm/projects/pred_tuner/models/torch/preact_resnet.py deleted file mode 100644 index abb1bc313c011d2ee650c353c515e2cd404503f3..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/preact_resnet.py +++ /dev/null @@ -1,118 +0,0 @@ -'''Pre-activation ResNet in PyTorch. - -Reference: -[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun - Identity Mappings in Deep Residual Networks. 
arXiv:1603.05027 -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class PreActBlock(nn.Module): - '''Pre-activation version of the BasicBlock.''' - expansion = 1 - - def __init__(self, in_planes, planes, stride=1): - super(PreActBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - - if stride != 1 or in_planes != self.expansion*planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) - ) - - def forward(self, x): - out = F.relu(self.bn1(x)) - shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x - out = self.conv1(out) - out = self.conv2(F.relu(self.bn2(out))) - out += shortcut - return out - - -class PreActBottleneck(nn.Module): - '''Pre-activation version of the original Bottleneck module.''' - expansion = 4 - - def __init__(self, in_planes, planes, stride=1): - super(PreActBottleneck, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) - - if stride != 1 or in_planes != self.expansion*planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) - ) - - def forward(self, x): - out = F.relu(self.bn1(x)) - shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x - out = self.conv1(out) - out = self.conv2(F.relu(self.bn2(out))) - out = self.conv3(F.relu(self.bn3(out))) - out += shortcut - return out - - -class 
PreActResNet(nn.Module): - def __init__(self, block, num_blocks, num_classes=10): - super(PreActResNet, self).__init__() - self.in_planes = 64 - - self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear = nn.Linear(512*block.expansion, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(block(self.in_planes, planes, stride)) - self.in_planes = planes * block.expansion - return nn.Sequential(*layers) - - def forward(self, x): - out = self.conv1(x) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def PreActResNet18(): - return PreActResNet(PreActBlock, [2,2,2,2]) - -def PreActResNet34(): - return PreActResNet(PreActBlock, [3,4,6,3]) - -def PreActResNet50(): - return PreActResNet(PreActBottleneck, [3,4,6,3]) - -def PreActResNet101(): - return PreActResNet(PreActBottleneck, [3,4,23,3]) - -def PreActResNet152(): - return PreActResNet(PreActBottleneck, [3,8,36,3]) - - -def test(): - net = PreActResNet18() - y = net((torch.randn(1,3,32,32))) - print(y.size()) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/resnet.py b/hpvm/projects/pred_tuner/models/torch/resnet.py deleted file mode 100644 index d7c03ed134293e2a6a1dd373556e83978ef3d560..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/resnet.py +++ /dev/null @@ -1,122 +0,0 @@ -"""ResNet in PyTorch. - -For Pre-activation ResNet, see 'preact_resnet.py'. 
- -Reference: -[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun - Deep Residual Learning for Image Recognition. arXiv:1512.03385 -""" -import torch.nn as nn -import torch.nn.functional as F - -from models.hpvm import HPVMConvBundle - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, in_planes, planes, stride=1): - super(BasicBlock, self).__init__() - self.conv1 = HPVMConvBundle(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.relu1 = nn.ReLU() - self.conv2 = HPVMConvBundle(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion * planes: - self.shortcut = nn.Sequential( - HPVMConvBundle(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion * planes) - ) - self.relu2 = nn.ReLU() - - def forward(self, x): - out = self.relu1(self.bn1(self.conv1(x))) - out = self.bn2(self.conv2(out)) - out += self.shortcut(x) - out = self.relu2(out) - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, in_planes, planes, stride=1): - super(Bottleneck, self).__init__() - self.conv1 = HPVMConvBundle(in_planes, planes, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = HPVMConvBundle(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = HPVMConvBundle(planes, self.expansion * planes, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(self.expansion * planes) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion * planes: - self.shortcut = nn.Sequential( - HPVMConvBundle(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion * planes) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - 
out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - out += self.shortcut(x) - out = F.relu(out) - return out - - -class ResNet(nn.Module): - def __init__(self, block, num_blocks, num_classes=10): - super(ResNet, self).__init__() - self.in_planes = 64 - - self.conv1 = HPVMConvBundle(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.relu = nn.ReLU() - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.avg_pool2d = nn.AvgPool2d(4) - self.linear = nn.Linear(512 * block.expansion, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1] * (num_blocks - 1) - layers = [] - for stride in strides: - layers.append(block(self.in_planes, planes, stride)) - self.in_planes = planes * block.expansion - return nn.Sequential(*layers) - - def forward(self, x): - out = self.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = self.avg_pool2d(out) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def ResNet18(): - return ResNet(BasicBlock, [2, 2, 2, 2]) - - -def ResNet34(): - return ResNet(BasicBlock, [3, 4, 6, 3]) - - -def ResNet50(): - return ResNet(Bottleneck, [3, 4, 6, 3]) - - -def ResNet101(): - return ResNet(Bottleneck, [3, 4, 23, 3]) - - -def ResNet152(): - return ResNet(Bottleneck, [3, 8, 36, 3]) diff --git a/hpvm/projects/pred_tuner/models/torch/resnext.py b/hpvm/projects/pred_tuner/models/torch/resnext.py deleted file mode 100644 index 7a08f3e7d9fdf3b65aad5b773d4d113c6b796423..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/resnext.py +++ /dev/null @@ -1,95 +0,0 @@ -'''ResNeXt in PyTorch. 
- -See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''Grouped convolution block.''' - expansion = 2 - - def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1): - super(Block, self).__init__() - group_width = cardinality * bottleneck_width - self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(group_width) - self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False) - self.bn2 = nn.BatchNorm2d(group_width) - self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(self.expansion*group_width) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion*group_width: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion*group_width) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - out += self.shortcut(x) - out = F.relu(out) - return out - - -class ResNeXt(nn.Module): - def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10): - super(ResNeXt, self).__init__() - self.cardinality = cardinality - self.bottleneck_width = bottleneck_width - self.in_planes = 64 - - self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.layer1 = self._make_layer(num_blocks[0], 1) - self.layer2 = self._make_layer(num_blocks[1], 2) - self.layer3 = self._make_layer(num_blocks[2], 2) - # self.layer4 = self._make_layer(num_blocks[3], 2) - self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes) - - def _make_layer(self, num_blocks, stride): - strides = [stride] 
+ [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride)) - self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width - # Increase bottleneck_width by 2 after each stage. - self.bottleneck_width *= 2 - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - # out = self.layer4(out) - out = F.avg_pool2d(out, 8) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def ResNeXt29_2x64d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64) - -def ResNeXt29_4x64d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64) - -def ResNeXt29_8x64d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64) - -def ResNeXt29_32x4d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4) - -def test_resnext(): - net = ResNeXt29_2x64d() - x = torch.randn(1,3,32,32) - y = net(x) - print(y.size()) - -# test_resnext() diff --git a/hpvm/projects/pred_tuner/models/torch/senet.py b/hpvm/projects/pred_tuner/models/torch/senet.py deleted file mode 100644 index 98bfa0ca51dcd07b586432c9f9460be8d1f0b745..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/senet.py +++ /dev/null @@ -1,121 +0,0 @@ -'''SENet in PyTorch. - -SENet is the winner of ImageNet-2017. The paper is not released yet. 
-''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class BasicBlock(nn.Module): - def __init__(self, in_planes, planes, stride=1): - super(BasicBlock, self).__init__() - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes) - ) - - # SE layers - self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) # Use nn.Conv2d instead of nn.Linear - self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.bn2(self.conv2(out)) - - # Squeeze - w = F.avg_pool2d(out, out.size(2)) - w = F.relu(self.fc1(w)) - w = F.sigmoid(self.fc2(w)) - # Excitation - out = out * w # New broadcasting feature from v0.2! 
- - out += self.shortcut(x) - out = F.relu(out) - return out - - -class PreActBlock(nn.Module): - def __init__(self, in_planes, planes, stride=1): - super(PreActBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - - if stride != 1 or in_planes != planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False) - ) - - # SE layers - self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) - self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) - - def forward(self, x): - out = F.relu(self.bn1(x)) - shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x - out = self.conv1(out) - out = self.conv2(F.relu(self.bn2(out))) - - # Squeeze - w = F.avg_pool2d(out, out.size(2)) - w = F.relu(self.fc1(w)) - w = F.sigmoid(self.fc2(w)) - # Excitation - out = out * w - - out += shortcut - return out - - -class SENet(nn.Module): - def __init__(self, block, num_blocks, num_classes=10): - super(SENet, self).__init__() - self.in_planes = 64 - - self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear = nn.Linear(512, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(block(self.in_planes, planes, stride)) - self.in_planes = planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = 
self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def SENet18(): - return SENet(PreActBlock, [2,2,2,2]) - - -def test(): - net = SENet18() - y = net(torch.randn(1,3,32,32)) - print(y.size()) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/shufflenet.py b/hpvm/projects/pred_tuner/models/torch/shufflenet.py deleted file mode 100644 index acff6f78266c55bb93f5b12a6306a5647ebb0769..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/shufflenet.py +++ /dev/null @@ -1,109 +0,0 @@ -'''ShuffleNet in PyTorch. - -See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class ShuffleBlock(nn.Module): - def __init__(self, groups): - super(ShuffleBlock, self).__init__() - self.groups = groups - - def forward(self, x): - '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' - N,C,H,W = x.size() - g = self.groups - return x.view(N,g,C//g,H,W).permute(0,2,1,3,4).reshape(N,C,H,W) - - -class Bottleneck(nn.Module): - def __init__(self, in_planes, out_planes, stride, groups): - super(Bottleneck, self).__init__() - self.stride = stride - - mid_planes = out_planes/4 - g = 1 if in_planes==24 else groups - self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False) - self.bn1 = nn.BatchNorm2d(mid_planes) - self.shuffle1 = ShuffleBlock(groups=g) - self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False) - self.bn2 = nn.BatchNorm2d(mid_planes) - self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes) - - self.shortcut = nn.Sequential() - if stride == 2: - self.shortcut = 
nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1)) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.shuffle1(out) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - res = self.shortcut(x) - out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res) - return out - - -class ShuffleNet(nn.Module): - def __init__(self, cfg): - super(ShuffleNet, self).__init__() - out_planes = cfg['out_planes'] - num_blocks = cfg['num_blocks'] - groups = cfg['groups'] - - self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(24) - self.in_planes = 24 - self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups) - self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups) - self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups) - self.linear = nn.Linear(out_planes[2], 10) - - def _make_layer(self, out_planes, num_blocks, groups): - layers = [] - for i in range(num_blocks): - stride = 2 if i == 0 else 1 - cat_planes = self.in_planes if i == 0 else 0 - layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups)) - self.in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def ShuffleNetG2(): - cfg = { - 'out_planes': [200,400,800], - 'num_blocks': [4,8,4], - 'groups': 2 - } - return ShuffleNet(cfg) - -def ShuffleNetG3(): - cfg = { - 'out_planes': [240,480,960], - 'num_blocks': [4,8,4], - 'groups': 3 - } - return ShuffleNet(cfg) - - -def test(): - net = ShuffleNetG2() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/shufflenetv2.py b/hpvm/projects/pred_tuner/models/torch/shufflenetv2.py deleted file mode 100644 
index eefcda32059f0b8575148098c78ff5d84effd388..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/shufflenetv2.py +++ /dev/null @@ -1,162 +0,0 @@ -'''ShuffleNetV2 in PyTorch. - -See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class ShuffleBlock(nn.Module): - def __init__(self, groups=2): - super(ShuffleBlock, self).__init__() - self.groups = groups - - def forward(self, x): - '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' - N, C, H, W = x.size() - g = self.groups - return x.view(N, g, C//g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W) - - -class SplitBlock(nn.Module): - def __init__(self, ratio): - super(SplitBlock, self).__init__() - self.ratio = ratio - - def forward(self, x): - c = int(x.size(1) * self.ratio) - return x[:, :c, :, :], x[:, c:, :, :] - - -class BasicBlock(nn.Module): - def __init__(self, in_channels, split_ratio=0.5): - super(BasicBlock, self).__init__() - self.split = SplitBlock(split_ratio) - in_channels = int(in_channels * split_ratio) - self.conv1 = nn.Conv2d(in_channels, in_channels, - kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(in_channels) - self.conv2 = nn.Conv2d(in_channels, in_channels, - kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False) - self.bn2 = nn.BatchNorm2d(in_channels) - self.conv3 = nn.Conv2d(in_channels, in_channels, - kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(in_channels) - self.shuffle = ShuffleBlock() - - def forward(self, x): - x1, x2 = self.split(x) - out = F.relu(self.bn1(self.conv1(x2))) - out = self.bn2(self.conv2(out)) - out = F.relu(self.bn3(self.conv3(out))) - out = torch.cat([x1, out], 1) - out = self.shuffle(out) - return out - - -class DownBlock(nn.Module): - def __init__(self, in_channels, out_channels): - super(DownBlock, self).__init__() - mid_channels = 
out_channels // 2 - # left - self.conv1 = nn.Conv2d(in_channels, in_channels, - kernel_size=3, stride=2, padding=1, groups=in_channels, bias=False) - self.bn1 = nn.BatchNorm2d(in_channels) - self.conv2 = nn.Conv2d(in_channels, mid_channels, - kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(mid_channels) - # right - self.conv3 = nn.Conv2d(in_channels, mid_channels, - kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(mid_channels) - self.conv4 = nn.Conv2d(mid_channels, mid_channels, - kernel_size=3, stride=2, padding=1, groups=mid_channels, bias=False) - self.bn4 = nn.BatchNorm2d(mid_channels) - self.conv5 = nn.Conv2d(mid_channels, mid_channels, - kernel_size=1, bias=False) - self.bn5 = nn.BatchNorm2d(mid_channels) - - self.shuffle = ShuffleBlock() - - def forward(self, x): - # left - out1 = self.bn1(self.conv1(x)) - out1 = F.relu(self.bn2(self.conv2(out1))) - # right - out2 = F.relu(self.bn3(self.conv3(x))) - out2 = self.bn4(self.conv4(out2)) - out2 = F.relu(self.bn5(self.conv5(out2))) - # concat - out = torch.cat([out1, out2], 1) - out = self.shuffle(out) - return out - - -class ShuffleNetV2(nn.Module): - def __init__(self, net_size): - super(ShuffleNetV2, self).__init__() - out_channels = configs[net_size]['out_channels'] - num_blocks = configs[net_size]['num_blocks'] - - self.conv1 = nn.Conv2d(3, 24, kernel_size=3, - stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(24) - self.in_channels = 24 - self.layer1 = self._make_layer(out_channels[0], num_blocks[0]) - self.layer2 = self._make_layer(out_channels[1], num_blocks[1]) - self.layer3 = self._make_layer(out_channels[2], num_blocks[2]) - self.conv2 = nn.Conv2d(out_channels[2], out_channels[3], - kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(out_channels[3]) - self.linear = nn.Linear(out_channels[3], 10) - - def _make_layer(self, out_channels, num_blocks): - layers = [DownBlock(self.in_channels, out_channels)] - for i in range(num_blocks): - 
layers.append(BasicBlock(out_channels)) - self.in_channels = out_channels - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - # out = F.max_pool2d(out, 3, stride=2, padding=1) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = F.relu(self.bn2(self.conv2(out))) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -configs = { - 0.5: { - 'out_channels': (48, 96, 192, 1024), - 'num_blocks': (3, 7, 3) - }, - - 1: { - 'out_channels': (116, 232, 464, 1024), - 'num_blocks': (3, 7, 3) - }, - 1.5: { - 'out_channels': (176, 352, 704, 1024), - 'num_blocks': (3, 7, 3) - }, - 2: { - 'out_channels': (224, 488, 976, 2048), - 'num_blocks': (3, 7, 3) - } -} - - -def test(): - net = ShuffleNetV2(net_size=0.5) - x = torch.randn(3, 3, 32, 32) - y = net(x) - print(y.shape) - - -# test() diff --git a/hpvm/projects/pred_tuner/models/torch/vgg.py b/hpvm/projects/pred_tuner/models/torch/vgg.py deleted file mode 100644 index 2650d2f4859bedcef0de53a60c58c36b706148af..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/models/torch/vgg.py +++ /dev/null @@ -1,39 +0,0 @@ -"""VGG11/13/16/19 in Pytorch.""" -import torch.nn as nn -from models.hpvm import HPVMConvBundle - - -cfg = { - 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], -} - - -class VGG(nn.Module): - def __init__(self, vgg_name): - super(VGG, self).__init__() - self.features = self._make_layers(cfg[vgg_name]) - self.classifier = nn.Linear(512, 10) - - def forward(self, x): - out = self.features(x) - out = out.view(out.size(0), -1) - out = self.classifier(out) - 
return out - - @staticmethod - def _make_layers(config): - layers = [] - in_channels = 3 - for x in config: - if x == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - layers += [HPVMConvBundle(in_channels, x, kernel_size=3, padding=1), - nn.BatchNorm2d(x), - nn.ReLU(inplace=True)] - in_channels = x - layers += [nn.AvgPool2d(kernel_size=1, stride=1)] - return nn.Sequential(*layers) diff --git a/hpvm/projects/pred_tuner/run_tuner.py b/hpvm/projects/pred_tuner/run_tuner.py deleted file mode 100644 index 5470763ae01b73b51702c413bd18254f4c5b0d2f..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/run_tuner.py +++ /dev/null @@ -1,305 +0,0 @@ -#!/usr/bin/env python -# -# Development-time Tuner with Algorithmic Approximations: -# Approximations: Perforation, Sampling with varying knobs for rate, skip offset -import copy -import logging -import os -import shutil -import time -from pathlib import Path -from typing import List, Tuple - -import numpy as np -import opentuner -from opentuner import ConfigurationManipulator, EnumParameter, MeasurementInterface -from opentuner.measurement.inputmanager import FixedInputManager -from opentuner.search.objective import ThresholdAccuracyMinimizeTime -from opentuner.tuningrunmain import TuningRunMain -from torch.nn import Module -from tqdm import tqdm - -from exp import Benchmark, ConfigMeasurer, ExpState, TuningTime, batch_id, bench_tuner_data, is_dev_time -from models import get_all_output, networks, QoS -from toolkit import ConfigT -from toolkit.estimators import WeightedLinearQoSEstimator -from utils import Config, config, reapply_last_config - -msg_logger = logging.getLogger(__name__) -use_proxy = False -n_promise_valid_runs = 30 -confidence_level = 0.95 - - -def init_proxy(ni: ConfigMeasurer, pickle_path: Path): - def acc_crit(inputs_): - return ni.get_qos(inputs_, ni.val_loader) - - def threshold_eval(inputs_): - accs = np.array([acc_crit(x) for x in inputs_]) - return ni.val_qos - 
accs.mean() < 3.0 - - def run_model(net: Module): - return get_all_output(net, ni.val_loader) - - return WeightedLinearQoSEstimator( - ni.nas, run_model, acc_crit, threshold_eval, confidence_level, storage=pickle_path - ) - - -class Timer: - def __init__(self, timer_state: TuningTime, timer_name: str): - self.timer_state = timer_state - self.name = timer_name - self.start = None - - def __enter__(self): - self.start = time.time() - return self - - def __exit__(self, *args): - end = time.time() - interval = end - self.start - self.timer_state.add_timer(self.name, interval) - - -class TunerDriver: - def __init__(self, bench: Benchmark): - self.bench = bench - msg_logger.info(f"Tuning for model {self.bench.model_name}") - # Initialize folder. - self._init_folder(bench) - # Take a snapshot of current code. - self.take_code_snapshot() - # Initialize network information and qos thresholds - self.net_info = ConfigMeasurer.init_from_bench(self.bench) - qoses = self.net_info.val_qos, self.net_info.test_qos - qos_type = self.net_info.val_qos.__class__ - self.tuner_thres = qos_type.suggested_tuner_thresholds(self.net_info.val_qos) - self.val_thres = qos_type.suggested_val_threshold(self.net_info.val_qos) - self.test_thres = qos_type.suggested_test_threshold(self.net_info.test_qos) - # Tuner states. - self.states = ExpState(bench, qos_type, qoses) - # Current # of iteration. `ProxyTuner` will use this. - self.run_id, self.iter = 0, 0 - # Initialize proxy. 
- if use_proxy: - self.proxy = init_proxy(self.net_info, self.bench.result_dir / 'proxy.pkl') - else: - self.proxy = None - - @staticmethod - def _init_folder(bench: Benchmark): - def remove_file_or_folder(path: Path): - if path.is_dir(): - shutil.rmtree(child) - elif path.is_file(): - path.unlink() # Removes file despite the surprising name - - pickle_path = bench.result_dir / 'proxy.pkl' - # Remove everything in result folder except pickle file - if bench.result_dir.is_dir(): - msg_logger.warning(f"!Cleaning existing result dir = {bench.result_dir}") - for child in bench.result_dir.glob('*'): - if child == pickle_path: - continue - msg_logger.info(f" !Removing {child}") - remove_file_or_folder(child) - # Create result folder if it doesn't exist - if not bench.result_dir.is_dir(): - msg_logger.info(f"Creating output directory = {bench.result_dir}") - os.makedirs(bench.result_dir) - - def get_default_args(self): - args = opentuner.default_argparser().parse_args() - args.database = f"opentuner.db/{batch_id}.db" - args.test_limit = self.bench.autotuner_runs - parent = Path(args.database).parent - if not parent.is_dir(): - os.makedirs(parent, exist_ok=True) - return args - - def tuner_exec(self): - # Get default opentuner args - args = self.get_default_args() - # Start tuning for each threshold - for i, thres in enumerate(self.tuner_thres): - with Timer(self.states.timers, f"tuning_{i}"): - msg_logger.info( - f"Tuning goal: qos >= {thres}; keeping configs with qos >= {self.val_thres}" - ) - tuner = ProxyTuner(args, self, thres, self.val_thres) - # TuningRunMain.__init__ initializes its own logger, so we'll reapply our settings. - tuning_main = TuningRunMain(tuner, args) - reapply_last_config() - # Unleash the tuner! 
- tuning_main.main() - # Remove tuner progress bar - tuner.pbar.close() - self.run_id += 1 - self.iter = 0 - # Postprocess configs - self.process_configs() - - def calibrate_write_configs(self, configs: List[Config], is_test_set: bool): - write_to = self.states.tested_configs if is_test_set else self.states.validated_configs - gold_acc = self.net_info.test_qos if is_test_set else self.net_info.val_qos - for cfg in tqdm(configs, leave=False): - cfg = copy.deepcopy(cfg) - cfg: Config - flags = {k: v for k, v in enumerate(cfg.flags)} - measured_acc, confidence = self.net_info.actual_measure( - flags, cfg.total_runs, is_test_set, threshold=self.val_thres - ) - prev_acc = cfg.avg_qos - cfg.update_acc(measured_acc, confidence, gold_acc) - new_acc = cfg.avg_qos - msg_logger.debug(f"{prev_acc} (mean) -> {new_acc} (mean)") - write_to.append(cfg) - write_to.finalize_dump() - - @staticmethod - def filter_configs( - validation: List[Config], test: List[Config], - vali_threshold: QoS, test_threshold: QoS - ) -> Tuple[List[Config], List[Config]]: - # Filter validation and test set by their respective thresholds - filtered_validation = [ - c for c in validation if c.avg_loss <= vali_threshold - ] - filtered_test = [ - c for c in test if c.avg_loss <= test_threshold - ] - # Test configs also need to be a subset of validation configs. - name_to_filtered = {x.fname: x for x in filtered_test} - intersect_names = set(list(name_to_filtered.keys())).intersection( - set((x.fname for x in filtered_validation)) - ) - filtered_test_ = [name_to_filtered[fname] for fname in intersect_names] - return filtered_validation, filtered_test_ - - def process_configs(self): - # Finalize all configs because tuning is done. 
- # (this may not do anything now but will in the future) - self.states.all_configs.finalize_dump() - all_configs = self.states.all_configs.configs - # Pre-filter configs by a wide pareto margin - filtered_configs = config.is_pareto_efficient(all_configs, ratio=0.05, n_min=50, n_max=50) - msg_logger.info(f"Prefilter yields {len(filtered_configs)} configs from {len(all_configs)}") - self.states.filtered_configs.finalize_dump(with_configs=filtered_configs) - # Calibrate prefiltered configs (validation step) - with Timer(self.states.timers, "validate"): - self.calibrate_write_configs(filtered_configs, is_test_set=False) - validated_configs = self.states.validated_configs.configs - # Calibrate prefiltered configs on test set (test step) - with Timer(self.states.timers, "test"): - self.calibrate_write_configs(filtered_configs, is_test_set=True) - tested_configs = self.states.tested_configs.configs - # Filter valid and test set configs by thresholds - valid_configs, test_configs = self.filter_configs( - validated_configs, tested_configs, self.val_thres, self.test_thres - ) - self.states.valid_configs.finalize_dump(valid_configs) - self.states.test_configs.finalize_dump(test_configs) - # Finalize data input and plot everything. - self.states.finalize_plot() - - def take_code_snapshot(self): - import git - msg_logger.info(f"Taking git snapshot") - ref_dir = self.bench.result_dir / "references" - os.mkdir(ref_dir) - # Write current git commit (SHA id) - repo = git.Repo(search_parent_directories=True) - sha = repo.head.object.hexsha - msg_logger.info(f"Current code is at commit {sha}") - with (ref_dir / 'git_commit.txt').open('w') as f: - f.write(sha) - # Also put all outstanding code change in a diff file. - # This way changes in all git-tracked files are captured. 
- t = repo.head.commit.tree - with (ref_dir / 'diff.txt').open('w') as f: - f.write(repo.git.diff(t)) - - def make_config_name(self) -> str: - return f"{self.bench.model_name}_{self.run_id}_{self.iter}" - - def get_accuracy(self, cfg: ConfigT) -> Tuple[QoS, QoS, int]: - has_promise_flags = set(cfg.values()).intersection(set(range(1, 7 + 1))) - config_validation_runs = n_promise_valid_runs if has_promise_flags else 1 - if use_proxy: - mean_acc, confidence_acc = self.net_info.proxy_estimate(cfg, self.proxy) - assert has_promise_flags or (mean_acc == confidence_acc) - else: - mean_acc, _ = self.net_info.actual_measure(cfg, 1, is_test_set=False) - confidence_acc = mean_acc - return mean_acc, confidence_acc, config_validation_runs - - -class ProxyTuner(MeasurementInterface): - def __init__(self, args, driver: TunerDriver, tuner_thres: QoS, accept_thres: QoS): - self.tuner_driver = driver - self.model_info = driver.net_info - self.bench = driver.bench - self.tuner_thres = tuner_thres - self.all_configs = driver.states.all_configs - self.pbar = tqdm(total=args.test_limit, leave=False) - objective = ThresholdAccuracyMinimizeTime(tuner_thres.to_scalar()) - input_manager = FixedInputManager(size=driver.bench.get_n_layers()) - super(ProxyTuner, self).__init__( - args, program_name=self.bench.model_name, - input_manager=input_manager, objective=objective - ) - self.accept_thres = accept_thres - - def manipulator(self) -> ConfigurationManipulator: - """Define the search space by creating a ConfigurationManipulator.""" - manipulator = ConfigurationManipulator() - for ext_layer_id, knobs in self.model_info.get_knobs().items(): - manipulator.add_parameter(EnumParameter(ext_layer_id, knobs)) - return manipulator - - def seed_configurations(self): - """Provide baseline config as seed if model uses seed.""" - return [self.bench.get_baseline_config(not is_dev_time)] if self.bench.use_seed else [] - - def run(self, desired_result, input_, limit): - """Run a given configuration then 
return performance and accuracy.""" - cfg: ConfigT = desired_result.configuration.data - # get_accuracy gives estimation of mean accuracy and 95% confident accuracy - mean_acc, confident_acc, n_runs = self.tuner_driver.get_accuracy(cfg) - # getConfigCost returns the cost associated with the selected configuration - total_comps, speedup = self.bench.compute_config_cost(cfg) - Result = opentuner.resultsdb.models.Result() - Result.time = total_comps - # Convert QoS to scalar, because opentuner does not support custom comparable datatype - Result.accuracy = confident_acc.to_scalar(relative_to=self.tuner_thres) - - # If accuracy is acceptable, write this config - if confident_acc > self.accept_thres: - config_name = self.tuner_driver.make_config_name() - cfg_values = [cfg[layer] for layer in sorted(cfg.keys())] - writing_config = Config( - mean_acc, self.model_info.val_qos, config_name, cfg_values, - n_runs, 95.0, total_comps, speedup - ) - self.all_configs.append(writing_config) - msg_logger.debug( - f"Config chosen with accuracy (mean) = {mean_acc}, (95%) = {confident_acc} " - f"and speedup = {speedup}" - ) - self.tuner_driver.iter += 1 - self.pbar.update() - return Result - - def save_final_config(self, configuration): - """Print final configuration.""" - msg_logger.info(f"Final configuration {configuration.data}") - msg_logger.info("Done with Autotuning run") - - -if __name__ == '__main__': - assert set(networks.keys()).issubset(set(bench_tuner_data.keys())) - for network in ('alexnet2_hpvm',): - bench_: Benchmark = bench_tuner_data[network] - TunerDriver(bench_).tuner_exec() diff --git a/hpvm/projects/pred_tuner/tests/data/1_1_output.json b/hpvm/projects/pred_tuner/tests/data/1_1_output.json deleted file mode 100644 index 3892ae9622a1af68e92b11408372e3d88278ed6a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/data/1_1_output.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "('0', '0', '1', '1', '2', '0')": { - "tensorConvolution": 
"36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvSampSim": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,", - "ConvApprox": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,", - "ConvApproxHalf2": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000," - }, - "('0', '0', '1', '1', '2', '1')": { - "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvSampSim": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,", - "ConvApprox": 
"40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,", - "ConvApproxHalf2": "40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000,40.000000," - }, - "('0', '0', '1', '1', '3', '0')": { - "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvApprox": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvApproxHalf2": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000," - }, - "('0', '0', '1', '1', '3', '1')": { - "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "FP16_Baseline": 
"36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvApprox": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvApproxHalf2": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000," - }, - "('0', '0', '1', '1', '4', '0')": { - "tensorConvolution": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvSampSim": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,", - "ConvApprox": "32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,32.000000,", - "ConvApproxHalf2": "31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375,31.984375," - }, - "('0', '0', '1', '1', '4', '1')": { - "tensorConvolution": 
"36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "FP16_Baseline": "36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,36.000000,", - "ConvSampSim": "37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,37.333332,", - "ConvApprox": "37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,37.333336,", - "ConvApproxHalf2": "37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500,37.312500," - }, - "('1', '1', '1', '1', '2', '0')": { - "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,", - "BaselineaselineonvSampSimonvApproxonvApproxHalf}, - "('1', '1', '1', '1', '2', '1')": { - "tensorConvolutionaseline": 
"0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,", - "FP16_BaselineonvSampSimonvApproxonvApproxHalf}, - "('1', '1', '1', '1', '3', '0')": { - "tensorConvolution": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,", - "BaselineaselineonvSampSimonvApproxonvApproxHalf}, - "('1', '1', '1', '1', '3', '1')": { - "tensorConvolutionaselineaselineonvSampSimonvApproxonvApproxHalf}, - "('1', '1', '1', '1', '4', '0')": { - "tensorConvolutionaselineaselineonvSampSimonvApproxonvApproxHalf2": 
"0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,31.984375,31.984375,0.000000,0.000000,0.000000,0.000000,0.000000," - }, - "('1', '1', '1', '1', '4', '1')": { - "tensorConvolutionaseline": "0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,36.000000,36.000000,0.000000,0.000000,0.000000,0.000000,0.000000,", - "FP16_BaselineonvSampSim": "0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,37.333332,37.333332,0.000000,0.000000,0.000000,0.000000,0.000000,", - "ConvApprox": 
"0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,37.333336,37.333336,0.000000,0.000000,0.000000,0.000000,0.000000,", - "ConvApproxHalf2": "0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,37.312500,37.312500,0.000000,0.000000,0.000000,0.000000,0.000000," - } -} diff --git a/hpvm/projects/pred_tuner/tests/data/3_3_output.json b/hpvm/projects/pred_tuner/tests/data/3_3_output.json deleted file mode 100644 index 2ccb23c01c7faff1e1c296f5d5bb667633327687..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/data/3_3_output.json +++ /dev/null @@ -1,146 +0,0 @@ -{ - "('0', '0', '1', '1', '2', '0')": { - "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,", - "Baseline": "41.000000,41.000000,41.000000,41.000000,", - "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,", - "ConvSampSim": "26.000000,26.000000,26.000000,26.000000,", - "ConvApprox": "26.000000,26.000000,26.000000,26.000000,", - "ConvApproxHalf2": "26.000000,26.000000,26.000000,26.000000," - }, - "('0', '0', '1', '1', '2', '1')": { 
- "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,", - "Baseline": "41.000000,41.000000,41.000000,41.000000,", - "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,", - "ConvSampSim": "56.000000,56.000000,56.000000,56.000000,", - "ConvApprox": "56.000000,56.000000,56.000000,56.000000,", - "ConvApproxHalf2": "56.000000,56.000000,56.000000,56.000000," - }, - "('0', '0', '1', '1', '3', '0')": { - "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,", - "Baseline": "41.000000,41.000000,41.000000,41.000000,", - "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,", - "ConvSampSim": "39.000000,39.000000,39.000000,39.000000,", - "ConvApprox": "39.000000,39.000000,39.000000,39.000000,", - "ConvApproxHalf2": "39.000000,39.000000,39.000000,39.000000," - }, - "('0', '0', '1', '1', '3', '1')": { - "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,", - "Baseline": "41.000000,41.000000,41.000000,41.000000,", - "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,", - "ConvSampSim": "42.000000,42.000000,42.000000,42.000000,", - "ConvApprox": "42.000000,42.000000,42.000000,42.000000,", - "ConvApproxHalf2": "42.000000,42.000000,42.000000,42.000000," - }, - "('0', '0', '1', '1', '4', '0')": { - "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,", - "Baseline": "41.000000,41.000000,41.000000,41.000000,", - "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,", - "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,", - "ConvApprox": "36.000000,36.000000,36.000000,36.000000,", - "ConvApproxHalf2": "35.968750,35.968750,35.968750,35.968750," - }, - "('0', '0', '1', '1', '4', '1')": { - "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,", - "Baseline": "41.000000,41.000000,41.000000,41.000000,", - "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,", - "ConvSampSim": "45.333336,45.333336,45.333336,45.333336,", - "ConvApprox": "45.333336,45.333336,45.333336,45.333336,", - "ConvApproxHalf2": 
"45.312500,45.312500,45.312500,45.312500," - }, - "('1', '1', '1', '1', '2', '0')": { - "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvSampSim": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,", - "ConvApprox": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,", - "ConvApproxHalf2": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000," - }, - "('1', '1', '1', '1', '2', '1')": { - "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvSampSim": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,", - "ConvApprox": 
"24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,", - "ConvApproxHalf2": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000," - }, - "('1', '1', '1', '1', '3', '0')": { - "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,", - "ConvApprox": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,", - "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000," - }, - "('1', '1', '1', '1', '3', '1')": { - "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "FP16_Baseline": 
"18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvApprox": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000," - }, - "('1', '1', '1', '1', '4', '0')": { - "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvSampSim": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,", - "ConvApprox": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,", - "ConvApproxHalf2": "16.000000,22.671875,22.671875,13.328125,25.328125,35.968750,35.968750,22.656250,25.328125,35.968750,35.968750,22.656250,18.671875,25.328125,25.328125,16.000000," - }, - "('1', '1', '1', '1', '4', '1')": { - "tensorConvolution": 
"18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,", - "ConvSampSim": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,", - "ConvApprox": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,", - "ConvApproxHalf2": "18.656250,29.343750,29.343750,20.000000,29.328125,45.312500,45.312500,29.343750,29.328125,45.312500,45.312500,29.343750,20.000000,29.328125,29.328125,18.656250," - }, - "('1', '1', '2', '2', '2', '0')": { - "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,", - "Baseline": "18.000000,27.000000,27.000000,41.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,", - "ConvSampSim": "12.000000,18.000000,18.000000,26.000000,", - "ConvApprox": "12.000000,18.000000,18.000000,26.000000,", - "ConvApproxHalf2": "12.000000,18.000000,18.000000,26.000000," - }, - "('1', '1', '2', '2', '2', '1')": { - "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,", - "Baseline": "18.000000,27.000000,27.000000,41.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,", - "ConvSampSim": "24.000000,36.000000,36.000000,56.000000,", - "ConvApprox": "24.000000,36.000000,36.000000,56.000000,", - "ConvApproxHalf2": "24.000000,36.000000,36.000000,56.000000," - }, - "('1', '1', '2', '2', '3', '0')": { - "tensorConvolution": 
"18.000000,27.000000,27.000000,41.000000,", - "Baseline": "18.000000,27.000000,27.000000,41.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,", - "ConvSampSim": "18.000000,27.000000,25.500000,39.000000,", - "ConvApprox": "18.000000,27.000000,25.500000,39.000000,", - "ConvApproxHalf2": "18.000000,27.000000,25.500000,39.000000," - }, - "('1', '1', '2', '2', '3', '1')": { - "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,", - "Baseline": "18.000000,27.000000,27.000000,41.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,", - "ConvSampSim": "18.000000,27.000000,28.500000,42.000000,", - "ConvApprox": "18.000000,27.000000,28.500000,42.000000,", - "ConvApproxHalf2": "18.000000,27.000000,28.500000,42.000000," - }, - "('1', '1', '2', '2', '4', '0')": { - "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,", - "Baseline": "18.000000,27.000000,27.000000,41.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,", - "ConvSampSim": "16.000000,22.666666,25.333334,36.000000,", - "ConvApprox": "16.000000,22.666666,25.333334,36.000000,", - "ConvApproxHalf2": "16.000000,22.671875,25.328125,35.968750," - }, - "('1', '1', '2', '2', '4', '1')": { - "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,", - "Baseline": "18.000000,27.000000,27.000000,41.000000,", - "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,", - "ConvSampSim": "18.666668,29.333332,29.333332,45.333336,", - "ConvApprox": "18.666668,29.333332,29.333332,45.333336,", - "ConvApproxHalf2": "18.656250,29.343750,29.328125,45.312500," - } -} \ No newline at end of file diff --git a/hpvm/projects/pred_tuner/tests/data/promise.json b/hpvm/projects/pred_tuner/tests/data/promise.json deleted file mode 100644 index 331ff8527a17a4ff26965e7252cc49a4c409375a..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/data/promise.json +++ /dev/null @@ -1,121 +0,0 @@ -{ - "1": [ - [ - -0.980938, - -1.976522, - 
-2.999873, - -4.095768, - -5.115182, - 0.0, - 5.075658, - 3.972848, - 2.912783, - 2.051733, - 1.004169, - 1.002379 - ], - 45.213196 - ], - "2": [ - [ - -1.017428, - -2.01491, - -2.951011, - -4.042611, - -4.954911, - 0.0, - 5.05412, - 3.951638, - 2.94989, - 1.99723, - 1.001167, - 0.98796 - ], - 12.535809 - ], - "3": [ - [ - -1.003108, - -2.006269, - -3.00263, - -3.97216, - -4.969401, - 0.0, - 5.012199, - 4.028375, - 2.950729, - 2.004691, - 1.004823, - 0.991805 - ], - 4.886813 - ], - "4": [ - [ - -1.006497, - -1.975768, - -3.031142, - -4.02248, - -5.061712, - 0.0, - 5.017349, - 3.992676, - 2.998843, - 2.002693, - 0.997514, - 1.00649 - ], - 3.129643 - ], - "5": [ - [ - -1.001629, - -1.976943, - -2.982565, - -3.964559, - -4.99636, - 0.0, - 4.992359, - 3.984341, - 2.990126, - 2.005831, - 1.000539, - 1.003548 - ], - 2.181237 - ], - "6": [ - [ - -1.003159, - -1.985892, - -3.005964, - -4.008651, - -4.992874, - 0.0, - 4.996098, - 4.012099, - 3.001986, - 2.001431, - 0.996138, - 0.997394 - ], - 1.362949 - ], - "7": [ - [ - -1.003133, - -1.99733, - -3.00755, - -4.007799, - -5.003314, - 0.0, - 5.000926, - 3.993208, - 2.988745, - 2.00329, - 0.99986, - 0.995669 - ], - 0.6926 - ] -} \ No newline at end of file diff --git a/hpvm/projects/pred_tuner/tests/data/quantization.json b/hpvm/projects/pred_tuner/tests/data/quantization.json deleted file mode 100644 index 723eaa2b55bc067689beae34829d27d478a0c727..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/data/quantization.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - "(-4, 6)": [ - -0.132812, - -4.0, - 0.179688, - -0.40625, - 1.664062, - -2.90625, - 0.6875, - 0.960938, - 6.0, - 6.0, - 2.484375, - 2.992188 - ], - "(-2, 2)": [ - -0.109375, - -2.0, - 0.1875, - -0.40625, - 1.6875, - -2.0, - 0.6875, - 0.984375, - 2.0, - 2.0, - 2.0, - 2.0 - ], - "(-25, 8)": [ - -0.121094, - -25.0, - 0.136719, - -0.507812, - 1.683594, - -2.957031, - 0.652344, - 0.910156, - 6.96875, - 7.097656, - 2.457031, - 2.972656 - ], - "(-10, 
10)": [ - -0.15625, - -10.0, - 0.15625, - -0.46875, - 1.640625, - -2.96875, - 0.625, - 0.9375, - 6.953125, - 7.1875, - 2.5, - 2.96875 - ] -} \ No newline at end of file diff --git a/hpvm/projects/pred_tuner/tests/promise.py b/hpvm/projects/pred_tuner/tests/promise.py deleted file mode 100644 index 59506d94251bfac4909b2236dc9480eb17b9ed70..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/promise.py +++ /dev/null @@ -1,87 +0,0 @@ -import json -from pathlib import Path - -import torch - -from toolkit import ModuleIndexer, NetApproxSelector -from toolkit.approxdnn import PromiseSim, quantize_256 -from utils import compute_accuracy, init_by_name, run_concat_output - -eps = 1e-5 -delta = 0.05 # Allow for some variance in promise testing - - -def gt_eps(tensor: torch.Tensor) -> bool: - return torch.any(tensor.abs() > eps).item() - - -def compare_quant(groundtruth: dict): - input_tensor = torch.tensor([-0.1, -25, 0.2, -0.4, 1.7, -2.9, 0.7, 0.99, 7, 7.2, 2.5, 3]) - for k, v in groundtruth.items(): - from ast import literal_eval as make_tuple - gt = torch.tensor(v) - ours = quantize_256(input_tensor, *make_tuple(k)) - if gt_eps(gt - ours): - print( - f"Quantization results differ by more than eps = {eps};\n" - f"parameters = {k}\ngroundtruth = {gt}\nours = {ours}" - ) - raise RuntimeError - - -def compare_promise(groundtruth: dict): - input_tensor = torch.tensor([-1, -2, -3, -4, -5, 0, 5, 4, 3, 2, 1, 1], dtype=torch.float) - N = 1000 - for k, (gt_avg, gt_error) in groundtruth.items(): - gt_avg = torch.tensor(gt_avg) - sum_, our_error = torch.zeros_like(input_tensor, dtype=torch.float), 0 - for _ in range(N): - out = PromiseSim.add_promise_noise(input_tensor, int(k)) - sum_ += out - our_error += torch.sum((out - input_tensor) ** 2).item() - our_avg = sum_ / N - our_error = our_error / N - print(gt_avg, our_avg) - if abs(our_error - gt_error) > delta * max(our_error, gt_error): - print( - f"Promise results differ by more than delta = {delta * 
100:.1f}%;\n" - f"swing = {k}, groundtruth error = {gt_error}\nours = {our_error}" - ) - raise RuntimeError - - -def is_in_range(mean1: float, std1: float, mean2: float) -> bool: - return mean1 - 3.0 * std1 < mean2 < mean1 + 3.0 * std1 - - -def compare_accuracy(): - baseline, testloader, _, shapes = init_by_name('lenet_hpvm') - baseline_dag = ModuleIndexer(baseline) - nas = NetApproxSelector(baseline_dag, dev_time_only=False) - # {0: 1} -> 98.4808 0.1195 - approx1 = nas.apply_approx_by_config({3: 1}) - acc1 = compute_accuracy(run_concat_output(approx1.module, testloader), testloader) - assert is_in_range(0.984808, 0.001195, acc1) - # {0: 2} -> 99.5933 0.0519 - approx2 = nas.apply_approx_by_config({3: 2}) - acc2 = compute_accuracy(run_concat_output(approx2.module, testloader), testloader) - assert is_in_range(0.995933, 0.000519, acc2) - # {0: 3} -> 99.6723 0.0347 - approx3 = nas.apply_approx_by_config({3: 3}) - acc3 = compute_accuracy(run_concat_output(approx3.module, testloader), testloader) - assert is_in_range(0.996723, 0.000347, acc3) - print("Accuracy test passed.") - - -def main(): - data_folder = Path(__file__).parent / 'data' - with open(data_folder / 'quantization.json') as f: - compare_quant(json.load(f)) - with open(data_folder / 'promise.json') as f: - compare_promise(json.load(f)) - compare_accuracy() - print("Tests passed.") - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/tests/resnet50.py b/hpvm/projects/pred_tuner/tests/resnet50.py deleted file mode 100644 index 71711fbfd099d47ba047471ddde3423b297d0f56..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/resnet50.py +++ /dev/null @@ -1,33 +0,0 @@ -from toolkit import ModuleIndexer, NetApproxSelector -from utils import compute_accuracy, init_by_name, run_concat_output - - -def float_eq(f1, f2): - return abs(f1 - f2) < 1e-5 - - -def main(): - baseline, testloader, _, shapes = init_by_name('resnet50_imagenet_hpvm') - baseline_dag = 
ModuleIndexer(baseline) - nas = NetApproxSelector(baseline_dag) - # baseline - baseline_output = run_concat_output(baseline_dag.module, testloader) - baseline_acc = compute_accuracy(baseline_output, testloader) - assert float_eq(baseline_acc, 0.773) - # {13: 242} -> 75.5 - approx1 = nas.apply_approx_by_config({82: 242}) - acc1 = compute_accuracy(run_concat_output(approx1.module, testloader), testloader) - assert float_eq(acc1, 0.755) - # {13: 242, 17: 247} -> 74.6 - approx2 = nas.apply_approx_by_config({82: 242, 108: 247}) - acc2 = compute_accuracy(run_concat_output(approx2.module, testloader), testloader) - assert float_eq(acc2, 0.746) - # {9: 237, 13: 242, 17: 247} -> 74.1 - approx3 = nas.apply_approx_by_config({55: 237, 82: 242, 108: 247}) - acc3 = compute_accuracy(run_concat_output(approx3.module, testloader), testloader) - assert float_eq(acc3, 0.741) - print("Accuracy test passed.") - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/tests/sampling.py b/hpvm/projects/pred_tuner/tests/sampling.py deleted file mode 100644 index 707506ef7b8312fda02ca646bd04d034c3eff6ea..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/tests/sampling.py +++ /dev/null @@ -1,90 +0,0 @@ -import json -from copy import deepcopy -from pathlib import Path -from typing import Tuple - -import torch - -from models.hpvm import HPVMConvBundle -from toolkit import Conv2dSampling, Conv2dSamplingFP16, FP16Approx - -eps = 1e-5, 0.05 - - -def sampling_3_3_consts() -> Tuple[torch.Tensor, torch.Tensor]: - input_tensor = torch.ones(1, 3, 4, 4) - # Filter has value [2, 1, 2, 1, 2, 1...] 
- filter_tensor = torch.ones(1, 3, 3, 3) - filter_tensor.view(-1)[::2] = 2 - return input_tensor, filter_tensor - - -def sampling_1_1_consts() -> Tuple[torch.Tensor, torch.Tensor]: - input_tensor = torch.ones(1, 9, 2, 2) * 2 - filter_tensor = torch.ones(4, 9, 1, 1) * 2 - return input_tensor, filter_tensor - - -def parse_tensor_str(string: str) -> torch.Tensor: - # String has an extra ',' at the end, so skipping an empty string after split - entries = [float(s) for s in string.split(',')[:-1]] - return torch.tensor(entries).cuda() - - -def compare_to_groundtruth(groundtruth: dict, const_func): - input_tensor, filter_tensor = const_func() - input_tensor = input_tensor.cuda() - o_ch, i_ch, h, w = filter_tensor.size() - assert h == w - for k, v in groundtruth.items(): - def compare(groundtruth_t: torch.Tensor, ours_t: torch.Tensor, is_fp16: bool): - diff = groundtruth_t - ours_t - eps_ = eps[1] if is_fp16 else eps[0] - is_diff = torch.any(diff.abs() > eps_).item() - if is_diff: - print( - f"Results differ by more than eps = {eps};\n" - f"parameters = {k}\n" - f"groundtruth = {groundtruth_t}\n" - f"ours = {ours_t}" - ) - raise RuntimeError - - from ast import literal_eval as make_tuple - pad_h, pad_w, stride_h, stride_w, skip_every, offset = [int(s) for s in make_tuple(k)] - conv_layer = HPVMConvBundle( - i_ch, o_ch, h, stride=(stride_h, stride_w), padding=(pad_h, pad_w) - ) - conv_layer.weight.data = filter_tensor - conv_layer.bias.data = torch.zeros_like(conv_layer.bias.data) - conv_layer = conv_layer.cuda() - our_baseline = conv_layer(input_tensor).flatten() - fp16 = FP16Approx(deepcopy(conv_layer)) - our_fp16 = fp16(input_tensor).flatten() - sampling = Conv2dSampling(skip_every, offset, 1.0, deepcopy(conv_layer)) - our_sampled = sampling(input_tensor).flatten() - sampling_fp16 = Conv2dSamplingFP16(skip_every, offset, 1.0, deepcopy(conv_layer)) - our_sampled_fp16 = sampling_fp16(input_tensor).float().flatten() - groundtruth_baseline = parse_tensor_str(v['Baseline']) 
- compare(groundtruth_baseline, our_baseline, False) - groundtruth_sampled1 = parse_tensor_str(v['ConvApprox']) - compare(groundtruth_sampled1, our_sampled, False) - groundtruth_sampled2 = parse_tensor_str(v['ConvSampSim']) - compare(groundtruth_sampled2, our_sampled, False) - groundtruth_baseline_fp16 = parse_tensor_str(v['FP16_Baseline']) - compare(groundtruth_baseline_fp16, our_fp16, True) - groundtruth_sampled_fp16 = parse_tensor_str(v['ConvApproxHalf2']) - compare(groundtruth_sampled_fp16, our_sampled_fp16, True) - - -def main(): - data_folder = Path(__file__).parent / 'data' - with open(data_folder / '1_1_output.json') as f: - compare_to_groundtruth(json.load(f), sampling_1_1_consts) - with open(data_folder / '3_3_output.json') as f: - compare_to_groundtruth(json.load(f), sampling_3_3_consts) - print("Tests passed.") - - -if __name__ == '__main__': - main() diff --git a/hpvm/projects/pred_tuner/toolkit/__init__.py b/hpvm/projects/pred_tuner/toolkit/__init__.py deleted file mode 100644 index 892b8c154269c99b7446c70182886b2ee92fc499..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/toolkit/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .approxdnn import Approximation, AvailableApproximations, Conv2dSampling, FP16Approx, \ - PerforateConv2dStride, PromiseSim -from .estimators import LinearCombEstimator, LinearEstimator, LinearQoSEstimator, WeightedLinearCombEstimator -from .transform import ConfigT, NetApproxSelector, StateCapturer diff --git a/hpvm/projects/pred_tuner/toolkit/approxdnn.py b/hpvm/projects/pred_tuner/toolkit/approxdnn.py deleted file mode 100644 index 06abca85d521326749902e0058b8a88e3571a611..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/toolkit/approxdnn.py +++ /dev/null @@ -1,442 +0,0 @@ -"""All approximation techniques for torch.nn layers.""" -import abc -from typing import Dict, Iterable, List, Optional, Type - -import torch -from torch.nn import Linear, Module - -from models.hpvm import 
HPVMConvBundle -from utils import get_tensorrt_dir - - -def interpolate_first_dim(tensor: torch.Tensor, interp_indices: Iterable[int]): - def tensor_at(idx_: int): - if idx_ in interp_indices: - raise IndexError - if idx_ < 0 or idx_ >= tensor.size()[0]: - return torch.zeros_like(tensor[0]) - return tensor[idx_] - - for idx in interp_indices: - if idx < 0 or idx >= tensor.size()[0]: - raise IndexError - elif idx == 0: # First row - tensor[idx] = tensor_at(1) - elif idx == tensor.size()[0] - 1: # Last row - tensor[idx] = tensor_at(idx - 1) - else: # Middle rows - tensor[idx] = (tensor_at(idx - 1) + tensor_at(idx + 1)) / 2.0 - return tensor - - -class Approximation(abc.ABC): - @property - @abc.abstractmethod - def deterministic(self) -> bool: - pass - - @property - @abc.abstractmethod - def devtime(self) -> bool: - pass - - @property - @abc.abstractmethod - def fp32(self) -> bool: - pass - - @abc.abstractmethod - def apply(self, module: Module) -> Module: - pass - - @abc.abstractmethod - def is_less_approx(self, other: 'Approximation') -> Optional[bool]: - pass - - def __repr__(self): - return f"{self.__class__}({self.__dict__})" - - -class PerforateConv2dStride(Approximation): - r"""Simulation of strided perforated convolution for `torch.nn.Conv2d`. - - Perforated convolution skips computing some entries in the output and instead interpolates - these values, to reduce the number of float-ops needed to complete a convolution op. - In this implementation, selected rows or columns of the output are discarded and replaced - with linearly interpolated values from the neighboring rows or columns. Each channel is - considered independently. - This implementation gives the same output as actual perforated convolution but without the - performance benefit. - - Parameters - ---------- - direction_is_row : bool - If True, discard and interpolate rows, otherwise columns. - stride : int \in [2, +\infty) - Skip 1 row/column in the convolution kernel per `stride` elements. 
- offset : int \in [0, stride) - Skipped first row/column is `offset`. - - Attributes - ---------- - interp_axis : int :math:`\in \{2, 3\}` - The axis that will be perforated over. As the input is an NCHW tensor, if - `direction_is_row` then `interp_axis = 2`, otherwise `interp_axis = 3`. - stride : int :math:`\in [2, +\infty)` - Equal to parameter `stride`. - offset : int :math:`\in [0, stride)` - Equal to parameter `offset`. - """ - - def __init__(self, direction_is_row: bool, stride: int, offset: int, use_fp16: bool): - assert stride >= 2 - assert 0 <= offset < stride - self.interp_axis = 2 if direction_is_row else 3 - self.stride = stride - self.offset = offset - self.fp16 = use_fp16 - - @property - def deterministic(self) -> bool: - return True - - @property - def devtime(self) -> bool: - return not self.fp16 - - @property - def fp32(self) -> bool: - return not self.fp16 - - def is_less_approx(self, other: Approximation) -> Optional[bool]: - return None - - class PerforateConv2dStrideModule(Module): - def __init__(self, conv: HPVMConvBundle, approx: 'PerforateConv2dStride'): - super().__init__() - self.conv = conv - self.approx = approx - if self.approx.fp16: - self.conv = self.conv.half() - - def forward(self, x: torch.Tensor): - if self.approx.fp16: - x = x.half() - x = self.conv.input_to_conv(x) - assert x.dim() == 4 - # Put self.approx.interp_axis to first axis temporarily - x = x.transpose(0, self.approx.interp_axis) - interp_indices = torch.tensor(range(self.approx.offset, x.size(0), self.approx.stride)) - x = interpolate_first_dim(x, interp_indices) - # Putting axes back - x = x.transpose(0, self.approx.interp_axis) - x = self.conv.conv_to_output(x) - if self.approx.fp16: - assert x.dtype == torch.float16 - return x.float() - - def apply(self, module: HPVMConvBundle) -> PerforateConv2dStrideModule: - return self.PerforateConv2dStrideModule(module, self) - - -class Conv2dSampling(Approximation): - r"""Simulation of sampled convolution for 
`torch.nn.Conv2d`. - - Skips some elements of the convolution kernel in a uniform, strided manner, - to reduce the amount of float-ops needed to compute each output entry. - This implementation gives the same output as actual sampled convolution but without the - performance benefit. - - Parameters - ---------- - skip_every: int - Skip 1 element in the convolution kernel per `skip_every` elements. - skip_offset : int :math:`\in [0, +\infty)` - Index of first element to be skipped. - For example, if `skip_every = 3` and `skip_offset = 1`, then indices skipped - will be [1, 4, 7, ...] - interp_rate : float - The weight will be compensated ("interpolated") with a ratio after skipping elements, - which is naturally equal to :math:`1 + (1 / (skip_every - 1)`. - `interp_rate` modifies this rate to :math:`1 + (1 / (skip_every - 1) \times interp_rate`. - use_fp16 : bool - Whether to use fp16 weight/input or not. - """ - - def __init__( - self, skip_every: int, skip_offset: int, interp_rate: float, use_fp16: bool - ): - assert skip_every >= 2 and skip_offset >= 0 - self.skip_every = skip_every - self.skip_offset = skip_offset - self.interp_rate = interp_rate - self.fp16 = use_fp16 - - @property - def deterministic(self) -> bool: - return True - - @property - def devtime(self) -> bool: - return not self.fp16 - - @property - def fp32(self) -> bool: - return not self.fp16 - - def is_less_approx(self, other: Approximation) -> Optional[bool]: - return None - - @staticmethod - def sample_conv_weight( - interp_rate: float, skip_every: int, skip_offset: int, weight: torch.Tensor - ): - r"""Samples (skips & interpolates) convolution kernel according to parameters. - - For a given `weight` tensor of shape `(C1, C2, H, W)`, sample each output channel - (on axis 0) independently. - Flatten each output channel tensor into 1 dim. - In normal cases, set elements at indices ``range(skip_offset, C_2 * H * W, skip_every)`` - to 0. 
- However, if `skip_every` == `h` == `w` == 3, we may end up skipping the same whole rows for - each input channel, which is undesirable. - Instead, increment the offset by 1 for each input channel. - Last, multiplies the kernel by the inverse ratio of elements dropped for an interpolation. - """ - if len(weight.shape) != 4: - raise ValueError("Conv2d weight should be 4-dimensional") - c1, c2, h, w = weight.shape - if skip_every == h == w == 3: - # Indices (0..h*w) to skip for each input channel - per_chan_skip_indices = [ - range((i_chan + skip_offset) % skip_every, h * w, skip_every) - for i_chan in range(c2) - ] - # Indices (0..c2*h*w) for each output channel, created by adding i*h*w for ith channel. - skip_indices = torch.tensor([ - x + i * h * w for i, per_chan in enumerate(per_chan_skip_indices) - for x in per_chan - ]) - else: - # Indices (0..c2*h*w) to skip for each output channel - skip_indices = torch.arange(skip_offset, c2 * h * w, skip_every) - flat_weight = weight.reshape(c1, -1) - flat_weight[:, skip_indices] = 0 - interp_rate = 1 + (1 / (skip_every - 1) * interp_rate) - flat_weight *= interp_rate - return flat_weight.reshape_as(weight) - - def apply(self, module: HPVMConvBundle) -> HPVMConvBundle: - # Not copying weight tensor leads to memory leak - cloned_conv_w = module.weight.clone().detach() - module.weight.data = self.sample_conv_weight( - self.interp_rate, self.skip_every, self.skip_offset, cloned_conv_w - ) - return module - - -def quantize_256(tensor: torch.Tensor, range_min: float, range_max: float) -> torch.Tensor: - """Quantize a tensor so that only 256 unique float value exists.""" - quantize_range = 256 - input_range = range_max - range_min - mul = input_range / quantize_range - # Map tensor into [0, 256] range. - affined = (tensor - range_min) / mul - # Convert tensor to int and back to float so it will have - # 256 (actually 257!; following hpvm impl) unique float values [0, 256]. - # Then reverse affine it to the original range. 
- quanted = torch.floor(affined).to(torch.int).to(torch.float) - quanted_float = quanted * mul + range_min - # Clip tensor - return torch.clamp(quanted_float, range_min, range_max) - - -class PromiseSim(Approximation): - scaling_values = [0.75, 0.64, 0.336, 0.21, 0.168, 0.14, 0.11, 0.0784, 0.005] - - def __init__(self, noise_level: int): - super().__init__() - self.noise_level = noise_level - - @property - def deterministic(self) -> bool: - return False - - @property - def devtime(self) -> bool: - return False - - @property - def fp32(self) -> bool: - return False - - def is_less_approx(self, other: Approximation) -> Optional[bool]: - if isinstance(other, PromiseSim): - return self.noise_level > other.noise_level - return None - - def add_promise_noise(self, tensor: torch.Tensor): - scale = self.scaling_values[self.noise_level] - noise = torch.normal( - mean=0.0, std=scale, size=tensor.size(), device=tensor.device - ) - return noise * tensor + tensor - - class PromiseSimModule(Module): - def __init__(self, module: HPVMConvBundle, approx: 'PromiseSim'): - super().__init__() - self.input_r, weight_r, bias_r, self.output_r = module.conv_ranges - module.weight.data = quantize_256(module.weight, *weight_r) - if module.bias is not None: - module.bias.data = quantize_256(module.bias, *bias_r) - self.module = module - self.approx = approx - - def forward(self, input_: torch.Tensor) -> torch.Tensor: - # Quantize input, weight, bias (see __init__), and add noise to input. - input_ = quantize_256(input_, *self.input_r) - input_ = self.approx.add_promise_noise(input_) - output = self.module(input_) - # Then again, quantize output. 
- return quantize_256(output, *self.output_r) - - def apply(self, module: HPVMConvBundle) -> PromiseSimModule: - return self.PromiseSimModule(module, self) - - -class FP16Approx(Approximation): - def __init__(self): - super().__init__() - - @property - def deterministic(self) -> bool: - return True - - @property - def devtime(self) -> bool: - return False - - @property - def fp32(self) -> bool: - return False - - def is_less_approx(self, other: Approximation) -> Optional[bool]: - return None - - class FP16ApproxModule(Module): - def __init__(self, module: Module): - super().__init__() - self.module = module.half() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x: torch.Tensor = self.module(x.half()) - assert x.dtype == torch.float16 - return x.float() - - def apply(self, module: Module) -> FP16ApproxModule: - return self.FP16ApproxModule(module) - - -AllApproxesT = Dict[int, Approximation] -TypeApproxesT = Dict[Type[Module], List[int]] - - -class AvailableApproximations: - r"""Holds a list of all available "approximation info": approximation + properties. - - For properties see `Approximation`. - - Parameters - ---------- - all_knobs: Dict[int, Approximation] - A dict from int index to (approximation, is_dev_time) pair. - Also see class function `from_global_knobs_file`. - - Attributes - ---------- - all_knobs : Dict[int, Approximation] - A mapping from approximation index to approximation info pair `(approximation, is_dev_time)`. - type_to_knobs : Dict[Type[Module], List[int]] - A mapping from network layer type (subtype of `torch.nn.Module`) to a list of indexes of - applicable approximations. Values of `type_to_knobs` are always valid keys in `all_knobs`. 
- """ - - def __init__(self, all_knobs: Dict[int, Approximation], type_to_knobs: TypeApproxesT): - self.all_knobs = all_knobs - self.type_to_knobs = type_to_knobs - - @classmethod - def from_global_knobs_file(cls) -> 'AvailableApproximations': - """Read and parse global_knobs.txt to provide all knobs supported and their indexes. - - Returns two things: - * Dict of indexes to (approximations, is_dev_time). Approximation is in the form of functions - with a layer input; see `ModuleReplacerT`. - * Dict of type of torch.nn.Module to a list of approximation indexes that can be applied to this - type of layer. - """ - with (get_tensorrt_dir() / 'autotuner/data/global_knobs.txt').open() as f: - lines = f.readlines() - all_knobs = {} - promise_and_fp16 = [] - for line in lines: - desc, knobs, _, _, _, _, _ = line.rstrip().split() - category, index = desc.split(',') - index = int(index) - if category in ('perf', 'perf_fp16'): - row, col, offset = [int(s) for s in knobs.split(',')] - if row > 1 and col > 1: - raise ValueError("Perforation on both row and column is not supported") - if col == 1: - direction_is_row, stride = True, row - else: - direction_is_row, stride = False, col - all_knobs[index] = PerforateConv2dStride( - direction_is_row, stride, offset, 'fp16' in category - ) - elif category in ('samp', 'samp_fp16'): - stride, offset, interp_rate = knobs.split(',') - stride, offset, interp_rate = int(stride), int(offset), float(interp_rate) - all_knobs[index] = Conv2dSampling( - stride, offset, interp_rate, 'fp16' in category - ) - elif category == 'swing_level': - all_knobs[index] = PromiseSim(index) - promise_and_fp16.append(index) - elif category == 'fp16': - all_knobs[index] = FP16Approx() - promise_and_fp16.append(index) - type_to_knobs = { - HPVMConvBundle: list(all_knobs.keys()), - Linear: promise_and_fp16 - } - return cls(all_knobs, type_to_knobs) - - def items(self, dev_time: bool, ignore_fp32: bool) -> Dict[Type[Module], List[int]]: - """Give a list of 
applicable approximations for each layer type. - - If dev_time is True, returns only devtime approximations, otherwise all approximations. - """ - - def remove_non_dev(type_to_knobs: TypeApproxesT) -> TypeApproxesT: - return { - k: [v for v in vs if self.all_knobs[v].devtime] - for k, vs in type_to_knobs.items() - } - - def remove_fp32(type_to_knobs: TypeApproxesT) -> TypeApproxesT: - return { - k: [v for v in vs if not self.all_knobs[v].fp32] - for k, vs in type_to_knobs.items() - } - - type_to_knobs_ = self.type_to_knobs - if dev_time: - type_to_knobs_ = remove_non_dev(type_to_knobs_) - if ignore_fp32: - type_to_knobs_ = remove_fp32(type_to_knobs_) - return type_to_knobs_ - - def __getitem__(self, item: int) -> Approximation: - """Returns the approximation info for given approximation index.""" - return self.all_knobs[item] diff --git a/hpvm/projects/pred_tuner/toolkit/estimators.py b/hpvm/projects/pred_tuner/toolkit/estimators.py deleted file mode 100644 index acd35331693c706df336a6e3a33d1c6098a6cb50..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/toolkit/estimators.py +++ /dev/null @@ -1,383 +0,0 @@ -import abc -import gc -import logging -import pickle -from math import sqrt -from pathlib import Path -from typing import Callable, Dict, Iterable, Iterator, List, Optional, Tuple, TypeVar - -import numpy as np -import torch -from torch.nn import Module -from tqdm import tqdm, trange - -from models.domains import QoS, qos_stats -from .transform import ConfigT, NetApproxSelector - -ProfT = TypeVar('ProfT') -NetOutputT = TypeVar('NetOutputT') -QoST = Callable[[NetOutputT], QoS] -ThresholdEvalT = Callable[[NetOutputT], bool] -ExeT = Callable[[Module], NetOutputT] -KeyT = Tuple[int, int] -KVT = Tuple[KeyT, NetOutputT] -EstmT = Tuple[QoS, QoS] - -msg_logger = logging.getLogger(__name__) - - -class LinearEstimator(abc.ABC): - """Estimate QoS of a config by linearly adding "something" from each approximation of config, and - then applying QoS 
metric. - - That "something" could be QoS itself (see `LinearQoSEstimator`), or the direct tensor output from - the model (see `LinearTensorEstimator`). - In initialization phase, run the model for each 1-approximation config and store the quantity to - be linearly summed in a table. - - Parameters - ---------- - nas: NetApproxSelector - `NetApproxSelector` instance is used to select all 1-approximation configs and evaluate them. - qos: Callable[[torch.Tensor], float] - Quality of Service measure (such as accuracy). Takes model output tensor and returns QoS value. - independent_init: bool - If False, don't initialize self.profile_table, and wait for `coinit_estimators` to fill in - the profile. `coinit_estimators` must be manually called if `init_profile` is False. - - Attributes - ---------- - qos : Callable[[torch.Tensor], float] - Same as parameter `qos`. - baseline_profile : T - Profile value of the baseline model. - profile_table : Dict[KeyT, T] - A mapping from (`layer_idx`, `approx_idx`) to the profile value, with only this approximation - applied (in other words, with configuration ``{layer_idx: approx_idx}`` applied). 
- """ - - n_nondeterm_runs = 10 - - def __init__( - self, nas: NetApproxSelector, executor: ExeT, qos: QoST, - threshold_eval: ThresholdEvalT, confidence_level: float, - independent_init: bool = True, storage: Path = None - ): - self.nas = nas - self.qos = qos - self.executor = executor - self.storage = storage - self.baseline_profile: ProfT = self.get_baseline_profile() - self.profile_table: Dict[KeyT, ProfT] = {} - self.confidence_level = confidence_level - if independent_init: - for (k, i), output in self._get_all_outputs(nas, self.executor, threshold_eval, storage): - self.profile_table[k, i] = self.handle_output(output) - - @staticmethod - def _load_from_pickle(storage: Path) -> Iterator[KVT]: - if not storage.is_file(): - return - msg_logger.info(f"Found pickle at {storage}") - with storage.open('rb') as f: - while True: - try: - key, tensor = pickle.load(f) - yield key, tensor - except EOFError: - return - - @classmethod - def run_model(cls, nas: NetApproxSelector, config: ConfigT, executor: ExeT) -> torch.Tensor: - is_deterministic = nas.is_deterministic(config) - model = nas.apply_approx_by_config(config).module - if is_deterministic: - ret = executor(model).unsqueeze(0).cpu() - else: - assert cls.n_nondeterm_runs > 0 - ret = torch.stack([ - executor(model) - for _ in trange(cls.n_nondeterm_runs, leave=False) - ]).cpu() - gc.collect() - return ret - - @classmethod - def _get_all_outputs( - cls, nas: NetApproxSelector, executor: ExeT, - threshold_eval: ThresholdEvalT, storage: Path = None - ) -> Iterator[KVT]: - preloaded_acceptable = {} - if storage is not None: - bar = tqdm(cls._load_from_pickle(storage)) - for key, tensor in bar: - bar.set_postfix(key=key) - preloaded_acceptable[key] = threshold_eval(tensor) - yield key, tensor - - def evaluate(k: int, i: int) -> Tuple[bool, Optional[KVT]]: - if (k, i) in preloaded_acceptable: - msg_logger.debug(f"Key {(k, i)} is preloaded.") - return preloaded_acceptable[(k, i)], None - outputs = cls.run_model(nas, {k: 
i}, executor) - if storage is not None: - with storage.open('ab') as f: - pickle.dump(((k, i), outputs), f) - return threshold_eval(outputs), ((k, i), outputs) - - for key_outputs in nas.filter_approxes(evaluate): - # key_outputs is None means corresponding key has been preloaded (we can't see the key) - if key_outputs is None: - continue - yield key_outputs - - @classmethod - def coinit_estimators( - cls, nas: NetApproxSelector, executor: ExeT, threshold_eval: ThresholdEvalT, - *estm_insts: 'LinearEstimator', storage: Path = None - ): - for (k, i), output in cls._get_all_outputs(nas, executor, threshold_eval, storage): - for inst in estm_insts: - inst.profile_table[(k, i)] = inst.handle_output(output) - - @abc.abstractmethod - def get_baseline_profile(self) -> ProfT: - pass - - @abc.abstractmethod - def handle_output(self, outputs: torch.Tensor) -> ProfT: - pass - - @abc.abstractmethod - def estimate(self, config: ConfigT) -> EstmT: - pass - - -class LinearQoSEstimator(LinearEstimator): - """Estimate QoS of a config by linearly adding QoS value. See `LinearEstimator`. - - ProfT = Tuple[QoS(mean), QoS(std)] - NetOutputT = torch.Tensor - """ - - def estimate(self, config: ConfigT) -> EstmT: - baseline_mean: QoS = self.baseline_profile[0] - if not config: - return baseline_mean, baseline_mean - # N * 2 array - profiles = np.array([self.profile_table[kv] for kv in config.items()]) - profiles[:, 0] -= baseline_mean - estm_qos = profiles[:, 0].sum() + baseline_mean - estm_std = sqrt(np.sum(profiles[:, 1] ** 2)) - # We're hardcoding 95% confidence interval here. 
- assert self.confidence_level == 0.95 - normal_dist_95 = 1.644854 - r1, r2 = estm_qos, estm_qos - normal_dist_95 * estm_std - return float(r1), float(r2) - - def handle_output(self, outputs: torch.Tensor) -> Tuple[QoS, QoS]: - qoses = np.array([self.qos(o) for o in outputs]) - msg_logger.debug(f"Handled {qoses.mean(), qoses.std()}") - return qoses.mean(), qoses.std() - - def get_baseline_profile(self) -> Tuple[QoS, QoS]: - mean_qos = self.qos(self.run_model(self.nas, {}, self.executor)[0]) - return mean_qos, mean_qos.null() - - -class LinearCombEstimator(LinearEstimator): - """Estimate QoS of a config by linearly adding tensor output from network. See `LinearEstimator`. - - On estimation, sums over the delta in tensor output (compared to baseline output) for each - approximation, and then the baseline tensor output is added back. - This works as an estimation of tensor output for this configuration, which is then sent to QoS - metric to get the final QoS. - - QoST = float - ProfT = torch.Tensor (2 * n_inputs * n_classes) - NetOutputT = torch.Tensor (n_inputs * n_classes) - """ - - def estimate(self, config) -> EstmT: - if not config: - baseline_qos = self.qos(self.baseline_profile) - return baseline_qos, baseline_qos - # 4D tensor: n_approx * 2 * n_inputs * n_classes - profiles = torch.stack([self.profile_table[kv] for kv in config.items()]) - profiles -= self.baseline_profile - mean_tensor, confidence_tensor = profiles.sum(dim=0) + self.baseline_profile - estm_mean_qos = self.qos(mean_tensor) - estm_confidence_qos = self.qos(confidence_tensor) - return estm_mean_qos, estm_confidence_qos - - def handle_output(self, outputs: torch.Tensor) -> torch.Tensor: - if len(outputs) == 1: - return torch.stack((outputs[0], outputs[0])) - qoses = np.array([self.qos(o) for o in outputs]) - percentile_pos = int(self.n_nondeterm_runs * (1 - self.confidence_level)) - assert 0 <= percentile_pos < self.n_nondeterm_runs - mean_pos = np.searchsorted(qoses, qoses.mean(), 'right') - 
assert 0 <= mean_pos <= self.n_nondeterm_runs - if mean_pos == self.n_nondeterm_runs: - mean_pos = self.n_nondeterm_runs - 1 - return torch.stack((outputs[mean_pos], outputs[percentile_pos])) - - def get_baseline_profile(self) -> torch.Tensor: - return self.run_model(self.nas, {}, self.executor)[0] - - -class TrainableEstimator(LinearEstimator, abc.ABC): - """ - QoST = float - ProfT = ProfT - NetOutputT = torch.Tensor (n_inputs * n_classes) - """ - n_train_confs = 50 - weight_range = 0.8, 1.2, 20 - n_cold_start = 500 - accept_threshold = 5 - penalize_overestm = 1.0 - - def __init__( - self, nas: NetApproxSelector, executor: ExeT, qos: QoST, - threshold_eval: ThresholdEvalT, confidence_level: float, - independent_init: bool = True, storage: Path = None - ): - super().__init__(nas, executor, qos, threshold_eval, confidence_level, independent_init, storage) - self.r_cands = np.linspace(*self.weight_range) - self.r_error = np.zeros((len(self.r_cands), self.n_train_confs)) - self.r = self.weight_range[1] - self.trained_iters = 0 - self.cold_start = 0 - - def update_r(self): - mean_error = np.mean(self.r_error, axis=1) - best_idx = np.argmin(mean_error) - self.r = self.r_cands[best_idx] - if best_idx == len(mean_error) - 1 or best_idx == 0: - msg_logger.warning(f"Parameter value r = {self.r} has reached the boundary. 
Consider a larger range.") - - def get_qos_for_config(self, config: ConfigT) -> EstmT: - is_deterministic = self.nas.is_deterministic(config) - net = self.nas.apply_approx_by_config(config).module - n_runs = 1 if is_deterministic else self.n_nondeterm_runs - qoses = [self.qos(self.executor(net)) for _ in trange(n_runs, leave=False)] - mean_qos, qos_at_confidence, _ = qos_stats(qoses, confidence=self.confidence_level) - return mean_qos, qos_at_confidence - - @abc.abstractmethod - def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]: - pass - - def estimate(self, config) -> EstmT: - estm = self.real_estimate(config)[0] - if self.cold_start < self.n_cold_start: - self.cold_start += 1 - if self.cold_start % 50 == 0: - msg_logger.info(f"WeightedLinearCombEstimator cold start {self.cold_start} / {self.n_cold_start}") - return estm - if self.trained_iters >= self.n_train_confs: - return estm - log_info_freq = 10 - log_level = logging.INFO if self.trained_iters % log_info_freq == 0 else logging.DEBUG - msg_logger.log( - log_level, - f"{self.__class__} train iter {self.trained_iters} / {self.n_train_confs}" - ) - mean_qos, qos_at_confidence = self.get_qos_for_config(config) - estm_conf_qoses = np.array(self.real_estimate(config, rs=self.r_cands))[:, 1] - diff_conf_qoses = qos_at_confidence - estm_conf_qoses - old_r = self.r - self.r_error[:, self.trained_iters] = np.where( - diff_conf_qoses > 0, diff_conf_qoses * self.penalize_overestm, - -diff_conf_qoses - ) - self.trained_iters += 1 - self.update_r() - msg_logger.debug( - f"{self.__class__} real mean qos = {mean_qos}, real conf qos = {qos_at_confidence}, " - f"estm conf qos = {estm[1]}, r: {old_r} -> {self.r}" - ) - return mean_qos, qos_at_confidence - - -class WeightedLinearCombEstimator(TrainableEstimator, LinearCombEstimator): - """ - QoST = float - ProfT = torch.Tensor - NetOutputT = torch.Tensor (n_inputs * n_classes), logged - """ - - def __init__( - self, nas: NetApproxSelector, executor: 
ExeT, qos: QoST, - threshold_eval: ThresholdEvalT, confidence_level: float, - independent_init: bool = True, storage: Path = None - ): - log_qos = lambda x: qos(torch.exp(x)) - super().__init__(nas, executor, log_qos, threshold_eval, confidence_level, independent_init, storage) - - @staticmethod - def tensor_log(tensor: torch.Tensor) -> torch.Tensor: - # TODO: don't take log if there's no SoftMax layer. - eps = torch.ones_like(tensor) * 1e-10 - return torch.log(torch.max(tensor, eps)) - - def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]: - # 3D tensor: 2 * n_inputs * n_classes - if config: - estm_delta_output = torch.sum( - torch.stack([self.profile_table[kv] for kv in config.items()]) - self.baseline_profile, - dim=0 - ) - else: - n_in, n_out = self.baseline_profile.shape - estm_delta_output = torch.zeros(2, n_in, n_out) - rets = [] - rs = rs if rs is not None else [self.r] - for r in rs: - mean_tensor, confidence_tensor = estm_delta_output * r + self.baseline_profile - rets.append((self.qos(mean_tensor), self.qos(confidence_tensor))) - return rets - - def handle_output(self, outputs: torch.Tensor) -> torch.Tensor: - return LinearCombEstimator.handle_output(self, self.tensor_log(outputs)) - - def get_baseline_profile(self) -> torch.Tensor: - return self.tensor_log(LinearCombEstimator.get_baseline_profile(self)) - - -class WeightedLinearQoSEstimator(TrainableEstimator, LinearQoSEstimator): - """ - QoST = float - ProfT = torch.Tensor - NetOutputT = torch.Tensor (n_inputs * n_classes), logged - """ - - weight_range = 0.5, 5, 50 - - def estimate(self, config) -> EstmT: - ret = super().estimate(config) - msg_logger.debug(f"Config {config} -> estimation {ret}") - return ret - - def real_estimate(self, config, rs: Iterable[float] = None) -> List[EstmT]: - baseline_mean_qos = self.baseline_profile[0] - if config: - # N * 2 array - profiles = np.array([self.profile_table[kv] for kv in config.items()]) - profiles[:, 0] -= baseline_mean_qos - 
profiles[:, 0][profiles[:, 0] > 0] = 0 - estm_mean_qos_delta = profiles[:, 0].sum() - estm_std = sqrt(np.sum(profiles[:, 1] ** 2)) - else: - estm_mean_qos_delta = estm_std = 0.0 - rets = [] - rs = rs if rs is not None else [self.r] - for r in rs: - estm_mean_qos = float(estm_mean_qos_delta * r + baseline_mean_qos) - # We're hardcoding 95% confidence interval here. - assert self.confidence_level == 0.95 - normal_dist_95 = 1.644854 - estm_conf_qos = estm_mean_qos - normal_dist_95 * estm_std - rets.append((estm_mean_qos, estm_conf_qos)) - return rets diff --git a/hpvm/projects/pred_tuner/toolkit/indexing.py b/hpvm/projects/pred_tuner/toolkit/indexing.py deleted file mode 100644 index 27500c152ac5130f6df787f16f53e84c3099bcf6..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/toolkit/indexing.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Callable, Iterator, Optional, Set - -import torch -from torch.nn import Module, Sequential - -UnaryForwardT = Callable[[torch.Tensor], torch.Tensor] -ReplacedForwardT = Callable[[Module, UnaryForwardT, torch.Tensor], torch.Tensor] - - -class ModuleIndexer: - def __init__(self, module: Module, ignore_module: Callable[[Module], bool]): - self.module_to_index = {} - for i, submodule in enumerate(module.modules()): - if ignore_module(submodule): - continue - self.module_to_index[submodule] = i - self.index_to_module = {i: m for m, i in self.module_to_index.items()} - self.module = module - self.layer_parents = self.find_layers_parent_info(module, set(self.all_modules)) - - @staticmethod - def find_layers_parent_info(net: Module, layers: Set[Module]): - ret = {} - for name, submodule in net.named_children(): - if submodule in layers: - ret[submodule] = net, name - ret = {**ret, **ModuleIndexer.find_layers_parent_info(submodule, layers)} - return ret - - @property - def all_modules(self) -> Iterator[Module]: - return iter(self.module_to_index.keys()) - - def find(self, module: Module) -> Optional[int]: - return 
self.module_to_index.get(module, None) - - def __getitem__(self, item: int) -> Module: - return self.index_to_module[item] - - def __setitem__(self, key: int, value: Module): - old = self.index_to_module[key] - if value != old: - self.index_to_module[key] = value - self.module_to_index[value] = self.module_to_index[old] - self.module_to_index.pop(old) - parent, name = self.layer_parents[old] - self.layer_parents[value] = parent, name - self.layer_parents.pop(old) - parent.__setattr__(name, value) - - def __iter__(self) -> Iterator[Module]: - return self.all_modules - - def __len__(self): - return len(self.module_to_index) diff --git a/hpvm/projects/pred_tuner/toolkit/transform.py b/hpvm/projects/pred_tuner/toolkit/transform.py deleted file mode 100644 index f19554181a9bb9ac10ee9261cd908c2003f18d48..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/toolkit/transform.py +++ /dev/null @@ -1,186 +0,0 @@ -import copy -import logging -from collections import defaultdict -from typing import Callable, Dict, Generic, Iterator, List, Tuple, TypeVar - -from torch.nn import Module - -from .approxdnn import Approximation, AvailableApproximations -from .indexing import ModuleIndexer - -msg_logger = logging.getLogger(__name__) - - -T1 = TypeVar('T1') -T2 = TypeVar('T2') -TransformerCT = Callable[[int, T1], T2] - - -class StateCapturer(Module, Generic[T2]): - @staticmethod - def _id(_, x): - return x.clone().cpu().detach() - - def __init__(self, net_index: ModuleIndexer, state_transformer: TransformerCT = None): - super().__init__() - self.net_state: Dict[int, List[T2]] = defaultdict(list) - self.state_transformer = state_transformer or self._id - self.net_index = net_index - for submodule in net_index.module.modules(): - submodule.register_forward_hook(self.forward_hook) - self._output = None - - @property - def module(self): - return self.net_index.module - - @property - def output(self): - if self._output is None: - raise RuntimeError("Cannot get output 
before inference happens") - return self._output - - def forward_hook(self, module: Module, _, outputs): - module_idx = self.net_index.find(module) - if module_idx is None: - raise RuntimeError("Cannot find module; module may have changed externally") - self.net_state[module_idx].append(self.state_transformer(module_idx, outputs)) - - def forward(self, *args, **kwargs): - return self.module.forward(*args, **kwargs) - - def get_output_state(self) -> List[T2]: - return self.net_state[self.injected.output_loc()] - - -T = TypeVar('T') -ConfigT = Dict[int, int] -EvaluatorT = Callable[[int, int], Tuple[bool, T]] - - -class NetApproxSelector: - r"""List all 1-approximation configurations, and apply configurations to a `ModuleDAG` network. - - Computes a list of available approximations for each layer of the network, given info on available - approximations in the system (in the form of an `AvailableApproximations` instance). - Capable of listing all single-approximation configurations, and apply a given configuration to the network. - A configuration is a dict from layer indices to approximation for these layers, one for each. - See `ConfigT`. - - Parameters - ---------- - net : Module - The network to be approximated. - dev_time_only : bool - If True, use only devtime approximations; otherwise use all available approximations. - aa : AvailableApproximations - A container with information of available approximations, and the type of layer each approximation - applies to, etc. - - Attributes - ---------- - net : Module - The network to be approximated (parameter `net`). - net_approxes: Dict[int, List[int]] - A list of available approximation indexes per layer index. - available_approx: AvailableApproximations - Available approximations (parameter `aa`). 
- """ - - class ApproximationGraph: - """Naive O(n^2) sort for a list of partially-ordered approximations.""" - - def __init__(self, approx_indices: List[int], aa: AvailableApproximations): - import networkx as nx - self.dep_graph = nx.DiGraph() - self.dep_graph.add_nodes_from(approx_indices) - for i, x in enumerate(approx_indices): - for y in approx_indices[i + 1:]: - approx_x, approx_y = aa[x], aa[y] - cmp = approx_x.is_less_approx(approx_y) - if cmp is None: # Not comparable - continue - if cmp: - self.dep_graph.add_edge(x, y) - else: - self.dep_graph.add_edge(y, x) - self.sorted_indices = list(nx.algorithms.topological_sort(self.dep_graph)) - - def __len__(self) -> int: - return len(self.sorted_indices) - - def __iter__(self) -> Iterator[Tuple[int, bool]]: - return iter(self.sorted_indices) - - def __init__( - self, net: Module, dev_time_only: bool = True, ignore_fp32: bool = False, - aa: AvailableApproximations = None - ): - self.available_approx = aa or AvailableApproximations.from_global_knobs_file() - self.type_approxes = self.available_approx.items(dev_time=dev_time_only, ignore_fp32=ignore_fp32) - approximable_types = tuple(self.type_approxes.keys()) - self.net_index = ModuleIndexer(net, lambda m: not isinstance(m, approximable_types)) - self.dev_time_only = dev_time_only - self.net_approxes: Dict[int, List[int]] = defaultdict(list) - for i, layer in self.net_index.index_to_module.items(): - for t, approxes in self.type_approxes.items(): - if isinstance(layer, t): - self.net_approxes[i].extend(approxes) - - def apply_approx_by_config(self, config: ConfigT) -> ModuleIndexer: - """Applies given `config` to network.""" - new_dag = copy.deepcopy(self.net_index) - for layer_idx, config_idx in config.items(): - layer = new_dag[layer_idx] - new_dag[layer_idx] = self.available_approx[config_idx].apply(layer) - return new_dag - - def list_single_approxes(self) -> Iterator[Tuple[int, int, Approximation]]: - for k, vs in self.net_approxes.items(): - for v in vs: - 
yield k, v, self.available_approx[v] - - def filter_approxes(self, evaluator: EvaluatorT) -> Iterator[T]: - """Enumerate through and apply each single-approximation configuration.""" - net_approxes_graph: Dict[int, NetApproxSelector.ApproximationGraph] = { - k: self.ApproximationGraph(vs, self.available_approx) for k, vs in self.net_approxes.items() - } - from tqdm import tqdm - from utils import gpu_mem_mb - bar1 = tqdm(net_approxes_graph.items(), total=len(net_approxes_graph)) - for k, graph in bar1: - bar1.set_postfix(layer=k) - bar2 = tqdm(graph, leave=None) - unacceptable_approx = None - filtered_layer_approxes = [] - for approx_id in bar2: - approx = self.available_approx[approx_id] - if unacceptable_approx is not None: - cmp = unacceptable_approx.is_less_approx(approx) - if cmp: - msg_logger.debug(f"{approx} is worse than unacceptable approx {unacceptable_approx}") - continue - else: - unacceptable_approx = None - bar2.set_postfix(approx_id=approx_id, mem=gpu_mem_mb()) - acceptable, ret_val = evaluator(k, approx_id) - if not acceptable: - unacceptable_approx = approx - msg_logger.debug(f"{approx} is unacceptable") - continue - filtered_layer_approxes.append(approx_id) - yield ret_val - self.net_approxes[k] = filtered_layer_approxes - - def get_baseline(self) -> Module: - return self.net_index.module - - def get_layer_approxes(self) -> Dict[Module, List[int]]: - """Expose available knobs for autotuner usage.""" - return { - self.net_index[layer_k]: approxes - for layer_k, approxes in self.net_approxes.items() - } - - def is_deterministic(self, config: ConfigT): - return all(self.available_approx[knob_id].deterministic for knob_id in config.values()) diff --git a/hpvm/projects/pred_tuner/utils/__init__.py b/hpvm/projects/pred_tuner/utils/__init__.py deleted file mode 100644 index 1f06b4ae222c3a8a56d4ab4516031e4c91dfa0d2..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .config import 
Config -from .logging import config_pylogger, reapply_last_config -from .utils import device, get_knob_config_file, get_tensorrt_dir, gpu_mem_mb diff --git a/hpvm/projects/pred_tuner/utils/benchmarks.json b/hpvm/projects/pred_tuner/utils/benchmarks.json deleted file mode 100644 index 57184872a07de661c1c9ee4064ec01652e9966ff..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/utils/benchmarks.json +++ /dev/null @@ -1,100 +0,0 @@ -{ - "lenet_hpvm": { - "model_name": "lenet_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/lenet_keras/", - "layer_file": "autotuner/data/lenet/lenet_layers.txt", - "cost_file": "autotuner/data/lenet/op_cost.txt" - }, - "alexnet_hpvm": { - "model_name": "alexnet_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/alexnet_cifar10/", - "layer_file": "autotuner/data/alexnet/alexnet_layers.txt", - "cost_file": "autotuner/data/alexnet/op_cost.txt" - }, - "alexnet2_hpvm": { - "model_name": "alexnet2_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/alexnet2_cifar10/", - "layer_file": "autotuner/data/alexnet2/alexnet2_layers.txt", - "cost_file": "autotuner/data/alexnet2/op_cost.txt" - }, - "vgg16_cifar10_hpvm": { - "model_name": "vgg16_cifar10_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/vgg16_cifar10/", - "layer_file": "autotuner/data/vgg16_cifar10/vgg16_layers.txt", - "cost_file": "autotuner/data/vgg16_cifar10/op_cost.txt" - }, - "vgg16_cifar100_hpvm": { - "model_name": "vgg16_cifar100_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/vgg16_cifar100/", - "layer_file": "autotuner/data/vgg16_cifar100/vgg16_layers.txt", - "cost_file": "autotuner/data/vgg16_cifar100/op_cost.txt" - }, - "vgg16_imagenet_hpvm": { - "model_name": "vgg16_imagenet_hpvm", - "autotuner_runs": 20000, - "base_dir": "tuner_results/vgg16_imagenet/", - "layer_file": "autotuner/data/vgg16_imagenet/vgg16_layers.txt", - "cost_file": "autotuner/data/vgg16_imagenet/op_cost.txt" - }, - 
"resnet18_hpvm": { - "model_name": "resnet18_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/resnet18_cifar10/", - "layer_file": "autotuner/data/resnet/resnet_layers.txt", - "cost_file": "autotuner/data/resnet/op_cost.txt" - }, - "resnet50_imagenet_hpvm": { - "model_name": "resnet50_imagenet_hpvm", - "autotuner_runs": 30000, - "base_dir": "tuner_results/resnet50_imagenet/", - "layer_file": "autotuner/data/resnet50_imagenet/resnet50_layers.txt", - "cost_file": "autotuner/data/resnet50_imagenet/op_cost.txt" - }, - "mobilenet_hpvm": { - "model_name": "mobilenet_hpvm", - "autotuner_runs": 20000, - "base_dir": "tuner_results/mobilenet/", - "layer_file": "autotuner/data/mobilenet/mobilenet_layer_comp.txt", - "cost_file": "autotuner/data/mobilenet/op_cost.txt" - }, - "__unused_mobilenet_shallow": { - "model_name": "mobilenet_shallow_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/mobilenet_shallow/", - "layer_file": "autotuner/data/mobilenet_shallow/mobilenet_shallow_layer_comp.txt", - "cost_file": "autotuner/data/mobilenet_shallow/op_cost.txt" - }, - "alexnet_imagenet_hpvm": { - "model_name": "alexnet_imagenet_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/alexnet_imagenet/", - "layer_file": "autotuner/data/alexnet_imagenet/layer_composition.txt", - "cost_file": "autotuner/data/alexnet_imagenet/op_cost.txt" - }, - "alexnet2_canny_hpvm": { - "model_name": "alexnet2_canny_hpvm", - "autotuner_runs": 10000, - "base_dir": "tuner_results/alexnet2_canny_hpvm/", - "layer_file": "autotuner/data/alexnet2_canny_hpvm/layers.txt", - "cost_file": "autotuner/data/alexnet2_canny_hpvm/op_cost.txt" - }, - "resnet18_torch": { - "model_name": "resnet18_torch", - "autotuner_runs": 10000, - "base_dir": "tuner_results/resnet18_cifar10_torch/", - "layer_file": "autotuner/data/resnet18_torch/resnet_layers.txt", - "cost_file": "autotuner/data/resnet18_torch/op_cost.txt" - }, - "vgg16_torch": { - "model_name": "vgg16_torch", - "autotuner_runs": 10000, 
- "base_dir": "tuner_results/resnet18_cifar10_torch/", - "layer_file": "autotuner/data/resnet/resnet_layers.txt", - "cost_file": "autotuner/data/resnet/op_cost.txt" - } -} \ No newline at end of file diff --git a/hpvm/projects/pred_tuner/utils/config.py b/hpvm/projects/pred_tuner/utils/config.py deleted file mode 100644 index fced1a4d462ad9bb4c828f2bbc264bb4b4755081..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/utils/config.py +++ /dev/null @@ -1,318 +0,0 @@ -from pathlib import Path -from typing import Dict, Iterable, List, Union - -import matplotlib.pyplot as plt -import numpy as np - -from models.domains import QoS -from models.domains.qoses import Accuracy, AccuracyPSNR -from .utils import get_knob_config_file - -op_mapping = { - "conv": "conv", "depthwise_conv": "group_conv", "dense": "mul", "batchnorm": "batchnorm", - "pool": "pool_max", "pool_mean": "pool_mean", "activation": "relu", "tanh": "tanh", "add": "add", - "reduce": "red_samp" -} - -approx_map = {} -PathLike = Union[str, Path] - - -def initializeApproxMap(knobs_file_path): - f = open(knobs_file_path, "r") - - for x in f: - toks = x.split("\t") - approx_type = toks[0].split(",")[0] - knob_id = toks[0].split(",")[1] - approx_str = approx_type + " " + knob_id - approx_map[knob_id] = approx_str - - -initializeApproxMap(get_knob_config_file()) - -# TODO: fix hardcoding -fp32_to_fp16 = { - **{k: k + 30 for k in range(121, 138 + 1)}, - **{k: k + 30 for k in range(231, 248 + 1)}, - 11: 12 -} -fp16_to_fp32 = {v: k for k, v in fp32_to_fp16.items()} - - -class Config: - def __init__( - self, avg_accuracy: QoS, baseline_accuracy: QoS, fname: str, flags: List[int], - total_runs: int, confidence: float, config_cost: float, speedup: float - ): - self.total_runs = total_runs - self.confidence = confidence - self.config_cost = config_cost - self.speedup = speedup - self.avg_qos = avg_accuracy - self.baseline_qos = baseline_accuracy - self.fname = fname - self.flags = flags - self.avg_loss 
= self.avg_loss.min_positive_loss() - - @property - def avg_loss(self): - return self.baseline_qos - self.avg_qos - - @avg_loss.setter - def avg_loss(self, value: QoS): - self.avg_qos = self.baseline_qos - value - - def __repr__(self): - return repr((self.fname, self.speedup, self.avg_qos, self.avg_loss, self.flags)) - - @staticmethod - def qos_speedup_points(configs: Iterable['Config']) -> np.ndarray: - return np.array([[*conf.avg_qos.numpy(), conf.speedup] for conf in configs]) - - def update_acc(self, acc: QoS, confidence: float, baseline_acc: QoS = None): - if baseline_acc: - self.baseline_qos = baseline_acc - self.avg_qos = acc - self.avg_loss = self.avg_loss.min_positive_loss() - self.confidence = confidence - - def to_fp16(self) -> 'Config': - import copy - fp16_conf = copy.copy(self) - fp16_conf.flags = [fp32_to_fp16.get(x, x) for x in self.flags] - return fp16_conf - - def to_fp32(self) -> 'Config': - import copy - fp32_conf = copy.copy(self) - fp32_conf.flags = [fp16_to_fp32.get(x, x) for x in self.flags] - return fp32_conf - - def to_rt_format(self, idx: int, bench_layer_composition, hardware_target: str): - config_str = build_config_str(self.flags, bench_layer_composition, hardware_target) - return ( - "+++++\n" - f"conf{idx} {self.speedup} 0 {self.avg_qos} {self.avg_loss}\n" - f"{config_str}" - "-----\n" - ) - - def to_tuner_format(self): - topline = ( - f"total_runs={self.total_runs}\tconfidence={self.confidence}\t" - f"avg_accuracy={self.avg_qos}\tconfig_cost={self.config_cost}\tspeedup={self.speedup}" - ) - flags_lines = [str(x) for x in self.flags] - return '\n'.join([topline] + flags_lines) - - @classmethod - def from_tuner_format(cls, lines: List[str], fname: str, baseline_accuracy: QoS): - def parseTopLine(x: str) -> Dict[str, str]: - toks = x.split() - fields = {} - for tok in toks: - field, value = tok.split('=') - fields[field] = value - return fields - - top_line = parseTopLine(lines[0]) - total_runs = int(top_line['total_runs']) - 
confidence = float(top_line['confidence']) - avg_accuracy = baseline_accuracy.parse(top_line['avg_accuracy']) - config_cost = float(top_line['config_cost']) - speedup = float(top_line['speedup']) - flags = [int(line.strip()) for line in lines[1:] if line.strip()] - return cls(avg_accuracy, baseline_accuracy, fname, flags, total_runs, confidence, config_cost, speedup) - - -def genScatterPlotFromConfigs(configs, file_path): - speedups, accuracy_losses = [c.speedup for c in configs], [c.avg_loss for c in configs] - plt.scatter(accuracy_losses, speedups) - plt.xlabel("accuracy_loss") - plt.ylabel("speedup") - plt.xlim(left=-0.05) - plt.ylim(bottom=1) - plt.savefig(file_path) - plt.close() - - -def _find_distance_to(points: np.ndarray, ref_points: np.ndarray) -> np.ndarray: - n_ref = len(ref_points) - if n_ref == 0: - return np.zeros(0) - if n_ref == 1: - return np.linalg.norm(points - ref_points, axis=1) - ref_points = np.array(sorted(ref_points, key=lambda p: p[0])) - px = points.T[0] - rx = ref_points.T[0] - local_unit_vecs = ref_points[1:] - ref_points[:-1] - dists = [] - bins = np.digitize(px, rx) - 1 - for point, left_ref_p in zip(points, bins): - if left_ref_p == -1: - left_ref_p = 0 - to_left_ref = ref_points[left_ref_p] - point - local_unit_vec = local_unit_vecs[-1] if left_ref_p >= n_ref - 1 else local_unit_vecs[left_ref_p] - projection = np.dot(local_unit_vec, to_left_ref) / np.linalg.norm(local_unit_vec) - dist = np.sqrt(np.linalg.norm(to_left_ref) ** 2 - projection ** 2) - dists.append(dist) - return np.array(dists) - - -def is_pareto_efficient( - configs: List[Config], margin: float = None, - ratio: float = None, n_min: int = None, n_max: int = None -) -> List[Config]: - configs = np.array(configs) - acc_speedup = Config.qos_speedup_points(configs) - is_efficient = np.ones(acc_speedup.shape[0], dtype=bool) - for idx, c in enumerate(acc_speedup): - if is_efficient[idx]: - # Keep any point with a higher value - is_efficient[is_efficient] = 
np.any(acc_speedup[is_efficient] > c, axis=1) - is_efficient[idx] = True # And keep self - pareto_acc_speedup = acc_speedup[is_efficient] - pareto_configs = configs[is_efficient] - non_pareto_acc_speedup = acc_speedup[np.logical_not(is_efficient)] - non_pareto_configs = configs[np.logical_not(is_efficient)] - dist_to_pareto = _find_distance_to(non_pareto_acc_speedup, pareto_acc_speedup) - if margin is not None: - marginal_accepted = non_pareto_configs[dist_to_pareto < margin] - elif ratio is not None: - dist_order = np.argsort(dist_to_pareto) - take_n = int(len(dist_to_pareto) * ratio) - if n_min is not None: - take_n = max(take_n, n_min) - if n_max is not None: - take_n = min(take_n, n_max) - take_n -= len(pareto_configs) - marginal_accepted = non_pareto_configs[dist_order[:take_n]] - else: - raise ValueError("Must provide margin or ratio") - return pareto_configs.tolist() + marginal_accepted.tolist() - - -def print_layer_info(flag: int, hardware_target: str, layer_comp): - approx_tech = approx_map[str(flag)] - if flag <= 7: - # If is PROMISE - return f"promise {approx_tech}" - # If is GPU / CPU - op0 = op_mapping[layer_comp[0]] - config_str = f"{hardware_target} {op0} {approx_tech} " - for op in layer_comp[1:]: - op_name = op_mapping[op] - fp = "fp32" if is_fp32(flag) else "fp16" - config_str += f"{op_name} {fp} 1 " - return config_str - - -def build_config_str(flags: List[int], layer_desc: List[List[str]], hardware_target: str): - lines = [] - assert len(flags) == len(layer_desc) - for index, (flag, layer_comp) in enumerate(zip(flags, layer_desc), start=1): - layer_str = print_layer_info(flag, hardware_target, layer_comp) - config_str = f"{index} {layer_str}" - lines.append(config_str) - lines.append(f"{len(layer_desc) + 1} {hardware_target} softmax fp32 1\n") - return '\n'.join(lines) - - -def is_fp32(flag: int): - return flag in fp32_to_fp16 - - -def dump_configs_to_rt( - layer_desc, configs: List[Config], - config_out_path: PathLike, baseline_acc: QoS, 
hardware_target: str -): - baseline_flag = 11 - baseline_config = Config( - baseline_acc, baseline_acc, '', [baseline_flag for _ in layer_desc], - 1, 100.0, 0.0, 1.0 - ) - baseline_str = baseline_config.to_rt_format(1, layer_desc, hardware_target) - with config_out_path.open("w") as f: - f.write(baseline_str) - for it, config in enumerate(configs, start=2): - f.write(config.to_rt_format(it, layer_desc, hardware_target)) - - -# Public Interfaces -def dump_rt_format_to( - layer_desc, configs: List[Config], gold_acc: QoS, - rt_cpu_path: PathLike = None, rt_gpu_path: PathLike = None -): - if configs: - assert len(set([conf.baseline_qos for conf in configs])) == 1 - # Sort configs - sorted_configs = sorted(configs, key=lambda conf: (conf.avg_loss, conf.speedup, conf.flags)) - if rt_gpu_path is not None: - # Remap to fp16 for gpu. - fp16_configs = [conf.to_fp16() for conf in sorted_configs] - dump_configs_to_rt( - layer_desc, fp16_configs, rt_gpu_path, gold_acc, 'gpu' - ) - if rt_cpu_path is not None: - # Remap to fp32 for cpu. 
- fp32_configs = [conf.to_fp32() for conf in sorted_configs] - dump_configs_to_rt( - layer_desc, fp32_configs, rt_cpu_path, gold_acc, 'cpu' - ) - - -def plot_configs(file_path: Path, **kw_configs: List[Config]): - from mpl_toolkits.mplot3d import Axes3D - # Decide 2D or 3D plot: - qos_type = None - for label, confs in kw_configs.items(): - if not confs: - continue - if not qos_type: - qos_type = type(confs[0].avg_qos) - else: - assert qos_type == type(confs[0].avg_qos) - if qos_type is None: - return - if qos_type is AccuracyPSNR: - fig: plt.Figure = plt.figure() - ax: Axes3D = fig.add_subplot(111, projection='3d') - for label, confs in kw_configs.items(): - data = np.array([ - [c.avg_loss.qoses[0].to_scalar(), c.avg_qos.qoses[1].to_scalar(), c.speedup] - for c in confs] - ) - x, y, z = data.T - ax.scatter(x, y, z, label=label) - ax.set_xlabel("accuracy_loss") - ax.set_ylabel("psnr") - ax.set_zlabel("speedup") - ax.set_xlim(left=-0.05) - ax.set_zlim(bottom=1) - elif qos_type is Accuracy: - fig, ax = plt.subplots() - fig: plt.Figure - ax: plt.Axes - for label, confs in kw_configs.items(): - data = np.array([[c.avg_loss.to_scalar(), c.speedup] for c in confs]) - x, y = data.T - ax.scatter(x, y, label=label) - ax.set_xlabel("accuracy_loss") - ax.set_ylabel("speedup") - ax.set_xlim(left=-0.05) - ax.set_ylim(bottom=1) - else: - raise ValueError(f"QoS type {qos_type} unsupported in plotting.") - ax.legend() - fig.savefig(file_path) - plt.close(fig) - - -def load_configs_from_dir(result_dir: PathLike, baseline_accuracy: QoS): - config_arr = [] - for path in Path(result_dir).glob('*'): - with path.open() as f: - lines = f.readlines() - config_arr.append(Config.from_tuner_format(lines, path.name, baseline_accuracy)) - return config_arr diff --git a/hpvm/projects/pred_tuner/utils/logging.py b/hpvm/projects/pred_tuner/utils/logging.py deleted file mode 100644 index 6b6904bd2e0a0683ccc6905994f645fa6856ad4d..0000000000000000000000000000000000000000 --- 
a/hpvm/projects/pred_tuner/utils/logging.py +++ /dev/null @@ -1,87 +0,0 @@ -import logging -from logging import config -import os -from pathlib import Path - -import tqdm - - -class TqdmStreamHandler(logging.Handler): - """tqdm-friendly logging handler. Uses tqdm.write instead of print for logging.""" - - def __init__(self, level=logging.NOTSET): - super().__init__(level) - - def emit(self, record): - try: - msg = self.format(record) - tqdm.tqdm.write(msg) - self.flush() - except (KeyboardInterrupt, SystemExit, RecursionError): - raise - except: - self.handleError(record) - - -_last_applied_config = None - - -def config_pylogger(filename: str = None, output_dir: Path = None, verbose: bool = False) -> logging.Logger: - """Configure the Python logger. - - For each execution of the application, we'd like to create a unique log file. - By default this file is named using the date and time of day, so that it can be sorted by recency. - You can also name your filename or choose the log directory. 
- """ - import time - timestr = time.strftime("%Y.%m.%d-%H%M%S") - filename = filename or timestr - output_dir = output_dir or Path('.') - if not os.path.exists(output_dir): - os.makedirs(output_dir) - file_path = output_dir / filename - - global _last_applied_config - _last_applied_config = d = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'simple': { - 'format': '%(levelname)s %(name)s: ' - '%(message)s' - }, - 'detailed': { - 'format': '[%(asctime)-15s] ' - '%(levelname)7s %(name)s: ' - '%(message)s ' - '@%(filename)s:%(lineno)d' - } - }, - 'handlers': { - 'console': { - '()': TqdmStreamHandler, - 'level': 'INFO', - 'formatter': 'simple' - }, - 'file': { - 'class': 'logging.FileHandler', - 'filename': file_path.as_posix(), - 'mode': 'a', # Because we may apply this config again, want to keep existing content - 'formatter': 'detailed', - }, - }, - 'root': { - 'level': 'DEBUG' if verbose else 'INFO', - 'handlers': ['console', 'file'] - }, - } - config.dictConfig(d) - - msglogger = logging.getLogger() - msglogger.info(f"Log file for this run: {file_path}") - return msglogger - - -def reapply_last_config(): - if _last_applied_config is not None: - config.dictConfig(_last_applied_config) diff --git a/hpvm/projects/pred_tuner/utils/utils.py b/hpvm/projects/pred_tuner/utils/utils.py deleted file mode 100644 index 16165574662ca91320784f827468002fbae21fa8..0000000000000000000000000000000000000000 --- a/hpvm/projects/pred_tuner/utils/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -import logging -import os -from pathlib import Path - -import torch - -device = f'cuda:{torch.cuda.device_count() - 1}' if torch.cuda.is_available() else 'cpu' -n_cpu_threads = 12 if device == 'cuda:0' else 35 -torch.set_num_threads(n_cpu_threads) - -msg_logger = logging.getLogger(__name__) - - -def gpu_mem_mb(): - # noinspection PyTypeChecker - return torch.cuda.memory_allocated(device) / 1024 ** 2 - - -def get_tensorrt_dir() -> Path: - if 'LLVM_SRC_ROOT' not in 
os.environ: - return Path('.') - return Path(os.environ['LLVM_SRC_ROOT']) / "projects/hpvm-tensor-rt" - - -def get_knob_config_file() -> Path: - return get_tensorrt_dir() / "autotuner/data/global_knobs.txt" diff --git a/hpvm/projects/predtuner b/hpvm/projects/predtuner new file mode 160000 index 0000000000000000000000000000000000000000..70ead4a70536ec7af29a99658a9e207b6e16d230 --- /dev/null +++ b/hpvm/projects/predtuner @@ -0,0 +1 @@ +Subproject commit 70ead4a70536ec7af29a99658a9e207b6e16d230 diff --git a/hpvm/projects/torch2hpvm/setup.py b/hpvm/projects/torch2hpvm/setup.py index ae103a2cdf0c0872278c147ddac5774ce79da452..f0cd851e586cf4d35c856ead11915f97c7654901 100644 --- a/hpvm/projects/torch2hpvm/setup.py +++ b/hpvm/projects/torch2hpvm/setup.py @@ -7,6 +7,8 @@ setup( author="Yuanjing Shi, Yifan Zhao", author_email="ys26@illinois.edu, yifanz16@illinois.edu", packages=["torch2hpvm"], - install_requires=["jinja2>=2.11", "networkx>=2.5", "onnx>=1.8.0", "torch"], + install_requires=[ + "jinja2>=2.11", "networkx>=2.5", "onnx>=1.8.0", "torch", "onnx-simplifier>=0.2.27" + ], entry_points={"console_scripts": ["torch2hpvm=torch2hpvm:main"]}, ) diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json b/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json index 974b536c48cd1d5ab96120cfd0c5e9510846df17..9d7cb28a8b3fcc2301735c21e99119beb5a89907 100644 --- a/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json +++ b/hpvm/projects/torch2hpvm/torch2hpvm/approxknobs.json @@ -2,7 +2,8 @@ { "name": "11", "speedup": 1.0, - "applies_to": null + "applies_to": null, + "is_baseline": true }, { "name": "12", diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py b/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py index cdba5f327f54f1b77889f34d51c6df54ad86786a..6f6b71eae0deda9176c3dcb32c76c99bccbf5f07 100644 --- a/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py +++ b/hpvm/projects/torch2hpvm/torch2hpvm/codegen_hpvm.py @@ -6,10 +6,10 @@ import jinja2 from 
.graph_builder import DFG from .graph_ir import DFGNode, TensorNode, WeightTensor -TEMPLATE_FILE = "template_hpvm.cpp.in" +PLAIN_TEMPLATE_FILE = "template_hpvm.cpp.in" +INSPECT_TEMPLATE_FILE = "template_hpvm_inspect.cpp.in" loader = jinja2.FileSystemLoader(searchpath=Path(__file__).parent) template_env = jinja2.Environment(loader=loader, trim_blocks=True) -template = template_env.get_template(TEMPLATE_FILE) PathLike = Union[str, Path] @@ -69,11 +69,22 @@ class HpvmCodeGen(CodeGen): # Variable indicator is always int for hpvm gen variables: Dict[DFGNode, Tuple[int, bool]] - def __init__(self, dfg: DFG, prefix: PathLike, input_size: int, target: str): + def __init__( + self, + dfg: DFG, + prefix: PathLike, + input_size: int, + target: str, + inspectable: Optional[dict], + ): super().__init__(dfg, prefix, input_size) if target not in ("tensor", "cudnn"): raise ValueError(f"Unsupported target {target}") self.target = target + self.template = template_env.get_template( + PLAIN_TEMPLATE_FILE if inspectable is None else INSPECT_TEMPLATE_FILE + ) + self.inspect_vars = inspectable or {} def _emit_hpvm_node_edges(self, input_vars: List[DFGNode]) -> List[dict]: ret = [] @@ -133,7 +144,7 @@ class HpvmCodeGen(CodeGen): weights = self.emit_weights(self.weights) with Path(output).open("w") as f: f.write( - template.render( + self.template.render( nodes=nodes, input_name=self.input_name, input_size=self.input_size, @@ -144,5 +155,6 @@ class HpvmCodeGen(CodeGen): weights=weights, prefix=self.prefix, target=self.target, + **self.inspect_vars ) ) diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py index cc2a670dad75661a296dcb4465a8de56358630b5..f0d8c3b131231d637429c40b7e68a94627ebd6bf 100644 --- a/hpvm/projects/torch2hpvm/torch2hpvm/compile.py +++ b/hpvm/projects/torch2hpvm/torch2hpvm/compile.py @@ -34,6 +34,9 @@ class ModelExporter: weight_dir_name = "weights" source_file_name = "hpvm_c.cpp" metadata_file_name = "ops.json" + 
config_file_name = "tuner_confs.txt" + fifo_file_name_r = "hpvm_fifo_r" + fifo_file_name_w = "hpvm_fifo_w" def __init__( self, @@ -43,39 +46,61 @@ class ModelExporter: output_dir: PathLike, target: str = "hpvm_tensor", opset: Optional[int] = None, + config_file: PathLike = None, ): - from onnxsim import simplify - self.tune_dataset, self.test_dataset = tune_dataset, test_dataset self.dataset_shape = self._check_datasets(tune_dataset, test_dataset) self.dataset_size = self.dataset_shape[0] - onnx_model = self._load_model(model, self.dataset_shape) - if opset is not None: - onnx_model = check_onnx_version(onnx_model, opset) - onnx_model, check = simplify(onnx_model) - assert check, "Simplified ONNX model could not be validated" - onnx_model = onnx.shape_inference.infer_shapes(onnx_model) - + onnx_model = self._load_model(model, self.dataset_shape, opset) self.dfg = DFG(onnx_model.graph) - self.output_dir = Path(output_dir) + + output_dir = Path(output_dir).absolute() os.makedirs(output_dir, exist_ok=True) - self.weight_dir = self.output_dir / self.weight_dir_name + self.weight_dir = output_dir / self.weight_dir_name self.weight_dir.mkdir(exist_ok=True) + self.codefile = output_dir / self.source_file_name + self.metafile = output_dir / self.metadata_file_name + args3 = self.dfg, self.weight_dir, self.dataset_size + self.compile_args = None + self.path_params = {} if target == "hpvm_tensor": - self.codegen = HpvmCodeGen(self.dfg, self.weight_dir, self.dataset_size, "tensor") + if config_file is None: + raise ValueError( + f"Config file must be given and exist under hpvm_tensor mode" + ) + self.path_params = {"config_file": Path(config_file)} + self.compile_args = ["-t", "tensor", "--conf-file", str(config_file)] + self.codegen = HpvmCodeGen(*args3, "tensor", None) + elif target == "hpvm_tensor_inspect": + if config_file is None: + config_file = output_dir / self.config_file_name + else: + config_file = Path(config_file).absolute() + self.path_params = { + 
"tune_labels_path": (self.weight_dir / self.tuneset_name[1]).as_posix(), + "conf_path": config_file.as_posix(), + "fifo_path_r": (output_dir / self.fifo_file_name_r).as_posix(), + "fifo_path_w": (output_dir / self.fifo_file_name_w).as_posix() + } + self.compile_args = ["-t", "tensor", "--conf-file", str(config_file)] + self.codegen = HpvmCodeGen(*args3, "tensor", self.path_params) elif target == "hpvm_cudnn": - self.codegen = HpvmCodeGen(self.dfg, self.weight_dir, self.dataset_size, "cudnn") + self.compile_target = "cudnn" + self.compile_args = ["-t", "cudnn"] + self.codegen = HpvmCodeGen(*args3, "cudnn", None) elif target == "tensor": - self.codegen = TensorCodeGen(self.dfg, self.weight_dir, self.dataset_size) + self.codegen = TensorCodeGen(*args3) else: raise ValueError(f"Target {target} not recognized") def export_source_code(self, output: PathLike, batch_size: Optional[int] = None): self.codegen.compile(output, batch_size) + return self def export_weights(self): self.dfg.dump_weights(self.weight_dir) + return self def export_datasets(self): input_, labels = self.tuneset_name @@ -86,6 +111,7 @@ class ModelExporter: self._dump_dataset( self.test_dataset, self.weight_dir / input_, self.weight_dir / labels ) + return self def export_metadata( self, output: PathLike, approx_knobs_file: PathLike = def_approx_knobs_file @@ -98,14 +124,21 @@ class ModelExporter: KnobInfoT = Tuple[str, float] ty_knobs: Dict[str, List[KnobInfoT]] = defaultdict(list) default_knobs: List[KnobInfoT] = [] + baseline_knob = None for k in knobs: - applies_to = k.pop("applies_to") - k = k["name"], k["speedup"] + kp = k["name"], k["speedup"] + if "is_baseline" in k: + if baseline_knob: + raise ValueError("Multiple baseline knobs") + baseline_knob = k["name"] + applies_to = k["applies_to"] if applies_to is None: - default_knobs.append(k) + default_knobs.append(kp) continue for ty in applies_to: - ty_knobs[ty].append(k) + ty_knobs[ty].append(kp) + if not baseline_knob: + raise ValueError("No 
baseline knob given") idx = 0 op_cost: Dict[str, int] = {} op_knobs: Dict[str, List[str]] = {} @@ -127,18 +160,39 @@ class ModelExporter: "op_cost": op_cost, "knob_speedup": knob_speedup, "op_knobs": op_knobs, + "baseline_knob": baseline_knob, + **self.path_params }, f, indent=2, ) - - def export_all(self, output: PathLike = None, batch_size: Optional[int] = None): - default_codefile = self.output_dir / self.source_file_name - self.export_source_code(output or default_codefile, batch_size) - default_metafile = self.output_dir / self.metadata_file_name - self.export_metadata(default_metafile) + return self + + def compile(self, output_binary: PathLike, working_dir: Optional[PathLike] = None): + from subprocess import run + + args = [ + "approxhpvm.py", + str(self.codefile), + str(output_binary), + *self.compile_args, + ] + if working_dir is not None: + args.extend(["-d", str(working_dir)]) + run(args, check=True) + return self + + def generate( + self, output_code_file: PathLike = None, batch_size: Optional[int] = None + ): + self.codefile = ( + self.codefile if output_code_file is None else Path(output_code_file) + ) + self.export_source_code(self.codefile, batch_size) + self.export_metadata(self.metafile) self.export_weights() self.export_datasets() + return self @staticmethod def _dump_dataset(dataset: DatasetTy, input_filename: Path, labels_filename: Path): @@ -216,7 +270,11 @@ class ModelExporter: return dataset.shape @staticmethod - def _load_model(model: ModelTy, dataset_shape: Sequence[int]) -> onnx.ModelProto: + def _load_model( + model: ModelTy, dataset_shape: Sequence[int], opset: Optional[int] + ) -> onnx.ModelProto: + from onnxsim import simplify + if isinstance(model, Module): # Export to ONNX and load back. 
sample_input_shape = 1, *dataset_shape[1:] @@ -224,10 +282,16 @@ class ModelExporter: with NamedTemporaryFile("w+b") as tmp: torch_to_onnx(model, (sample_input,), tmp) tmp.seek(0) - return onnx.load_model(tmp) - if isinstance(model, onnx.ModelProto): - return model - return onnx.load(Path(model).as_posix()) + onnx_model = onnx.load_model(tmp) + elif isinstance(model, onnx.ModelProto): + onnx_model = model + else: + raise ValueError(f"Cannot accept model of type {type(model)}") + if opset is not None: + onnx_model = check_onnx_version(onnx_model, opset) + onnx_model, check = simplify(onnx_model) + assert check, "Simplified ONNX model could not be validated" + return onnx.shape_inference.infer_shapes(onnx_model) def check_onnx_version(model, new_version): diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in index d7fd6c88840962b87a973c5d2d7b7aeff800ca52..0c1db9b1ff9d71cb9a8c8bbf3a2c64cec8331476 100644 --- a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in +++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm.cpp.in @@ -99,8 +99,7 @@ int main(int argc, char *argv[]){ void *result = static_cast<RootIn*>(args)->r.tensor; hpvm_request_tensor(result, 0); - uint32_t* labels = readLabelsBatch3(labels_path.c_str(), start, end); - computeAccuracy3(labels, result); + llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); freeBatchMemory(); } __hpvm__cleanup(); diff --git a/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in new file mode 100644 index 0000000000000000000000000000000000000000..94a8e0a534c04b323b4b66f369ab2d624a2a745f --- /dev/null +++ b/hpvm/projects/torch2hpvm/torch2hpvm/template_hpvm_inspect.cpp.in @@ -0,0 +1,168 @@ +#include <fstream> +#include <string> +#include <array> +#include <hpvm.h> +#include <tensorTypes.h> +#include <tensorUtils.h> + +// For writing binary to file 
descriptors +#include <cstdio> +// For printing error +#include <errno.h> +#include <unistd.h> + +const int batch_size = {{batch_size}}, input_size = {{input_size}}, batch_count = input_size / batch_size; + +/**** Routines for Handling Piped Execution ***/ + +FILE *open_fifo(const char *path, const char *mode) { + auto* fd = fopen(path, mode); + if (!fd) { + std::cerr << "Error opening FIFO file: " << strerror(errno) << '\n'; + abort(); + } + return fd; +} + +int fifo_wait() { + auto* fp = open_fifo("{{fifo_path_r}}", "r"); + const int maxn = 100; + char linebuf[maxn]; + fgets(linebuf, maxn, fp); + fclose(fp); + std::string line(linebuf); + if (line == "test") + return 1; + if (line == "tune") + return 2; + if (line == "stop") + return 0; + std::cerr << "Invalid fifo file content \"" << line << "\"\n"; + abort(); +} + +void fifo_write_batch(FILE *fp, void *output_ptr) { + auto *output = (Tensor *) output_ptr; + const auto &dim = output->dims; + size_t num_dims = dim.num_dims; + fwrite(&num_dims, sizeof(size_t), 1, fp); + fwrite(dim.dim_sizes, sizeof(size_t), dim.num_dims, fp); + fwrite(output->host_data, 1, output->size_in_bytes, fp); +} + +void write_accuracy(float accuracy) { + std::ofstream fout("final_accuracy"); + fout << std::fixed << accuracy; +} + +{% for node in nodes %} +void var_{{node.idx}}_node( +{%- for n in range(node.input_size) -%} +void *t{{n}}, size_t bytes_t{{n}}{{", " if not loop.last}} +{%- endfor %}) { + __hpvm__hint(hpvm::{{target.upper()}}_TARGET); + __hpvm__attributes({{node.input_size}}, {% for n in range(node.input_size) -%} +t{{n}}{{", " if not loop.last}} +{%- endfor %}, 0); + __hpvm__node_id({{node.idx + 1}}); + void *r = {{node.call_name}}({% for n in range(node.input_size) -%} +t{{n}}{{", " if not loop.last}} +{%- endfor %}{{", " if node.call_args}}{{node.call_args|join(", ")}}); + __hpvm__return(2, r, (size_t) 0); +} + +{% endfor -%} + +void root({%- for n in root_inputs -%} +void *{{n}}, size_t {{n}}_bytes{{", " if not loop.last}} 
+{%- endfor %}) { + __hpvm__hint(hpvm::CPU_TARGET); + __hpvm__attributes({{root_inputs|length}}, {% for n in root_inputs -%} +{{n}}{{", " if not loop.last}} +{%- endfor %}, 0); + +{% for node in nodes %} + void* var_{{node.idx}} = __hpvm__createNodeND(0, var_{{node.idx}}_node); +{% for edge in node.edges %} +{% if edge.is_bindin %} + __hpvm__bindIn(var_{{node.idx}}, {{edge.input_idx * 2}}, {{edge.edge_idx * 2}}, 0); + __hpvm__bindIn(var_{{node.idx}}, {{edge.input_idx * 2 + 1}}, {{edge.edge_idx * 2 + 1}}, 0); +{% else %} + __hpvm__edge(var_{{edge.input_node}}, var_{{node.idx}}, 1, 0, {{edge.edge_idx * 2}}, 0); + __hpvm__edge(var_{{edge.input_node}}, var_{{node.idx}}, 1, 1, {{edge.edge_idx * 2 + 1}}, 0); +{% endif %} +{% endfor %} + +{% endfor %} + __hpvm__bindOut(var_{{root_output_idx}}, 0, 0, 0); + __hpvm__bindOut(var_{{root_output_idx}}, 1, 1, 0); +} + +struct ret_t { + void* tensor; + size_t bytes; +}; + +typedef struct __attribute__((__packed__)) { +{% for n in root_inputs %} + void *{{n}}; + size_t {{n}}_bytes; +{% endfor %} + struct ret_t r; +} RootIn; + +int main(){ + std::string dir_prefix = "{{prefix}}/"; + std::string test_input = dir_prefix + "test_input.bin"; + std::string test_labels = dir_prefix + "test_labels.bin"; + std::string tune_input = dir_prefix + "tune_input.bin"; + std::string tune_labels = dir_prefix + "tune_labels.bin"; + +{% for w in weights %} + std::string {{w.name}}_path = dir_prefix + "{{w.filename}}"; + void* {{w.name}} = readTrainedWeights({{w.name}}_path.c_str(), 0, {{w.shape|join(', ')}}); +{% endfor %} + + RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); +{% for n in root_inputs %} +{% if n != input_name %} + args->{{n}} = {{n}}; + args->{{n}}_bytes = 0; +{% endif %} +{% endfor %} + + int ret = 0; + while ((ret = fifo_wait())) { + __hpvm__init(); + startMemTracking(); + const auto *input_pth = (ret == 1 ? test_input : tune_input).c_str(); + const auto *labels_pth = (ret == 1 ? 
test_labels : tune_labels).c_str(); + + // Keep this open so the other side knows we have more batches to write + auto* fp = open_fifo("{{fifo_path_w}}", "wb"); + float total_accuracy = 0; + for (int i = 0; i < batch_count; i++){ + int start = i * batch_size, end = start + batch_size; + void *{{input_name}} = readInputBatch(input_pth, 0, start, end, {{input_shape|join(', ')}}); + args->input = {{input_name}}; + args->input_bytes = 0; + + void* dfg = __hpvm__launch(0, root, (void*) args); + __hpvm__wait(dfg); + void *result = static_cast<RootIn*>(args)->r.tensor; + hpvm_request_tensor(result, 0); + + uint32_t* labels = readLabelsBatch3(labels_pth, start, end); + float accuracy = computeAccuracy3(labels, result); + total_accuracy += accuracy * batch_size; + + fifo_write_batch(fp, result); + freeBatchMemory(); + } + fclose(fp); + write_accuracy(total_accuracy / input_size); + __hpvm__cleanup(); + } + + return 0; +} diff --git a/hpvm/scripts/llvm_installer.sh b/hpvm/scripts/llvm_installer.sh index 3ed7fd3a951d27dedc9b84adf82835a0eedbd1e2..a8fa022047fb7983c466b618863a7b2a66a50f92 100755 --- a/hpvm/scripts/llvm_installer.sh +++ b/hpvm/scripts/llvm_installer.sh @@ -184,6 +184,7 @@ if [ ! -d $HPVM_DIR ]; then echo Adding HPVM sources to tree mkdir -p $HPVM_DIR ln -s $CURRENT_DIR/CMakeLists.txt $HPVM_DIR + ln -s $CURRENT_DIR/cmake $HPVM_DIR/ ln -s $CURRENT_DIR/include $HPVM_DIR/ ln -s $CURRENT_DIR/lib $HPVM_DIR/ ln -s $CURRENT_DIR/projects $HPVM_DIR/ @@ -208,7 +209,7 @@ if ! $AUTOMATE ; then echo "To complete installation, follow these instructions:" echo " - Create and navigate to a folder \"./build\" " echo " - Run \"cmake ../llvm [options]\". Find potential options in README.md." - echo " - Run \"make -j<number of threads>\" and then \"make install\"" + echo " - Run \"make -j<number of threads> approxhpvm.py\" and then \"make install\"" echo "For more details refer to README.md." echo echo "Exiting." 
@@ -237,8 +238,8 @@ cd $BUILD_DIR echo cmake ../$LLVM_SRC -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLLVM_TARGETS_TO_BUILD=$TARGET -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR cmake ../$LLVM_SRC -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DLLVM_TARGETS_TO_BUILD=$TARGET -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR -echo make -j$NUM_THREADS -make -j$NUM_THREADS +echo make -j$NUM_THREADS approxhpvm.py +make -j$NUM_THREADS approxhpvm.py #make install if [ -f $BUILD_DIR/tools/hpvm/projects/$HPVM_RT ]; then diff --git a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt index 76d6910d2d43d641f5a2dfff1d48b39fe25686a4..37a856123d1ea9ee074a5ac2844b223a78c56e16 100644 --- a/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt +++ b/hpvm/test/dnn_benchmarks/hpvm-c/CMakeLists.txt @@ -1,5 +1,5 @@ # First get approxhpvm.py which we then use to compile benchmarks. -get_filename_component(APPROXHPVM_PY ${PROJECT_BINARY_DIR}/bin/approxhpvm.py REALPATH) +get_filename_component(APPROXHPVM_PY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py REALPATH) # Configure config.h which tells the benchmarks where's the model parameter directory. 
# We can also use the one in tensor_runtime, but we're avoiding that so as to diff --git a/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py b/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py index 5918d960a745e5d245410acfac7c827b5b011f14..6ddc1b8ea35df8a098e98a74cfa313cd9bf9e7a8 100644 --- a/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py +++ b/hpvm/test/dnn_benchmarks/pytorch/dnn/_container.py @@ -16,10 +16,10 @@ def make_conv_pool_activ( **conv_kwargs ): layers = [Conv2d(in_channels, out_channels, kernel_size, **conv_kwargs)] - if pool_size is not None: - layers.append(MaxPool2d(pool_size, stride=pool_stride)) if activation: layers.append(activation()) + if pool_size is not None: + layers.append(MaxPool2d(pool_size, stride=pool_stride)) return layers diff --git a/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py b/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py index 7395136eb5f19adc2ad3450c34b60c911f72747e..19f17366459a7684c6df8a940438b661cf7f6029 100644 --- a/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py +++ b/hpvm/test/dnn_benchmarks/pytorch/test_frontend.py @@ -42,15 +42,11 @@ for model_cls, nch, img_size, batch_size, pathname in benchmarks: checkpoint = self_folder / "../model_params" / f"{pathname}.pth.tar" model.load_state_dict(torch.load(checkpoint.as_posix())) - exporter = ModelExporter(model, bin_tuneset, bin_testset, codegen_dir) - exporter.export_all(batch_size=batch_size) - - conf_file = self_folder / "../hpvm-c/benchmarks" / pathname / "data/tuner_confs.txt" build_dir = codegen_dir / "build" target_binary = build_dir / pathname - run([ - "approxhpvm.py", str(codegen_dir / ModelExporter.source_file_name), str(target_binary), - "-d", str(build_dir), - "-t", "tensor", "--conf-file", str(conf_file) - ], check=True) + conf_file = self_folder / "../hpvm-c/benchmarks" / pathname / "data/tuner_confs.txt" + exporter = ModelExporter( + model, bin_tuneset, bin_testset, codegen_dir, config_file=conf_file + ) + 
exporter.generate(batch_size=batch_size).compile(target_binary, build_dir) run([str(target_binary), "test"], check=True) diff --git a/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py b/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py new file mode 100644 index 0000000000000000000000000000000000000000..d0451b70b44325a355345ad95ab9bf85154002c5 --- /dev/null +++ b/hpvm/test/dnn_benchmarks/pytorch/test_tuning.py @@ -0,0 +1,80 @@ +import os +import shutil +import site +from pathlib import Path + +import torch +from predtuner import config_pylogger +from predtuner.pipedbin import PipedBinaryApp +from torch2hpvm import BinDataset, ModelExporter +from torch.nn import Module + +site.addsitedir(os.path.dirname(__file__)) +import dnn + +# Set up logger to put log file in /tmp +msg_logger = config_pylogger(output_dir="/tmp", verbose=True) + + +benchmarks = [ + (dnn.LeNet, 1, 28, 500, "lenet_mnist"), + (dnn.AlexNet, 3, 32, 500, "alexnet_cifar10"), + (dnn.AlexNet2, 3, 32, 500, "alexnet2_cifar10"), + (dnn.AlexNetImageNet, 3, 224, 100, "alexnet_imagenet"), + (dnn.MobileNet, 3, 32, 500, "mobilenet_cifar10"), + (dnn.ResNet18, 3, 32, 500, "resnet18_cifar10"), + (dnn.ResNet50, 3, 224, 50, "resnet50_imagenet"), + (dnn.VGG16Cifar10, 3, 32, 500, "vgg16_cifar10"), + (dnn.VGG16Cifar100, 3, 32, 500, "vgg16_cifar100"), + (dnn.VGG16ImageNet, 3, 224, 50, "vgg16_imagenet"), +] +model_param = Path(__file__).parent / "../model_params" + + +def generate(model_cls, nch, img_size, batch_size, pathname): + codegen_dir = Path(f"/tmp/{pathname}_tune") + build_dir = codegen_dir / "build" + metadata_file = codegen_dir / "ops.json" + binary_file = build_dir / pathname + build_dir = codegen_dir / "build" + # if binary_file.is_file() and metadata_file.is_file(): + # return binary_file, metadata_file + + print(f"Generating {pathname} to {codegen_dir}") + if codegen_dir.exists(): + shutil.rmtree(codegen_dir) + params = model_param / pathname + dataset_shape = 5000, nch, img_size, img_size + bin_tuneset = 
BinDataset( + params / "tune_input.bin", params / "tune_labels.bin", dataset_shape + ) + bin_testset = BinDataset( + params / "test_input.bin", params / "test_labels.bin", dataset_shape + ) + model: Module = model_cls() + checkpoint = model_param / f"{pathname}.pth.tar" + model.load_state_dict(torch.load(checkpoint.as_posix())) + exporter = ModelExporter( + model, bin_tuneset, bin_testset, codegen_dir, target="hpvm_tensor_inspect" + ) + exporter.generate(batch_size=batch_size).compile(binary_file, build_dir) + return binary_file, metadata_file + + +def main(): + for model_cls, nch, img_size, batch_size, pathname in benchmarks: + print(f"Testing {pathname}") + binary_file, metadata_file = generate( + model_cls, nch, img_size, batch_size, pathname + ) + app = PipedBinaryApp("test", binary_file, metadata_file) + tuner = app.get_tuner() + tuner.tune(100, 3.0, 3.0, True, 50, cost_model="cost_linear") + tuner.dump_configs("configs.json") + fig = tuner.plot_configs(show_qos_loss=True) + fig.savefig("configs.png", dpi=300) + app.dump_hpvm_configs(tuner.best_configs, "hpvm_confs.txt") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/hpvm/tools/py-approxhpvm/CMakeLists.txt b/hpvm/tools/py-approxhpvm/CMakeLists.txt index e46c45623f13034e1cb4c5b1ed2434ec40d4c12c..60fbc66aadd362e6aceb507dec5f1bec1223c418 100644 --- a/hpvm/tools/py-approxhpvm/CMakeLists.txt +++ b/hpvm/tools/py-approxhpvm/CMakeLists.txt @@ -1,9 +1,9 @@ # This file is very tightly coupled with main.py.in. # Watch out and keep them in sync. -set(LLVM_PROJECT_DIR ${PROJECT_SOURCE_DIR}) -set(LLVM_BUILD_DIR ${PROJECT_BINARY_DIR}) -set(LIB_DIR ${PROJECT_BINARY_DIR}/lib) +set(LLVM_PROJECT_DIR ${CMAKE_SOURCE_DIR}) +set(LLVM_BUILD_DIR ${CMAKE_BINARY_DIR}) +set(LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) # The hpvm-rt runtime # This has to be explicitly set as hpvm-rt.bc is created in a custom_target # and does not export its file location. 
@@ -26,7 +26,8 @@ set( LLVMClearDFG LLVMGenHPVM ) -# CUDA_TOOLKIT_ROOT_DIR is already defined +# CUDA_TOOLKIT_ROOT_DIR and CUDNN_LIBRARY_PATH have been defined globally; CUDNN_DIR is the name main.py.in substitutes +set(CUDNN_DIR ${CUDNN_LIBRARY_PATH}) # First resolve all `@symbol@` by configuring the file configure_file(main.py.in ${CMAKE_CURRENT_BINARY_DIR}/main.py.conf) # Then resolve all generator expressions we configured into the previous file @@ -51,9 +52,9 @@ set( clang opt llvm-link ) add_custom_command( - OUTPUT ${PROJECT_BINARY_DIR}/bin/approxhpvm.py - COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/main.py ${PROJECT_BINARY_DIR}/bin/approxhpvm.py - COMMAND chmod +x ${PROJECT_BINARY_DIR}/bin/approxhpvm.py + OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/main.py ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py # NOTE(review): cp and chmod are POSIX tools, so this rule is not portable to non-Unix hosts; also assumes CMAKE_RUNTIME_OUTPUT_DIRECTORY is set by an enclosing CMakeLists - confirm + COMMAND chmod +x ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py DEPENDS ${DEPS} ${CMAKE_CURRENT_BINARY_DIR}/main.py ) -add_custom_target(approxhpvm.py ALL DEPENDS ${PROJECT_BINARY_DIR}/bin/approxhpvm.py) +add_custom_target(approxhpvm.py ALL DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/approxhpvm.py) diff --git a/hpvm/tools/py-approxhpvm/main.py.in b/hpvm/tools/py-approxhpvm/main.py.in index 752a7609ca0831838949b037ac7b8c0323ac8871..fdbbaec1ccc070f87bedcd0f0c646e12531d99fe 100644 --- a/hpvm/tools/py-approxhpvm/main.py.in +++ b/hpvm/tools/py-approxhpvm/main.py.in @@ -12,6 +12,7 @@ CUDA_TOOLKIT_ROOT_DIR = Path("@CUDA_TOOLKIT_ROOT_DIR@") TENSOR_RUNTIME_LIBS = "@TENSOR_RUNTIME_LIBS@".split(";") AVAILABLE_PASSES = "@AVAILABLE_PASSES@".split(";") HPVM_RT_PATH = "@HPVM_RT_PATH@" +CUDNN_DIR = "@CUDNN_DIR@" # NOTE(review): plain str, unlike the Path-typed CUDA_TOOLKIT_ROOT_DIR that LINK_DIRS also holds - confirm consumers accept both # Directories to include INCLUDE_DIRS = [ @@ -21,7 +22,7 @@ INCLUDE_DIRS = [ HPVM_PROJECT_DIR / "test/dnn_benchmarks/hpvm-c/include", # hpvm-c intrinsics decl dir CUDA_TOOLKIT_ROOT_DIR / "include", # CUDA include dir ] -LINK_DIRS = [CUDA_TOOLKIT_ROOT_DIR / "lib64"] +LINK_DIRS = [CUDA_TOOLKIT_ROOT_DIR / "lib64", CUDNN_DIR] LINK_LIBS = [ "pthread", "cudart", 
"curand", "cudnn", "cublas", "cufft", "OpenCL", "stdc++fs", "omp", "m" ]