diff --git a/README.md b/README.md
index cc8891b8b4d005758306614569a5253b2249c94b..3cac5cadb819ef890eb12bad858816c9c6dcbd2b 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ Build hpvm
 ```shell
 mkdir install
 mkdir build && cd build
-cmake ../llvm -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DCMAKE_INSTALL_PREFIX=../install
+cmake ../llvm -DCMAKE_BUILD_TYPE=Debug -DLLVM_TARGETS_TO_BUILD="X86" -DCMAKE_INSTALL_PREFIX=../install
 make -j<number of threads you want to use to build hpvm>
 ```
 
diff --git a/llvm/projects/hpvm-tensor-rt/bin/times.py b/llvm/projects/hpvm-tensor-rt/bin/times.py
new file mode 100644
index 0000000000000000000000000000000000000000..082b0d91acb19e70a6c217b25f8747f3197b45b7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/bin/times.py
@@ -0,0 +1,74 @@
+class Config:
+  def __init__(self):
+    self.runtime = 0    # runtime of one benchmark run, in seconds
+    self.fed_runs = 0   # configurations evaluated in the "fed" tuning pass
+    self.full_runs = 0  # configurations evaluated in the full tuning pass
+
+
+def computeTimes(bench):
+  # Every configuration is profiled conf_runs times, on top of a flat
+  # 100 runs (fed) or 1000 runs (full) of the benchmark itself.
+  conf_runs = 60
+  fed_time = (bench.runtime * 100) + (bench.fed_runs * conf_runs * bench.runtime)
+  fed_time_hrs = fed_time / (60.0 * 60)
+
+  full_time = (bench.runtime * 1000) + (bench.full_runs * conf_runs * bench.runtime)
+  full_time_hrs = full_time / (60.0 * 60)
+
+  print("fed_time_hrs = ", fed_time_hrs, " full_time_hrs = ", full_time_hrs, "\n")
+
+
+if __name__ == "__main__":
+
+  resnet = Config()
+  resnet.runtime = 8
+  resnet.fed_runs = 3
+  resnet.full_runs = 5
+
+  computeTimes(resnet)
+
+  alexnet = Config()
+  alexnet.runtime = 7.8
+  alexnet.fed_runs = 47
+  alexnet.full_runs = 274
+
+  computeTimes(alexnet)
+
+  alexnet2 = Config()
+  alexnet2.runtime = 2.3
+  alexnet2.fed_runs = 62
+  alexnet2.full_runs = 339
+
+  computeTimes(alexnet2)
+
+  vgg1 = Config()
+  vgg1.runtime = 7.4
+  vgg1.fed_runs = 15
+  vgg1.full_runs = 211
+
+  computeTimes(vgg1)
+  
+
+  vgg2 = Config()
+  vgg2.runtime = 15.4
+  vgg2.fed_runs = 8
+  vgg2.full_runs = 150
+
+  computeTimes(vgg2)
+  
+  
+  lenet = Config()
+  lenet.runtime = 0.98
+  lenet.fed_runs = 64
+  lenet.full_runs = 228
+
+  computeTimes(lenet)
+  
+  
+  mobilenet = Config()
+  mobilenet.runtime = 11
+  mobilenet.fed_runs = 32
+  mobilenet.full_runs = 267
+
+  computeTimes(mobilenet)
+
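As a sanity check on the formula above, here is the resnet entry worked out by hand (runtime 8, fed_runs 3, full_runs 5, conf_runs 60); treating `runtime` as seconds is an assumption, inferred only from the hours conversion in the script:

```python
# Same arithmetic as computeTimes, spelled out for the resnet numbers.
runtime, fed_runs, full_runs, conf_runs = 8, 3, 5, 60

fed_time = runtime * 100 + fed_runs * conf_runs * runtime     # 800 + 1440 = 2240 s
full_time = runtime * 1000 + full_runs * conf_runs * runtime  # 8000 + 2400 = 10400 s

print(fed_time / 3600.0, full_time / 3600.0)  # ~0.62 h and ~2.89 h
```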
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..095e037430dbf1751dddfd047d0cf0157ad9e2e7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/CMakeLists.txt
@@ -0,0 +1,119 @@
+cmake_minimum_required (VERSION 2.6)
+project (cudnn-training)
+
+find_package(CUDA 6.5 REQUIRED)
+
+
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+  message("Debug mode")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_60,code=compute_60;-std=c++11;-g;-lineinfo;-Xcompiler;-ggdb;-lcurand)
+else()
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_60,code=compute_60;-std=c++11;-DNDEBUG;-Xcompiler;-DNDEBUG;-lcurand)
+endif()
+
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+add_definitions(-DNO_INJECTION)
+add_definitions(-DPROMISE_TUNER_ENABLED)
+if(USE_GFLAGS)
+  add_definitions(-DUSE_GFLAGS)
+endif()
+
+if(USE_AUTOTUNER)
+  remove_definitions(-DNO_INJECTION)
+endif()
+
+
+
+include_directories($ENV{CUDNN_PATH} /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/$ENV{CUDNN_PATH}/include)
+include_directories(/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/./tensor_runtime/include)
+include_directories(/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../gpu_profiler/include)
+include_directories(/home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../soc_simulator/include)
+link_directories($ENV{CUDNN_PATH} /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/$ENV{CUDNN_PATH}/lib /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/$ENV{CUDNN_PATH}/lib64)
+
+
+cuda_add_library(tensor_runtime /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu)
+cuda_add_cublas_to_target(tensor_runtime)
+
+cuda_add_library(tensor_cpu_runtime /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc)
+
+find_library(GPU_PROFILER_LIB
+    NAMES libgpu_profiler.a
+    HINTS /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../gpu_profiler/lib
+)
+
+find_library(SOC_SIMULATOR_LIB
+    NAMES libpromise_profiler.a
+    HINTS /home/nvidia/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/../soc_simulator/lib
+)
+
+
+if(USE_GFLAGS)
+  target_link_libraries(tensor_runtime gflags cudnn -lcurand)
+else()
+  target_link_libraries(tensor_runtime cudnn -lcurand)
+endif()
+
+target_link_libraries(tensor_cpu_runtime)
+
+# lenet_keras_half_autogenerated_knobs
+add_executable(lenet_keras_fp16_perf20 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf20.cc)
+target_link_libraries(lenet_keras_fp16_perf20 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf26 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf26.cc)
+target_link_libraries(lenet_keras_fp16_perf26 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf22 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf22.cc)
+target_link_libraries(lenet_keras_fp16_perf22 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf25 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf25.cc)
+target_link_libraries(lenet_keras_fp16_perf25 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf23 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf23.cc)
+target_link_libraries(lenet_keras_fp16_perf23 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_samp33 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp33.cc)
+target_link_libraries(lenet_keras_fp16_samp33 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf24 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf24.cc)
+target_link_libraries(lenet_keras_fp16_perf24 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_samp31 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp31.cc)
+target_link_libraries(lenet_keras_fp16_samp31 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf30 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf30.cc)
+target_link_libraries(lenet_keras_fp16_perf30 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_samp36 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp36.cc)
+target_link_libraries(lenet_keras_fp16_samp36 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf21 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf21.cc)
+target_link_libraries(lenet_keras_fp16_perf21 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_samp34 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp34.cc)
+target_link_libraries(lenet_keras_fp16_samp34 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_samp32 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp32.cc)
+target_link_libraries(lenet_keras_fp16_samp32 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_samp35 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_samp35.cc)
+target_link_libraries(lenet_keras_fp16_samp35 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf29 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf29.cc)
+target_link_libraries(lenet_keras_fp16_perf29 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf27 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf27.cc)
+target_link_libraries(lenet_keras_fp16_perf27 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+add_executable(lenet_keras_fp16_perf28 lenet_keras_half_autogenerated_knobs/lenet_keras_fp16_perf28.cc)
+target_link_libraries(lenet_keras_fp16_perf28 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+
+
+# lenet_keras_autogenerated_knobs
+add_executable(lenet_keras_fp32_perf20 lenet_keras_autogenerated_knobs/lenet_keras_fp32_perf20.cc)
+target_link_libraries(lenet_keras_fp32_perf20 tensor_runtime ${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})
+
+
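The `add_executable`/`target_link_libraries` pairs above are clearly machine-generated, one pair per knob variant, with the binary name reused as the source basename. A minimal sketch of the kind of emitter that could produce them; `emit_cmake_pairs` and its arguments are illustrative names, not the actual generator's API:

```python
# Hypothetical emitter for the repeated add_executable /
# target_link_libraries pairs above; illustrative only.
def emit_cmake_pairs(dir_name, binaries):
    lines = ["# %s" % dir_name]
    for binary in binaries:
        lines.append("add_executable(%s %s/%s.cc)" % (binary, dir_name, binary))
        lines.append("target_link_libraries(%s tensor_runtime "
                     "${GPU_PROFILER_LIB} ${SOC_SIMULATOR_LIB})" % binary)
        lines.append("")
    return "\n".join(lines)

print(emit_cmake_pairs("lenet_keras_half_autogenerated_knobs",
                       ["lenet_keras_fp16_perf20", "lenet_keras_fp16_samp31"]))
```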
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..563d7f4a03b3b3a50e2c08c76616a88ea7958b5a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16.txt
@@ -0,0 +1,7 @@
+../dnn_sources/src/half/profiling/alexnet2_cifar10_half_profiling.cc
+../dnn_sources/src/half/profiling/alexnet_cifar10_half_profiling.cc
+../dnn_sources/src/half/profiling/mobilenet_depthwise_half_profiling.cc
+../dnn_sources/src/half/profiling/mobilenet_shallow_depthwise_half_profiling.cc
+../dnn_sources/src/half/profiling/resnet18_cifar10_half_profiling.cc
+../dnn_sources/src/half/profiling/vgg16_cifar100_half_profiling.cc
+../dnn_sources/src/half/profiling/vgg16_cifar10_half_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_first_three.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_first_three.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4a0beb250e2241c7523e69b5262cb9ffc977d28d
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_first_three.txt
@@ -0,0 +1,3 @@
+../dnn_sources/src/half/profiling/alexnet2_cifar10_half_profiling.cc
+../dnn_sources/src/half/profiling/alexnet_cifar10_half_profiling.cc
+../dnn_sources/src/half/profiling/resnet18_cifar10_half_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_remainder.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_remainder.txt
new file mode 100644
index 0000000000000000000000000000000000000000..20ca95abcf1ee1aab337fa391abb5f1a74583fe1
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_remainder.txt
@@ -0,0 +1,4 @@
+../dnn_sources/src/half/profiling/mobilenet_depthwise_half_profiling.cc
+../dnn_sources/src/half/profiling/mobilenet_shallow_depthwise_half_profiling.cc
+../dnn_sources/src/half/profiling/vgg16_cifar100_half_profiling.cc
+../dnn_sources/src/half/profiling/vgg16_cifar10_half_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_sources.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_sources.txt
new file mode 100644
index 0000000000000000000000000000000000000000..506497e42889dc1d8bb2465912e87f56464e7ecc
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp16_sources.txt
@@ -0,0 +1 @@
+../dnn_sources/src/half/lenet_keras_half.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..12b87930416c4269a62f2020a06b42cf5cf4dc13
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32.txt
@@ -0,0 +1,9 @@
+../dnn_sources/src/profiling/alexnet2_profiling.cc
+../dnn_sources/src/profiling/alexnet_cifar10_profiling.cc
+../dnn_sources/src/profiling/mobilenet_cifar10_profiling.cc
+../dnn_sources/src/profiling/mobilenet_shallow_profiling.cc
+../dnn_sources/src/profiling/mobilenet_depthwise_profiling.cc
+../dnn_sources/src/profiling/mobilenet_shallow_depthwise_profiling.cc
+../dnn_sources/src/profiling/resnet18_cifar10_profiling.cc
+../dnn_sources/src/profiling/vgg16_cifar100_profiling.cc
+../dnn_sources/src/profiling/vgg16_cifar10_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_sources.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_sources.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cd8f03c30712f0162db2cc8bcf563087be05bf64
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_sources.txt
@@ -0,0 +1 @@
+../dnn_sources/src/lenet_keras.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_test.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a59f773cda240a311c0c873c9366494018b87312
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_fp32_test.txt
@@ -0,0 +1 @@
+../dnn_sources/src/profiling/mobilenet_shallow_depthwise_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_mobilenet_depth.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_mobilenet_depth.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2b7382da3570917c1983ad0c3fe02763d8565635
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_mobilenet_depth.txt
@@ -0,0 +1,2 @@
+../dnn_sources/src/profiling/mobilenet_depthwise_profiling.cc
+../dnn_sources/src/profiling/mobilenet_shallow_depthwise_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_one_file.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_one_file.txt
new file mode 100644
index 0000000000000000000000000000000000000000..32b18d4ca22672be6b44ecb674ea3ad00e18276d
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/filenames_one_file.txt
@@ -0,0 +1,2 @@
+../dnn_sources/src/half/profiling/vgg16_cifar100_half_profiling.cc
+../dnn_sources/src/half/profiling/vgg16_cifar10_half_profiling.cc
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..207eb1ed1f45ffde7dad0da4e125aa0ceaa5c5cd
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16.txt
@@ -0,0 +1,17 @@
+perf,20 1,1,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,21 1,2,1,0   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,22 1,2,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,23 1,3,1,0   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,24 1,3,1,1   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,25 1,3,1,2   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,26 2,1,1,0   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,27 2,1,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,28 3,1,1,0   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,29 3,1,1,1   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,30 3,1,1,2   1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,31 1,1,2,0     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,32 1,1,2,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,33 1,1,4,0     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,34 1,1,4,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,35 1,1,4,2     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,36 1,1,4,3     1.88    tensorHalfConvolution   tensorConvApproxHalf
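Each knob line follows the same whitespace-separated layout: `<approx>,<id>`, a comma-separated knob-parameter tuple, an expected speedup, the original tensor op, and its approximate replacement. A minimal parser sketch, assuming that column interpretation (inferred from the files themselves, not documented here):

```python
# Sketch of a parser for the knob_config_* line format; the field
# meanings are inferred from the file layout above.
def parse_knob_line(line):
    fields = line.split()
    approx, knob_id = fields[0].split(",")
    return {
        "approx": approx,                              # "perf" / "samp" / "fp16"
        "id": int(knob_id),
        "params": [int(p) for p in fields[1].split(",")],
        "speedup": float(fields[2]),
        "orig_op": fields[3],                          # e.g. tensorHalfConvolution
        "approx_op": fields[4],                        # e.g. tensorConvApproxHalf
    }

print(parse_knob_line(
    "perf,20 1,1,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf"))
```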
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_knobs_31_36.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_knobs_31_36.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fc76565110cf34ab57024dd852c1a51b23a8f45e
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_knobs_31_36.txt
@@ -0,0 +1,6 @@
+samp,31 1,1,2,0     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,32 1,1,2,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,33 1,1,4,0     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,34 1,1,4,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,35 1,1,4,2     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,36 1,1,4,3     1.88    tensorHalfConvolution   tensorConvApproxHalf
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_old.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_old.txt
new file mode 100644
index 0000000000000000000000000000000000000000..72c43e61288c532feed94f5768357b3113d5de49
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_old.txt
@@ -0,0 +1,18 @@
+perf,20 1,1,0   2.25    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,21 1,2,0   2.25    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,22 1,2,1   2.25    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,23 1,3,0   1.88    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,24 1,3,1   1.88    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,25 1,3,2   1.88    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,26 2,1,0   2.25    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,27 2,1,1   2.25    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,28 3,1,0   1.88    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,29 3,1,1   1.88    tensorHalfConvolution   tensorConvPerfCudaHalf
+perf,30 3,1,2   1.88    tensorHalfConvolution   tensorConvPerfCudaHalf
+samp,31 2,0     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,32 2,1     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,33 4,0     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,34 4,1     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,35 4,2     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,36 4,3     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,37 1,1     1.88    tensorHalfConvolution   tensorConvInputHalf
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_samp.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_samp.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0f0593226f6fbeddda91046e7416fe108bfb6d90
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_samp.txt
@@ -0,0 +1,7 @@
+samp,31 2,0     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,32 2,1     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,33 4,0     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,34 4,1     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,35 4,2     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,36 4,3     1.88    tensorHalfConvolution   tensorConvInputHalf
+samp,37 1,1     1.88    tensorHalfConvolution   tensorConvInputHalf
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_vgg16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_vgg16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a172a4e515ebfd24a51267da8bac2cb5f13ce6c0
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp16_vgg16.txt
@@ -0,0 +1,13 @@
+perf,20 1,1,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,21 1,2,1,0   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,22 1,2,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,23 1,3,1,0   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,24 1,3,1,1   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,25 1,3,1,2   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,26 2,1,1,0   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,27 2,1,1,1   2.25    tensorHalfConvolution   tensorConvApproxHalf
+perf,28 3,1,1,0   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,29 3,1,1,1   1.88    tensorHalfConvolution   tensorConvApproxHalf
+perf,30 3,1,1,2   1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,32 1,1,2,1     1.88    tensorHalfConvolution   tensorConvApproxHalf
+samp,36 1,1,4,3     1.88    tensorHalfConvolution   tensorConvApproxHalf
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..78f3e361ee8a96c6520793b435815210e1fc7117
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32.txt
@@ -0,0 +1,17 @@
+perf,20 1,1,1,1   2.25    tensorConvolution   tensorConvApprox
+perf,21 1,2,1,0   2.25    tensorConvolution   tensorConvApprox
+perf,22 1,2,1,1   2.25    tensorConvolution   tensorConvApprox
+perf,23 1,3,1,0   1.88    tensorConvolution   tensorConvApprox
+perf,24 1,3,1,1   1.88    tensorConvolution   tensorConvApprox
+perf,25 1,3,1,2   1.88    tensorConvolution   tensorConvApprox
+perf,26 2,1,1,0   2.25    tensorConvolution   tensorConvApprox
+perf,27 2,1,1,1   2.25    tensorConvolution   tensorConvApprox
+perf,28 3,1,1,0   1.88    tensorConvolution   tensorConvApprox
+perf,29 3,1,1,1   1.88    tensorConvolution   tensorConvApprox
+perf,30 3,1,1,2   1.88    tensorConvolution   tensorConvApprox
+samp,31 1,1,2,0     1.88    tensorConvolution   tensorConvApprox
+samp,32 1,1,2,1     1.88    tensorConvolution   tensorConvApprox
+samp,33 1,1,4,0     1.88    tensorConvolution   tensorConvApprox
+samp,34 1,1,4,1     1.88    tensorConvolution   tensorConvApprox
+samp,35 1,1,4,2     1.88    tensorConvolution   tensorConvApprox
+samp,36 1,1,4,3     1.88    tensorConvolution   tensorConvApprox
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_baseline.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_baseline.txt
new file mode 100644
index 0000000000000000000000000000000000000000..df001ba497d0ed440dd34beead33d607651d3f35
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_baseline.txt
@@ -0,0 +1 @@
+perf,20 1,1,1,1   2.25    tensorConvolution   tensorConvApprox
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_old.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_old.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36a7dbca05ef71b6046a91066acf5382f2a5c7a3
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_old.txt
@@ -0,0 +1,11 @@
+perf,20 1,1,0   2.25    tensorConvolution   tensorConvPerfCuda
+perf,21 1,2,0   2.25    tensorConvolution   tensorConvPerfCuda
+perf,22 1,2,1   2.25    tensorConvolution   tensorConvPerfCuda
+perf,23 1,3,0   1.88    tensorConvolution   tensorConvPerfCuda
+perf,24 1,3,1   1.88    tensorConvolution   tensorConvPerfCuda
+perf,25 1,3,2   1.88    tensorConvolution   tensorConvPerfCuda
+perf,26 2,1,0   2.25    tensorConvolution   tensorConvPerfCuda
+perf,27 2,1,1   2.25    tensorConvolution   tensorConvPerfCuda
+perf,28 3,1,0   1.88    tensorConvolution   tensorConvPerfCuda
+perf,29 3,1,1   1.88    tensorConvolution   tensorConvPerfCuda
+perf,30 3,1,2   1.88    tensorConvolution   tensorConvPerfCuda
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_to_fp16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_to_fp16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..913397cc4936bf11f3eefa15b5804700865e7b6b
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_to_fp16.txt
@@ -0,0 +1 @@
+fp16,12 0   1.5     tensorConvolution   tensorHalfConvolution
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_vgg16.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_vgg16.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fbab7d7b85255cd86748634faea0bf48ed75e42
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_fp32_vgg16.txt
@@ -0,0 +1,13 @@
+perf,20 1,1,1,1   2.25    tensorConvolution   tensorConvApprox
+perf,21 1,2,1,0   2.25    tensorConvolution   tensorConvApprox
+perf,22 1,2,1,1   2.25    tensorConvolution   tensorConvApprox
+perf,23 1,3,1,0   1.88    tensorConvolution   tensorConvApprox
+perf,24 1,3,1,1   1.88    tensorConvolution   tensorConvApprox
+perf,25 1,3,1,2   1.88    tensorConvolution   tensorConvApprox
+perf,26 2,1,1,0   2.25    tensorConvolution   tensorConvApprox
+perf,27 2,1,1,1   2.25    tensorConvolution   tensorConvApprox
+perf,28 3,1,1,0   1.88    tensorConvolution   tensorConvApprox
+perf,29 3,1,1,1   1.88    tensorConvolution   tensorConvApprox
+perf,30 3,1,1,2   1.88    tensorConvolution   tensorConvApprox
+samp,32 1,1,2,1     1.88    tensorConvolution   tensorConvApprox
+samp,36 1,1,4,3     1.88    tensorConvolution   tensorConvApprox
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_test.txt b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_test.txt
new file mode 100644
index 0000000000000000000000000000000000000000..68686b25de1c607e34d75044cd7ff19cf0c8890a
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/knob_config_test.txt
@@ -0,0 +1 @@
+fp16,12 0   1.5     tensorHalfConvolution   tensorHalfConvolution
diff --git a/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py b/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py
index f53573f7cde9420400194827d55d84d69e2ace5b..d6c3d63112c83cd9b545914a9a33f4c5b5dae6ce 100644
--- a/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py
+++ b/llvm/projects/soc_simulator/src/driver_new_config_fp16_repl.py
@@ -4,8 +4,6 @@ import subprocess
 import sys
 
 class Driver:
-    fp16_swing = 8
-
     class PrecisionTypes:
         FP16 = 0
         FP32 = 1
@@ -14,6 +12,7 @@ class Driver:
     class ApproxTypes:
         PERF = 3 
         SAMP = 4
+        REDUCE = 5
 
     results_time_key = "Time"
     results_energy_key = "Energy"
@@ -65,7 +64,8 @@ class Driver:
             return "PERF"
         elif appr == Driver.ApproxTypes.SAMP:
             return "SAMP"
-
+        elif appr == Driver.ApproxTypes.REDUCE:
+            return "REDUCE"
 
     def driver(self):
         self.__parse_tensor_layer_file()
@@ -189,7 +189,6 @@ class Driver:
                     curr_conf_results.append((layer_as_lst[1], layer_results))
                     line = config_file.readline().strip()
                     continue
-                
                 layer_ind = int(layer_as_lst[0]) - 1
                 layer_table_data = self.__tensor_layers[layer_ind]
                 layer_name = layer_table_data["Name"]
@@ -208,6 +207,8 @@ class Driver:
                         time, energy = self.__run_promise_simulation(param_val, layer_table_data)
                         total_time += time
                         total_energy += energy
+                        print("Curr promise: ", time, energy)
+                    print("Total promise: ", total_time, total_energy)
                     layer_results.append((total_time, total_energy, ' '.join(layer_as_lst[2:])))
 
                 elif Driver.is_gpu(layer_as_lst[1]):
@@ -227,22 +228,23 @@ class Driver:
                             curr_layer = Driver.PrecisionTypes.FP16
                         elif line.find("fp32") != -1:
                             curr_layer = Driver.PrecisionTypes.FP32
-                        if precision_type == "perf" or precision_type == "samp": # Handle approx type
+                        if precision_type == "perf" or precision_type == "samp" or precision_type == "reduce": # Handle approx type
                             if precision_type == "perf": 
                                 approx_type = Driver.ApproxTypes.PERF
                             elif precision_type == "samp": 
                                 approx_type = Driver.ApproxTypes.SAMP
+                            elif precision_type == "reduce":
+                                approx_type = Driver.ApproxTypes.REDUCE
                             curr_layer = Driver.PrecisionTypes.FP16
-                        print(curr_layer, prev_layer)
                         quant_time, quant_energy = self.__quantize(precision_type, op_number, curr_layer, prev_layer, tensor_count, layer_table_data)
                         if quant_time != 0:
                             assert i == 2 #and layer_ind == 0
                         conv_time, conv_energy = self.__run_gpu_simulation(curr_layer, layer_name, \
                                     tensor_count, approx_type, op_number) 
+                        print(quant_time, conv_time)
                         layer_results.append((quant_time + conv_time, quant_energy + conv_energy, ' '.join(layer_as_lst[i : i + 3])))
                         prev_layer = curr_layer
                         tensor_count += 1
-
                 line = config_file.readline().strip()
                 prev_layer = curr_layer
                 curr_conf_results.append((layer_as_lst[1], layer_results))
@@ -256,9 +258,8 @@ class Driver:
 
                 has_quantized = False
                 for layer_ind, (hardware, layer) in enumerate(curr_conf_results):
-                    if len(layer) == 1 and layer[0][2].find("softmax") != -1: continue
+                    if layer[0][2].find("softmax") != -1: continue
                     fp16_layer = []
-                    #print(layer_ind, hardware, layer)
                     layer_table_data = self.__tensor_layers[layer_ind]
                     layer_name = layer_table_data["Name"]
 
@@ -287,7 +288,8 @@ class Driver:
                     or prev_layer == Driver.PrecisionTypes.PROMISE:
             return 0.0, 0.0
         layer_name = layer_data["Name"]
-
+        print("QUANTIZATION")
+        print(precision_type, op_number, self.__get_str(curr_layer), self.__get_str(prev_layer), h2f_f2h_operation_ind, layer_data)
         # NOTE: Ignoring logic where curr == promise or prev == promise bc 
         # smartDMA is always true so we'd return near the beginning of the method
 
@@ -302,17 +304,16 @@ class Driver:
         else:
             lookup_key = "_" + precision_type + str(op_number) + "_"
 
-        print(curr_layer)
         if curr_layer == Driver.PrecisionTypes.FP32:
             time_key = "h2f%stime" % lookup_key
             energy_key = "h2f%senergy" % lookup_key
         elif curr_layer == Driver.PrecisionTypes.FP16:
             time_key = "f2h%stime" % lookup_key
             energy_key = "f2h%senergy" % lookup_key
+        print(tensor_op_row)
         time = tensor_op_row[time_key]
         energy = tensor_op_row[energy_key]
         print(time_key, energy_key)
-        print("Quantization: (%f, %f)" % (time, energy))
         return (time, energy)
 
 
@@ -330,7 +331,7 @@ class Driver:
         elif Driver.is_fc(layer_name):
             rows_a = layer_data["RA"] 
             cols_a = layer_data["CA"]
-            rows_b = cols_
+            rows_b = layer_data["RB"] 
             cols_b = layer_data["CB"]
         else:
             print("PROMISE can't run whatever this layer is.")
@@ -349,18 +350,17 @@ class Driver:
     def __run_gpu_simulation(self, curr_layer, layer_name, tensor_ind, \
                     approx_type = None, knob_number = None):
         tensor_info = self.__tensor_table[layer_name][tensor_ind]
-        #print(tensor_info)
-        #print(layer_name)
-        #print(tensor_ind)
         time_key = None
         energy_key = None
 
-        if approx_type == Driver.ApproxTypes.PERF or approx_type == Driver.ApproxTypes.SAMP: # fp16_perf2_energy
+        if approx_type == Driver.ApproxTypes.PERF or approx_type == Driver.ApproxTypes.SAMP or approx_type == Driver.ApproxTypes.REDUCE: # fp16_perf2_energy
             approx_type_str = None
             if approx_type == Driver.ApproxTypes.PERF:
                 approx_type_str = "perf"
             elif approx_type == Driver.ApproxTypes.SAMP: 
                 approx_type_str = "samp"
+            elif approx_type == Driver.ApproxTypes.REDUCE:
+                approx_type_str = "reduce"
 
             if curr_layer == Driver.PrecisionTypes.FP32:
                 time_key = "fp32_%s%s_time" % (approx_type_str, knob_number)
@@ -414,7 +414,7 @@ class Driver:
             conf_str.append("-----\n")
             results_file.write('\n'.join(conf_str))
 
-        baseline_conf = None
+        fp32_baseline_conf = None
         baseline_total_time = baseline_total_energy = 0 
 
         def get_baseline_times_energies(conf):
@@ -427,7 +427,7 @@ class Driver:
 
         def get_final_times_energies_conf(curr_conf, curr_conf_name):
             final_time = final_energy = 0
-           
+
             final_conf = [] # List (conf) of lists (layers) of tuples (operation data)
 
             #for hardware, layer in self.fp16_baseline:
@@ -440,24 +440,48 @@ class Driver:
                         final_conf_layer.append((None, None, tensor_op))
                         continue
                     # layer name, operation name, val name
-                    baseline_time = self.fp16_baseline[layer_ind][1][tensor_ind][0]
-                    baseline_energy = self.fp16_baseline[layer_ind][1][tensor_ind][1]
-                    baseline_op = self.fp16_baseline[layer_ind][1][tensor_ind][2]
-                    #print(baseline_time, baseline_energy, baseline_op)
-                    #print(op_time, tensor_op)
+                    if tensor_op.find("promise") != -1: # compute sum of entire fp16 baseline layer
+                        baseline_time = 0
+                        baseline_energy = 0
+                        baseline_op = []
+
+                        if tensor_op.find("fp32") != -1:
+                            assert False
+                            baseline_layer = fp32_baseline_conf[layer_ind][1]
+                        else:
+                            baseline_layer = self.fp16_baseline[layer_ind][1]
+
+                        # use distinct names so the enclosing loop's
+                        # op_time/op_energy/tensor_op are not clobbered
+                        for base_time, base_energy, base_op in baseline_layer:
+                            baseline_time += base_time
+                            baseline_energy += base_energy
+                            baseline_op.append(base_op)
+                    else: # look at the individual tensor operation as before
+                        if tensor_op.find("fp32") != -1:
+                            assert False
+                            baseline_layer = fp32_baseline_conf[layer_ind][1]
+                        else:
+                            baseline_layer = self.fp16_baseline[layer_ind][1]
+                        baseline_time = baseline_layer[tensor_ind][0]
+                        baseline_energy = baseline_layer[tensor_ind][1]
+                        baseline_op = baseline_layer[tensor_ind][2]
+
                     final_tensor_op = tensor_op
-                    #print(op_time > baseline_time)
                     if op_time > baseline_time:
-                        #print("**************** BIGGER ******************")
-                        #print(curr_conf_name)
-                        #print(baseline_time, baseline_energy, baseline_op, layer_ind)
-                        #print(op_time, tensor_op, layer_ind)
+                        print("**************** BIGGER ******************")
+                        print(curr_conf_name)
+                        print(baseline_time, baseline_energy, baseline_op, layer_ind)
+                        print(op_time, tensor_op, layer_ind)
                         final_time += baseline_time
                         final_energy += baseline_energy
                         final_tensor_op = baseline_op
                     else:
+                        print("**************** SMALLER ******************")
+                        print(curr_conf_name)
+                        print(baseline_time, baseline_energy, baseline_op, layer_ind)
+                        print(op_time, tensor_op, layer_ind)
                         final_time += op_time
                         final_energy += op_energy
+
                     final_conf_layer.append((None, None, final_tensor_op)) # Don't care about the times and energies when writing
                 final_conf.append((hardware, final_conf_layer))
             #print("\n")
@@ -470,15 +494,15 @@ class Driver:
                 orig_line_lst = line.split(' ')
                 conf_name = orig_line_lst[0]
 
-                if not baseline_conf:
-                    baseline_conf = self.__conf_results[conf_index] #conf_name]
-                    baseline_total_time, baseline_total_energy = get_baseline_times_energies(baseline_conf)
+                if not fp32_baseline_conf:
+                    fp32_baseline_conf = self.__conf_results[conf_index] #conf_name]
+                    baseline_total_time, baseline_total_energy = get_baseline_times_energies(fp32_baseline_conf)
                     results_file.write("%s\n" % repr(baseline_total_time))
-                    write_conf_to_file(conf_name, baseline_conf, 1, 1)
+                    write_conf_to_file(conf_name, fp32_baseline_conf, 1, 1)
                 else:
                     curr_conf = self.__conf_results[conf_index] #conf_name]
-                    #final_time, final_energy, = get_baseline_times_energies(curr_conf)
                     final_time, final_energy, curr_conf = get_final_times_energies_conf(curr_conf, conf_name)
+                    print("Baseline time: %f, final time: %f, baseline energy: %f, final energy: %f, rations: %f %f " % (baseline_total_time, final_time, baseline_total_energy, final_energy, baseline_total_time / final_time, baseline_total_energy / final_energy))
                     write_conf_to_file(conf_name, curr_conf, baseline_total_time / final_time, baseline_total_energy / final_energy) 
                 conf_index += 1
         results_file.close()
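The core of `get_final_times_energies_conf` is a per-op clamp against the fp16 baseline: whenever a candidate tensor op is slower than the corresponding baseline op, the baseline's time, energy, and op string are substituted. An isolated sketch of that rule, using the same `(time, energy, op)` tuple layout as the driver's `layer_results` entries:

```python
# Per-op selection rule from get_final_times_energies_conf, in isolation:
# fall back to the fp16 baseline op whenever the candidate is slower.
def pick_op(candidate, baseline):
    cand_time, cand_energy, cand_op = candidate
    base_time, base_energy, base_op = baseline
    if cand_time > base_time:
        return base_time, base_energy, base_op
    return cand_time, cand_energy, cand_op

# The candidate conv op is slower than the fp16 baseline, so baseline wins.
print(pick_op((1.2, 3.0, "conv perf 21"), (1.0, 2.5, "conv fp16 1")))
```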
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/autotuner_data/tuner_pareto_confs_batch220.txt
new file mode 100644
index 0000000000000000000000000000000000000000..20b92832d433de5c65f50c946c50153e1d3eebc9
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet/data/autotuner_data/tuner_pareto_confs_batch220.txt
@@ -0,0 +1,904 @@
++++++
+conf1 1 0 99.69 0
+1 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 
+2 gpu conv fp32 1 add fp32 1 pool_max fp32 1 tanh fp32 1 
+3 gpu mul fp32 1 add fp32 1 tanh fp32 1 
+4 gpu mul fp32 1 add fp32 1 tanh fp32 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf1 2.01610051566 0 99.400002 0.6899979999999971
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf2 2.01610051566 0 99.040001 0.974998499999991
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf3 2.00016617632 0 99.68 0.4099999999999909
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf4 2.00016617632 0 99.660004 0.42999599999999705
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf5 1.97610564729 0 99.599998 0.4900019999999984
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf6 2.00016617632 0 99.599998 0.4900019999999984
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf7 2.00016617632 0 99.080002 0.9149970000000067
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf8 2.00016617632 0 99.239998 0.6750029999999967
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf9 2.00016617632 0 99.199997 0.7350045000000023
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf10 1.99590274244 0 99.099998 0.8850029999999975
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf11 2.01610051566 0 99.559998 0.5300020000000046
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf12 1.99590274244 0 99.540001 0.549998999999994
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf13 2.00016617632 0 99.639999 0.45000099999999466
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf14 1.99590274244 0 99.580002 0.5099980000000045
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf15 2.01610051566 0 99.099998 0.8850029999999975
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf16 2.01610051566 0 99.160004 0.7949939999999955
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf17 2.00016617632 0 99.379997 0.46500449999999205
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf18 1.99590274244 0 99.639999 0.45000099999999466
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf19 2.01610051566 0 99.580002 0.5099980000000045
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf20 1.97610564729 0 99.660004 0.42999599999999705
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf21 1.99590274244 0 99.440002 0.6499979999999909
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf22 1.99590274244 0 99.260002 0.6449969999999965
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf23 2.00016617632 0 99.360001 0.49499850000000123
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf24 2.01610051566 0 99.32 0.5550000000000068
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf25 2.00016617632 0 99.519997 0.5700029999999942
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf26 1.97610564729 0 99.379997 0.46500449999999205
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf27 2.01610051566 0 99.68 0.4099999999999909
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf28 2.00016617632 0 99.559998 0.5300020000000046
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf29 2.00016617632 0 99.080002 0.9149970000000067
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf30 1.97610564729 0 99.660004 0.42999599999999705
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf31 2.01610051566 0 99.599998 0.4900019999999984
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf32 1.97610564729 0 99.080002 0.9149970000000067
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf33 2.01610051566 0 99.620003 0.4699970000000008
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf34 2.00016617632 0 99.620003 0.4699970000000008
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf35 2.00016617632 0 99.599998 0.4900019999999984
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf36 1.99590274244 0 99.599998 0.4900019999999984
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf37 2.01610051566 0 99.540001 0.549998999999994
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf38 2.00016617632 0 99.339996 0.5250059999999976
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf39 2.00016617632 0 99.599998 0.4900019999999984
+1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf40 1.97610564729 0 99.379997 0.46500449999999205
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf41 2.00016617632 0 99.559998 0.5300020000000046
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf42 1.99590274244 0 99.459999 0.6300010000000015
+1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf43 1.99590274244 0 99.400002 0.6899979999999971
+1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf44 2.00016617632 0 99.599998 0.4900019999999984
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf45 2.01610051566 0 99.599998 0.4900019999999984
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf46 2.01610051566 0 99.080002 0.9149970000000067
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf47 2.01610051566 0 99.660004 0.42999599999999705
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf48 2.00016617632 0 99.639999 0.45000099999999466
+1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf49 2.01610051566 0 99.480003 0.6099970000000013
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf50 2.00016617632 0 98.400002 1.9349969999999956
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf51 2.01610051566 0 98.540001 1.724998499999991
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf52 2.01610051566 0 99.080002 0.9149970000000067
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf53 2.00016617632 0 99.660004 0.42999599999999705
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf54 2.01610051566 0 99.660004 0.42999599999999705
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf55 1.97610564729 0 99.599998 0.4900019999999984
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf56 2.01610051566 0 98.900002 1.1849969999999956
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf57 1.99590274244 0 99.099998 0.8850029999999975
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf58 2.01610051566 0 99.580002 0.5099980000000045
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf59 1.97610564729 0 99.080002 0.9149970000000067
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf60 2.01610051566 0 98.959999 1.0950015000000022
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf61 2.01610051566 0 99.220001 0.7049985000000021
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf62 2.01610051566 0 98.839996 1.2750059999999976
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf63 1.99590274244 0 98.940002 1.1249969999999863
+1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf64 1.97610564729 0 99.379997 0.46500449999999205
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf65 2.00016617632 0 99.559998 0.5300020000000046
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf66 2.00016617632 0 99.239998 0.6750029999999967
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf67 2.01610051566 0 99.459999 0.6300010000000015
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf68 2.00016617632 0 99.360001 0.49499850000000123
+1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf69 2.01610051566 0 99.559998 0.5300020000000046
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf70 1.99590274244 0 99.440002 0.6499979999999909
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf71 2.00016617632 0 99.339996 0.5250059999999976
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf72 2.01610051566 0 99.32 0.5550000000000068
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf73 1.97610564729 0 99.379997 0.46500449999999205
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf74 2.00016617632 0 99.019997 1.0050044999999912
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf75 1.99590274244 0 99.260002 0.6449969999999965
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf76 2.01610051566 0 99.099998 0.8850029999999975
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf77 1.97610564729 0 98.440002 1.8749969999999863
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf78 2.01610051566 0 98.440002 1.8749969999999863
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf79 2.01610051566 0 99.160004 0.7949939999999955
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf80 1.97610564729 0 98.480003 1.814995500000002
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf81 2.00016617632 0 99.360001 0.49499850000000123
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf82 1.97610564729 0 99.660004 0.42999599999999705
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf83 1.99590274244 0 99.540001 0.549998999999994
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf84 2.00016617632 0 99.199997 0.7350045000000023
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf85 1.97610564729 0 98.440002 1.8749969999999863
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf86 2.00016617632 0 99.0 1.0349999999999966
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf87 1.99590274244 0 98.519997 1.7550044999999912
+1 gpu conv samp 35 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf88 2.01610051566 0 99.400002 0.6899979999999971
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf89 2.01610051566 0 97.760002 2.8949969999999965
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf90 2.01610051566 0 99.519997 0.5700029999999942
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf91 2.01610051566 0 99.32 0.5550000000000068
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf92 2.01610051566 0 99.580002 0.5099980000000045
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf93 2.01610051566 0 99.480003 0.6099970000000013
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf94 2.01610051566 0 98.480003 1.814995500000002
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf95 2.01610051566 0 98.540001 1.724998499999991
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf96 2.01610051566 0 97.82 2.805000000000007
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf97 2.01610051566 0 98.959999 1.0950015000000022
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf98 2.01610051566 0 98.459999 1.8450015000000022
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf99 2.01610051566 0 99.660004 0.42999599999999705
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf100 2.01610051566 0 99.620003 0.4699970000000008
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf101 2.01610051566 0 97.699997 2.9850045000000023
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf102 2.01610051566 0 99.040001 0.974998499999991
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf103 2.01610051566 0 98.0 2.5349999999999966
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf104 2.01610051566 0 99.160004 0.7949939999999955
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf105 2.01610051566 0 99.540001 0.549998999999994
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf106 2.01610051566 0 99.519997 0.5700029999999942
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf107 2.01610051566 0 99.099998 0.8850029999999975
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf108 2.01610051566 0 98.120003 2.354995500000001
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf109 2.01610051566 0 99.459999 0.6300010000000015
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf110 2.01610051566 0 99.68 0.4099999999999909
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf111 2.01610051566 0 98.839996 1.2750059999999976
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
++++++
+conf112 2.01610051566 0 98.18 2.2649999999999864
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+5 gpu softmax fp32 1
+-----
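The confs above (and the batch220 file below) use the autotuner's plain-text configuration format: records delimited by `+++++`/`-----`, a header line carrying the configuration name, a predicted speedup, a second numeric field (always 0 here, presumably an energy metric), the measured accuracy, and the accuracy loss versus the fp32 baseline, followed by one line per GPU-mapped layer that assigns an approximation knob (`fp32`, `fp16`, `perf <n>`, `samp <n>`) to each of that layer's operations. A minimal parsing sketch; all field meanings are inferred from the data itself, not from a spec:

```python
# conf_parser.py -- a sketch for reading the tuner_pareto_confs_*.txt format
# shown above. The "energy" field is an assumed meaning: it is always 0 here.

from dataclasses import dataclass, field
from typing import List, Tuple


@dataclass
class Configuration:
    name: str
    speedup: float
    energy: float        # assumed meaning of the second header field
    accuracy: float
    accuracy_loss: float
    # per layer: (layer_id, target, [(op, approximation, knob), ...])
    layers: List[Tuple[int, str, List[Tuple[str, str, int]]]] = field(default_factory=list)


def parse_confs(path: str) -> List[Configuration]:
    confs, current = [], None
    with open(path) as f:
        for line in f:
            line = line.strip()
            # skip blank lines and the +++++ / ----- record delimiters
            if not line or set(line) <= {"+", "-"}:
                continue
            tokens = line.split()
            if tokens[0].startswith("conf"):  # header line
                current = Configuration(tokens[0], float(tokens[1]),
                                        float(tokens[2]), float(tokens[3]),
                                        float(tokens[4]))
                confs.append(current)
            else:                             # layer line
                layer_id, target, rest = int(tokens[0]), tokens[1], tokens[2:]
                # ops come in (name, approximation, knob) triples,
                # e.g. "conv perf 21" or "add fp16 1"
                ops = [(rest[i], rest[i + 1], int(rest[i + 2]))
                       for i in range(0, len(rest), 3)]
                current.layers.append((layer_id, target, ops))
    return confs
```

With this in hand, something like `max(parse_confs(path), key=lambda c: c.speedup)` picks the fastest Pareto point from a file.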
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile
index 3e4f668a2c157b3c6a2abcea9da19819f6dabaef..578cfc713eef378bfb23222b4ed3e8b1abd7e7d9 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_dsoc/Makefile
@@ -1,5 +1,5 @@
 DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks
-# NOTE: can configure build directory
+# NOTE: change this to point at your build directory
 HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_dsoc/
 
 CC = $(HPVM_BUILD_DIR)/bin/clang++
@@ -22,8 +22,6 @@ DNN_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/dnn_sources/include
 TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
 TENSOR_RT_SRC_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/src
 
-# -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=0
-# -I $(TENSOR_INCLUDE_DIR)
 CC_FLAGS = -I $(LLVM_INCLUDE_DIR)  -I $(DNN_INCLUDE_DIR) -I $(COMMON_INCLUDE_DIR)  -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math  -std=c++11   -O3
 LINKER_FLAGS = -lpthread -lOpenCL
 
@@ -34,10 +32,10 @@ OPTFLAGS1 = -load  $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInP
 
 OPTFLAGS2 = -load  $(HPVM_LIB_DIR)/InlineTensorCalls.so  -inline-tensor-calls
 
-TARGET = $(BUILD_DIR)/$(APP).opt.bc
+TARGET = $(BUILD_DIR)/$(APP).final.bc
+
 SOURCES = $(SRC_DIR)/$(APP).cpp
 VISC_RT_PATH = $(LLVM_SRC_ROOT)/projects/visc-cpu-rt/visc-rt.ll
-#VISC_RT_PATH = $(HPVM_BUILD_DIR)/projects/visc-rt/visc-rt.ll
 
 
 .PRECIOUS: $(BUILD_DIR)/$(APP).ll $(BUILD_DIR)/$(APP).visc.ll
@@ -54,17 +52,19 @@ $(BUILD_DIR)/%.visc.ll: $(BUILD_DIR)/%.ll
 
 expanded_modules:= $(wildcard *_module.ll)
 
-
-#$(wildcard build/_*.ll)
-
 $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.visc.ll
 	$(OPT) $(OPTFLAGS1) $<  -o $@
+
+
+$(BUILD_DIR)/%.linked.bc: $(BUILD_DIR)/%.opt.bc
 	$(CC) -emit-llvm  -c  $(TENSOR_RT_SRC_DIR)/tensor_cpu_runtime.cc  -o  $(BUILD_DIR)/tensor_cpu_runtime.bc
 	$(OPT) -always-inline $(BUILD_DIR)/tensor_cpu_runtime.bc  -o  $(BUILD_DIR)/tensor_cpu_runtime.bc
-        #LL_FILES = $(shell cd build; find ./ -name "*module.ll")
-	$(LLVM_LINK) $@  $(shell find ./build -name "*module.ll")   $(BUILD_DIR)/tensor_cpu_runtime.bc $(VISC_RT_PATH)  -o  $(BUILD_DIR)/lenet_tensor_rt.bc
-	$(OPT) $(OPTFLAGS2)  $(BUILD_DIR)/lenet_tensor_rt.bc -o  $(BUILD_DIR)/lenet_inline.bc
-	$(CC) $(BUILD_DIR)/lenet_inline.bc -o $(BUILD_DIR)/lenet_final $(LINKER_FLAGS)
+	$(LLVM_LINK)   $<   $(shell find ./build -name "*module.ll")   $(BUILD_DIR)/tensor_cpu_runtime.bc $(VISC_RT_PATH)  -o  $@   
+
+
+$(BUILD_DIR)/%.final.bc: $(BUILD_DIR)/%.linked.bc
+	$(OPT) $(OPTFLAGS2)  $<  -o  $@ 
+	$(CC) $@ -o $(BUILD_DIR)/$(APP)_final  $(LINKER_FLAGS)
 	$(foreach module, $(expanded_modules), $(LLVM_LINK) $(module) $(BUILD_DIR)/tensor_cpu_runtime.bc -o $(BUILD_DIR)/$(module)_linked ${\n} $(OPT) $(OPTFLAGS2) $(BUILD_DIR)/$(module)_linked -o  $(BUILD_DIR)/$(module)_inline  ${\n} )
 
 
@@ -74,3 +74,6 @@ $(BUILD_DIR):
 
 clean:
 	rm -rf $(BUILD_DIR)
+
+
+
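The Makefile change above splits the old single rule into three staged targets: `%.opt.bc` (run the HPVM passes), then `%.linked.bc` (link in the tensor runtime, the expanded `*_module.ll` files, and `visc-rt.ll`), then `%.final.bc` (inline the tensor calls and emit the `$(APP)_final` binary). A rough sketch of the same pipeline driven from Python; every tool name, pass flag, and path below is an assumption lifted from the Makefile variables, not a verified invocation:

```python
# build_pipeline.py -- a sketch of the three-stage lowering the Makefile
# above encodes; paths, tool names, and flags are assumptions.

import subprocess

BUILD = "build"
APP = "lenet"              # $(APP)
OPT = "opt"                # $(OPT); really $(HPVM_BUILD_DIR)/bin/opt
LLVM_LINK = "llvm-link"    # $(LLVM_LINK)
CC = "clang++"             # $(CC)


def run(cmd):
    print(" ".join(cmd))
    subprocess.check_call(cmd)


# Stage 1: HPVM passes produce $(APP).opt.bc. The single -load here stands
# in for the full $(OPTFLAGS1) pass list.
run([OPT, "-load", "LLVMBuildDFG.so", f"{BUILD}/{APP}.visc.ll",
     "-o", f"{BUILD}/{APP}.opt.bc"])

# Stage 2: link the tensor runtime and the expanded modules into
# $(APP).linked.bc.
run([LLVM_LINK, f"{BUILD}/{APP}.opt.bc", f"{BUILD}/tensor_cpu_runtime.bc",
     "visc-rt.ll", "-o", f"{BUILD}/{APP}.linked.bc"])

# Stage 3: inline tensor calls ($(OPTFLAGS2)), then emit the final binary.
run([OPT, "-inline-tensor-calls", f"{BUILD}/{APP}.linked.bc",
     "-o", f"{BUILD}/{APP}.final.bc"])
run([CC, f"{BUILD}/{APP}.final.bc", "-o", f"{BUILD}/{APP}_final",
     "-lpthread", "-lOpenCL"])
```

Splitting the rule this way gives each intermediate a named target, so a failed link or inline step can be rerun without repeating the HPVM passes.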
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt
index 2e3185632ca5cd156a599f4e0a7999c16fd4be97..707fd70be086b8961875c2cfd94ba1f41d2ac208 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/autotuner_data/tuner_pareto_confs_batch220.txt
@@ -1,896 +1,904 @@
 +++++
+conf1 1 0 99.69 0
+1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1
+2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1
+3 gpu mul fp32 1 add fp32 1 tanh fp32 1 
+4 gpu mul fp32 1 add fp32 1 tanh fp32 1 
+5 gpu softmax fp32 1
+-----
++++++
 conf1 2.01610051566 0 99.400002 0.6899979999999971
-1 gpu conv perf 21 add fp32 1 pool_max fp32 1 tanh fp32 1 
-2 gpu conv perf 21 add fp32 1 pool_max fp32 1 tanh fp32 1 
-3 gpu mul fp16 1 add fp32 1 tanh fp32 1 
-4 gpu mul fp16 1 add fp32 1 tanh fp32 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
+4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 5 gpu softmax fp32 1
 -----
 +++++
 conf2 2.01610051566 0 99.040001 0.974998499999991
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf3 2.00016617632 0 99.68 0.4099999999999909
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf4 2.00016617632 0 99.660004 0.42999599999999705
-1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf5 1.97610564729 0 99.599998 0.4900019999999984
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf6 2.00016617632 0 99.599998 0.4900019999999984
-1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf7 2.00016617632 0 99.080002 0.9149970000000067
-1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf8 2.00016617632 0 99.239998 0.6750029999999967
-1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf9 2.00016617632 0 99.199997 0.7350045000000023
-1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf10 1.99590274244 0 99.099998 0.8850029999999975
-1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf11 2.01610051566 0 99.559998 0.5300020000000046
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf12 1.99590274244 0 99.540001 0.549998999999994
-1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf13 2.00016617632 0 99.639999 0.45000099999999466
-1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf14 1.99590274244 0 99.580002 0.5099980000000045
-1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf15 2.01610051566 0 99.099998 0.8850029999999975
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf16 2.01610051566 0 99.160004 0.7949939999999955
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf17 2.00016617632 0 99.379997 0.46500449999999205
-1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf18 1.99590274244 0 99.639999 0.45000099999999466
-1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf19 2.01610051566 0 99.580002 0.5099980000000045
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf20 1.97610564729 0 99.660004 0.42999599999999705
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf21 1.99590274244 0 99.440002 0.6499979999999909
-1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf22 1.99590274244 0 99.260002 0.6449969999999965
-1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf23 2.00016617632 0 99.360001 0.49499850000000123
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf24 2.01610051566 0 99.32 0.5550000000000068
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf25 2.00016617632 0 99.519997 0.5700029999999942
-1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf26 1.97610564729 0 99.379997 0.46500449999999205
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf27 2.01610051566 0 99.68 0.4099999999999909
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf28 2.00016617632 0 99.559998 0.5300020000000046
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf29 2.00016617632 0 99.080002 0.9149970000000067
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf30 1.97610564729 0 99.660004 0.42999599999999705
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf31 2.01610051566 0 99.599998 0.4900019999999984
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf32 1.97610564729 0 99.080002 0.9149970000000067
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf33 2.01610051566 0 99.620003 0.4699970000000008
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf34 2.00016617632 0 99.620003 0.4699970000000008
-1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf35 2.00016617632 0 99.599998 0.4900019999999984
-1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf36 1.99590274244 0 99.599998 0.4900019999999984
-1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf37 2.01610051566 0 99.540001 0.549998999999994
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf38 2.00016617632 0 99.339996 0.5250059999999976
-1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf39 2.00016617632 0 99.599998 0.4900019999999984
-1 gpu conv perf 24 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf40 1.97610564729 0 99.379997 0.46500449999999205
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf41 2.00016617632 0 99.559998 0.5300020000000046
-1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf42 1.99590274244 0 99.459999 0.6300010000000015
-1 gpu conv samp 34 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf43 1.99590274244 0 99.400002 0.6899979999999971
-1 gpu conv samp 34 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf44 2.00016617632 0 99.599998 0.4900019999999984
-1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf45 2.01610051566 0 99.599998 0.4900019999999984
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf46 2.01610051566 0 99.080002 0.9149970000000067
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf47 2.01610051566 0 99.660004 0.42999599999999705
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf48 2.00016617632 0 99.639999 0.45000099999999466
-1 gpu conv perf 24 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf49 2.01610051566 0 99.480003 0.6099970000000013
-1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf50 2.00016617632 0 98.400002 1.9349969999999956
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf51 2.01610051566 0 98.540001 1.724998499999991
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf52 2.01610051566 0 99.080002 0.9149970000000067
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf53 2.00016617632 0 99.660004 0.42999599999999705
-1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf54 2.01610051566 0 99.660004 0.42999599999999705
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf55 1.97610564729 0 99.599998 0.4900019999999984
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf56 2.01610051566 0 98.900002 1.1849969999999956
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf57 1.99590274244 0 99.099998 0.8850029999999975
-1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf58 2.01610051566 0 99.580002 0.5099980000000045
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf59 1.97610564729 0 99.080002 0.9149970000000067
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf60 2.01610051566 0 98.959999 1.0950015000000022
-1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf61 2.01610051566 0 99.220001 0.7049985000000021
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf62 2.01610051566 0 98.839996 1.2750059999999976
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf63 1.99590274244 0 98.940002 1.1249969999999863
-1 gpu conv samp 34 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 34 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf64 1.97610564729 0 99.379997 0.46500449999999205
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf65 2.00016617632 0 99.559998 0.5300020000000046
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf66 2.00016617632 0 99.239998 0.6750029999999967
-1 gpu conv perf 30 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 30 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf67 2.01610051566 0 99.459999 0.6300010000000015
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf68 2.00016617632 0 99.360001 0.49499850000000123
-1 gpu conv perf 24 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 24 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf69 2.01610051566 0 99.559998 0.5300020000000046
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf70 1.99590274244 0 99.440002 0.6499979999999909
-1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf71 2.00016617632 0 99.339996 0.5250059999999976
-1 gpu conv perf 25 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 25 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf72 2.01610051566 0 99.32 0.5550000000000068
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf73 1.97610564729 0 99.379997 0.46500449999999205
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf74 2.00016617632 0 99.019997 1.0050044999999912
-1 gpu conv perf 29 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 29 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf75 1.99590274244 0 99.260002 0.6449969999999965
-1 gpu conv samp 36 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 36 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf76 2.01610051566 0 99.099998 0.8850029999999975
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf77 1.97610564729 0 98.440002 1.8749969999999863
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf78 2.01610051566 0 98.440002 1.8749969999999863
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf79 2.01610051566 0 99.160004 0.7949939999999955
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf80 1.97610564729 0 98.480003 1.814995500000002
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf81 2.00016617632 0 99.360001 0.49499850000000123
-1 gpu conv perf 23 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 23 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf82 1.97610564729 0 99.660004 0.42999599999999705
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf83 1.99590274244 0 99.540001 0.549998999999994
-1 gpu conv samp 33 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 33 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf84 2.00016617632 0 99.199997 0.7350045000000023
-1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf85 1.97610564729 0 98.440002 1.8749969999999863
-1 gpu conv fp16 1 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv fp16 1 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf86 2.00016617632 0 99.0 1.0349999999999966
-1 gpu conv perf 28 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 28 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf87 1.99590274244 0 98.519997 1.7550044999999912
-1 gpu conv samp 35 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 35 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf88 2.01610051566 0 99.400002 0.6899979999999971
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf89 2.01610051566 0 97.760002 2.8949969999999965
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf90 2.01610051566 0 99.519997 0.5700029999999942
-1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf91 2.01610051566 0 99.32 0.5550000000000068
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf92 2.01610051566 0 99.580002 0.5099980000000045
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf93 2.01610051566 0 99.480003 0.6099970000000013
-1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf94 2.01610051566 0 98.480003 1.814995500000002
-1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf95 2.01610051566 0 98.540001 1.724998499999991
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf96 2.01610051566 0 97.82 2.805000000000007
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf97 2.01610051566 0 98.959999 1.0950015000000022
-1 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf98 2.01610051566 0 98.459999 1.8450015000000022
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf99 2.01610051566 0 99.660004 0.42999599999999705
-1 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf100 2.01610051566 0 99.620003 0.4699970000000008
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf101 2.01610051566 0 97.699997 2.9850045000000023
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf102 2.01610051566 0 99.040001 0.974998499999991
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf103 2.01610051566 0 98.0 2.5349999999999966
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf104 2.01610051566 0 99.160004 0.7949939999999955
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf105 2.01610051566 0 99.540001 0.549998999999994
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf106 2.01610051566 0 99.519997 0.5700029999999942
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf107 2.01610051566 0 99.099998 0.8850029999999975
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 21 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 21 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf108 2.01610051566 0 98.120003 2.354995500000001
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf109 2.01610051566 0 99.459999 0.6300010000000015
-1 gpu conv perf 26 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 31 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 26 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 31 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf110 2.01610051566 0 99.68 0.4099999999999909
-1 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf111 2.01610051566 0 98.839996 1.2750059999999976
-1 gpu conv samp 32 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv samp 32 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
 +++++
 conf112 2.01610051566 0 98.18 2.2649999999999864
-1 gpu conv perf 22 add fp16 1 pool_max fp16 1 tanh fp16 1 
-2 gpu conv perf 27 add fp16 1 pool_max fp16 1 tanh fp16 1 
+1 gpu conv perf 22 add fp16 1 tanh fp16 1 pool_max fp16 1 
+2 gpu conv perf 27 add fp16 1 tanh fp16 1 pool_max fp16 1 
 3 gpu mul fp16 1 add fp16 1 tanh fp16 1 
 4 gpu mul fp16 1 add fp16 1 tanh fp16 1 
-5 gpu softmax fp16 1
+5 gpu softmax fp32 1
 -----
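
The blocks above all follow the same plain-text layout: a `+++++` line opens a configuration, a header line names it and carries four numbers (for the baseline `conf1 1 0 83.5 0` these read naturally as speedup, energy, accuracy, and accuracy loss — an inference from the baseline values, not something documented in the files), per-layer lines map a layer id to a compilation target and a list of op/knob tokens, and `-----` closes the block. A minimal parsing sketch, assuming the patch has been applied and the plain `.txt` file is read (the function name `parse_tuner_confs` and the dict layout are hypothetical):

```python
def parse_tuner_confs(text):
    """Parse tuner_confs_*-style text into a list of configuration dicts."""
    confs = []
    block = None
    for raw in text.splitlines():
        line = raw.strip()
        if line == "+++++":            # opens a configuration block
            block = {"layers": []}
        elif line == "-----":          # closes the current block
            if block is not None:
                confs.append(block)
            block = None
        elif block is not None and not block.get("name"):
            # header: <name> <speedup> <energy> <accuracy> <accuracy_loss>
            # (field meanings inferred from the conf1 baseline, see above)
            name, speedup, energy, acc, loss = line.split()
            block.update(name=name, speedup=float(speedup),
                         energy=float(energy), accuracy=float(acc),
                         accuracy_loss=float(loss))
        elif block is not None and line:
            # layer line: <id> <target> <op/knob tokens...>
            tokens = line.split()
            block["layers"].append({"id": int(tokens[0]),
                                    "target": tokens[1],
                                    "knobs": tokens[2:]})
    return confs
```

For example, `parse_tuner_confs(open("tuner_confs_batch220.txt").read())` would return one dict per configuration, with the baseline `conf1` first.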
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt
index 948efe5bd7586727c5fe4fa7ccc73e7319bf97d6..4a14a5f2e45c83a2960deccbcd0296a6d9a2f2bc 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_confs_batch220.txt
@@ -1,87 +1,87 @@
 +++++
 conf1 1 0 83.5 0
 1 gpu conv fp32 1 
-2 gpu batchnorm fp16 1 
-3 gpu relu fp16 1 
-4 gpu group_conv fp16 1 
-5 gpu batchnorm fp16 1 
-6 gpu relu fp16 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
 7 gpu conv fp32 1 
-8 gpu batchnorm fp16 1 
-9 gpu relu fp16 1 
-10 gpu group_conv fp16 1 
-11 gpu batchnorm fp16 1 
-12 gpu relu fp16 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
 13 gpu conv fp32 1 
-14 gpu batchnorm fp16 1 
-15 gpu relu fp16 1 
-16 gpu group_conv fp16 1 
-17 gpu batchnorm fp16 1 
-18 gpu relu fp16 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
 19 gpu conv fp32 1 
-20 gpu batchnorm fp16 1 
-21 gpu relu fp16 1 
-22 gpu group_conv fp16 1 
-23 gpu batchnorm fp16 1 
-24 gpu relu fp16 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
 25 gpu conv fp32 1 
-26 gpu batchnorm fp16 1 
-27 gpu relu fp16 1 
-28 gpu group_conv fp16 1 
-29 gpu batchnorm fp16 1 
-30 gpu relu fp16 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
 31 gpu conv fp32 1 
-32 gpu batchnorm fp16 1 
-33 gpu relu fp16 1 
-34 gpu group_conv fp16 1 
-35 gpu batchnorm fp16 1 
-36 gpu relu fp16 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
 37 gpu conv fp32 1 
-38 gpu batchnorm fp16 1 
-39 gpu relu fp16 1 
-40 gpu group_conv fp16 1 
-41 gpu batchnorm fp16 1 
-42 gpu relu fp16 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu group_conv fp32 1 
+41 gpu batchnorm fp32 1 
+42 gpu relu fp32 1 
 43 gpu conv fp32 1 
-44 gpu batchnorm fp16 1 
-45 gpu relu fp16 1 
-46 gpu group_conv fp16 1 
-47 gpu batchnorm fp16 1 
-48 gpu relu fp16 1 
+44 gpu batchnorm fp32 1 
+45 gpu relu fp32 1 
+46 gpu group_conv fp32 1 
+47 gpu batchnorm fp32 1 
+48 gpu relu fp32 1 
 49 gpu conv fp32 1 
-50 gpu batchnorm fp16 1 
-51 gpu relu fp16 1 
-52 gpu group_conv fp16 1 
-53 gpu batchnorm fp16 1 
-54 gpu relu fp16 1 
+50 gpu batchnorm fp32 1 
+51 gpu relu fp32 1 
+52 gpu group_conv fp32 1 
+53 gpu batchnorm fp32 1 
+54 gpu relu fp32 1 
 55 gpu conv fp32 1 
-56 gpu batchnorm fp16 1 
-57 gpu relu fp16 1 
-58 gpu group_conv fp16 1 
-59 gpu batchnorm fp16 1 
-60 gpu relu fp16 1 
+56 gpu batchnorm fp32 1 
+57 gpu relu fp32 1 
+58 gpu group_conv fp32 1 
+59 gpu batchnorm fp32 1 
+60 gpu relu fp32 1 
 61 gpu conv fp32 1 
-62 gpu batchnorm fp16 1 
-63 gpu relu fp16 1 
-64 gpu group_conv fp16 1 
-65 gpu batchnorm fp16 1 
-66 gpu relu fp16 1 
+62 gpu batchnorm fp32 1 
+63 gpu relu fp32 1 
+64 gpu group_conv fp32 1 
+65 gpu batchnorm fp32 1 
+66 gpu relu fp32 1 
 67 gpu conv fp32 1 
-68 gpu batchnorm fp16 1 
-69 gpu relu fp16 1 
-70 gpu group_conv fp16 1 
-71 gpu batchnorm fp16 1 
-72 gpu relu fp16 1 
+68 gpu batchnorm fp32 1 
+69 gpu relu fp32 1 
+70 gpu group_conv fp32 1 
+71 gpu batchnorm fp32 1 
+72 gpu relu fp32 1 
 73 gpu conv fp32 1 
-74 gpu batchnorm fp16 1 
-75 gpu relu fp16 1 
-76 gpu group_conv fp16 1 
-77 gpu batchnorm fp16 1 
-78 gpu relu fp16 1 
+74 gpu batchnorm fp32 1 
+75 gpu relu fp32 1 
+76 gpu group_conv fp32 1 
+77 gpu batchnorm fp32 1 
+78 gpu relu fp32 1 
 79 gpu conv fp32 1 
-80 gpu batchnorm fp16 1 
-81 gpu relu fp16 1 
-82 gpu pool_mean fp16 1 
+80 gpu batchnorm fp32 1 
+81 gpu relu fp32 1 
+82 gpu pool_mean fp32 1 
 83 gpu mul fp32 1 add fp32 1 
 84 gpu softmax fp32 1
 -----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt
index 66833d06b3af9ad7c4bcefdbea9c2e977eeea378..86b061f3d9ff5b75a9580ae65afd9ff6c20f9701 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_pareto_confs_batch220.txt
@@ -1,87 +1,87 @@
 +++++
 conf1 1 0 83.5 0
 1 gpu conv fp32 1 
-2 gpu batchnorm fp16 1 
-3 gpu relu fp16 1 
-4 gpu group_conv fp16 1 
-5 gpu batchnorm fp16 1 
-6 gpu relu fp16 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
 7 gpu conv fp32 1 
-8 gpu batchnorm fp16 1 
-9 gpu relu fp16 1 
-10 gpu group_conv fp16 1 
-11 gpu batchnorm fp16 1 
-12 gpu relu fp16 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
 13 gpu conv fp32 1 
-14 gpu batchnorm fp16 1 
-15 gpu relu fp16 1 
-16 gpu group_conv fp16 1 
-17 gpu batchnorm fp16 1 
-18 gpu relu fp16 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
 19 gpu conv fp32 1 
-20 gpu batchnorm fp16 1 
-21 gpu relu fp16 1 
-22 gpu group_conv fp16 1 
-23 gpu batchnorm fp16 1 
-24 gpu relu fp16 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
 25 gpu conv fp32 1 
-26 gpu batchnorm fp16 1 
-27 gpu relu fp16 1 
-28 gpu group_conv fp16 1 
-29 gpu batchnorm fp16 1 
-30 gpu relu fp16 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
 31 gpu conv fp32 1 
-32 gpu batchnorm fp16 1 
-33 gpu relu fp16 1 
-34 gpu group_conv fp16 1 
-35 gpu batchnorm fp16 1 
-36 gpu relu fp16 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
 37 gpu conv fp32 1 
-38 gpu batchnorm fp16 1 
-39 gpu relu fp16 1 
-40 gpu group_conv fp16 1 
-41 gpu batchnorm fp16 1 
-42 gpu relu fp16 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu group_conv fp32 1 
+41 gpu batchnorm fp32 1 
+42 gpu relu fp32 1 
 43 gpu conv fp32 1 
-44 gpu batchnorm fp16 1 
-45 gpu relu fp16 1 
-46 gpu group_conv fp16 1 
-47 gpu batchnorm fp16 1 
-48 gpu relu fp16 1 
+44 gpu batchnorm fp32 1 
+45 gpu relu fp32 1 
+46 gpu group_conv fp32 1 
+47 gpu batchnorm fp32 1 
+48 gpu relu fp32 1 
 49 gpu conv fp32 1 
-50 gpu batchnorm fp16 1 
-51 gpu relu fp16 1 
-52 gpu group_conv fp16 1 
-53 gpu batchnorm fp16 1 
-54 gpu relu fp16 1 
+50 gpu batchnorm fp32 1 
+51 gpu relu fp32 1 
+52 gpu group_conv fp32 1 
+53 gpu batchnorm fp32 1 
+54 gpu relu fp32 1 
 55 gpu conv fp32 1 
-56 gpu batchnorm fp16 1 
-57 gpu relu fp16 1 
-58 gpu group_conv fp16 1 
-59 gpu batchnorm fp16 1 
-60 gpu relu fp16 1 
+56 gpu batchnorm fp32 1 
+57 gpu relu fp32 1 
+58 gpu group_conv fp32 1 
+59 gpu batchnorm fp32 1 
+60 gpu relu fp32 1 
 61 gpu conv fp32 1 
-62 gpu batchnorm fp16 1 
-63 gpu relu fp16 1 
-64 gpu group_conv fp16 1 
-65 gpu batchnorm fp16 1 
-66 gpu relu fp16 1 
+62 gpu batchnorm fp32 1 
+63 gpu relu fp32 1 
+64 gpu group_conv fp32 1 
+65 gpu batchnorm fp32 1 
+66 gpu relu fp32 1 
 67 gpu conv fp32 1 
-68 gpu batchnorm fp16 1 
-69 gpu relu fp16 1 
-70 gpu group_conv fp16 1 
-71 gpu batchnorm fp16 1 
-72 gpu relu fp16 1 
+68 gpu batchnorm fp32 1 
+69 gpu relu fp32 1 
+70 gpu group_conv fp32 1 
+71 gpu batchnorm fp32 1 
+72 gpu relu fp32 1 
 73 gpu conv fp32 1 
-74 gpu batchnorm fp16 1 
-75 gpu relu fp16 1 
-76 gpu group_conv fp16 1 
-77 gpu batchnorm fp16 1 
-78 gpu relu fp16 1 
+74 gpu batchnorm fp32 1 
+75 gpu relu fp32 1 
+76 gpu group_conv fp32 1 
+77 gpu batchnorm fp32 1 
+78 gpu relu fp32 1 
 79 gpu conv fp32 1 
-80 gpu batchnorm fp16 1 
-81 gpu relu fp16 1 
-82 gpu pool_mean fp16 1 
+80 gpu batchnorm fp32 1 
+81 gpu relu fp32 1 
+82 gpu pool_mean fp32 1 
 83 gpu mul fp32 1 add fp32 1 
 84 gpu softmax fp32 1
 -----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt
index baffc185452ce288432fa55e3d8ad7ced9ff44d2..3b628d570fcb1884cfa10371a2aaf6856a652d1e 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi.txt
@@ -1,88 +1,88 @@
 +++++
-conf1 4.15413017186 0 83.163334475 0.5049982875000012
-1 gpu conv fp16 1 
-2 gpu batchnorm fp16 1 
-3 gpu relu fp16 1 
-4 gpu group_conv fp16 1 
-5 gpu batchnorm fp16 1 
-6 gpu relu fp16 1 
-7 gpu conv fp16 1 
-8 gpu batchnorm fp16 1 
-9 gpu relu fp16 1 
-10 gpu group_conv fp16 1 
-11 gpu batchnorm fp16 1 
-12 gpu relu fp16 1 
-13 promise swing_level 5 
-14 gpu batchnorm fp16 1 
-15 gpu relu fp16 1 
-16 gpu group_conv fp16 1 
-17 gpu batchnorm fp16 1 
-18 gpu relu fp16 1 
-19 gpu conv fp16 1 
-20 gpu batchnorm fp16 1 
-21 gpu relu fp16 1 
-22 gpu group_conv fp16 1 
-23 gpu batchnorm fp16 1 
-24 gpu relu fp16 1 
-25 promise swing_level 7 
-26 gpu batchnorm fp16 1 
-27 gpu relu fp16 1 
-28 gpu group_conv fp16 1 
-29 gpu batchnorm fp16 1 
-30 gpu relu fp16 1 
-31 gpu conv fp16 1 
-32 gpu batchnorm fp16 1 
-33 gpu relu fp16 1 
-34 gpu group_conv fp16 1 
-35 gpu batchnorm fp16 1 
-36 gpu relu fp16 1 
-37 promise swing_level 5 
-38 gpu batchnorm fp16 1 
-39 gpu relu fp16 1 
-40 gpu group_conv fp16 1 
-41 gpu batchnorm fp16 1 
-42 gpu relu fp16 1 
-43 gpu conv fp16 1 
-44 gpu batchnorm fp16 1 
-45 gpu relu fp16 1 
-46 gpu group_conv fp16 1 
-47 gpu batchnorm fp16 1 
-48 gpu relu fp16 1 
-49 gpu conv perf 25 
-50 gpu batchnorm fp16 1 
-51 gpu relu fp16 1 
-52 gpu group_conv fp16 1 
-53 gpu batchnorm fp16 1 
-54 gpu relu fp16 1 
-55 gpu conv perf 24 
-56 gpu batchnorm fp16 1 
-57 gpu relu fp16 1 
-58 gpu group_conv fp16 1 
-59 gpu batchnorm fp16 1 
-60 gpu relu fp16 1 
-61 promise swing_level 5 
-62 gpu batchnorm fp16 1 
-63 gpu relu fp16 1 
-64 gpu group_conv fp16 1 
-65 gpu batchnorm fp16 1 
-66 gpu relu fp16 1 
-67 gpu conv fp16 1 
-68 gpu batchnorm fp16 1 
-69 gpu relu fp16 1 
-70 gpu group_conv fp16 1 
-71 gpu batchnorm fp16 1 
-72 gpu relu fp16 1 
-73 promise swing_level 6 
-74 gpu batchnorm fp16 1 
-75 gpu relu fp16 1 
-76 gpu group_conv fp16 1 
-77 gpu batchnorm fp16 1 
-78 gpu relu fp16 1 
-79 promise swing_level 5 
-80 gpu batchnorm fp16 1 
-81 gpu relu fp16 1 
-82 gpu pool_mean fp16 1 
-83 promise swing_level 3 
+conf1 1 0 83.5 0
+1 gpu conv fp32 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
+7 gpu conv fp32 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
+13 gpu conv fp32 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
+19 gpu conv fp32 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
+25 gpu conv fp32 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
+31 gpu conv fp32 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
+37 gpu conv fp32 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu group_conv fp32 1 
+41 gpu batchnorm fp32 1 
+42 gpu relu fp32 1 
+43 gpu conv fp32 1 
+44 gpu batchnorm fp32 1 
+45 gpu relu fp32 1 
+46 gpu group_conv fp32 1 
+47 gpu batchnorm fp32 1 
+48 gpu relu fp32 1 
+49 gpu conv fp32 1 
+50 gpu batchnorm fp32 1 
+51 gpu relu fp32 1 
+52 gpu group_conv fp32 1 
+53 gpu batchnorm fp32 1 
+54 gpu relu fp32 1 
+55 gpu conv fp32 1 
+56 gpu batchnorm fp32 1 
+57 gpu relu fp32 1 
+58 gpu group_conv fp32 1 
+59 gpu batchnorm fp32 1 
+60 gpu relu fp32 1 
+61 gpu conv fp32 1 
+62 gpu batchnorm fp32 1 
+63 gpu relu fp32 1 
+64 gpu group_conv fp32 1 
+65 gpu batchnorm fp32 1 
+66 gpu relu fp32 1 
+67 gpu conv fp32 1 
+68 gpu batchnorm fp32 1 
+69 gpu relu fp32 1 
+70 gpu group_conv fp32 1 
+71 gpu batchnorm fp32 1 
+72 gpu relu fp32 1 
+73 gpu conv fp32 1 
+74 gpu batchnorm fp32 1 
+75 gpu relu fp32 1 
+76 gpu group_conv fp32 1 
+77 gpu batchnorm fp32 1 
+78 gpu relu fp32 1 
+79 gpu conv fp32 1 
+80 gpu batchnorm fp32 1 
+81 gpu relu fp32 1 
+82 gpu pool_mean fp32 1 
+83 gpu mul fp32 1 add fp32 1 
 84 gpu softmax fp32 1
 -----
 +++++
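
For reference, the knob vocabulary visible in these configurations: `fp32`/`fp16` select the precision at which a tensor op runs; `perf <n>` and `samp <n>` appear to select perforated- and sampled-convolution approximation variants respectively (an inference from the naming, not stated in the files); and `promise swing_level <n>`, seen in the removed `conf1` above, presumably maps a layer onto the PROMISE accelerator at the given swing level. The change here replaces such a mixed approximate baseline with a pure-fp32 `conf1`, consistent with `conf1`'s role as the reference point (`speedup 1`, `loss 0`) against which every other configuration is measured.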
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt
index fa5a561bf6fd17c4b2ce372884ac02524ce135f5..ff7fdbf108c1cbca0154d6c300cd3ebbdaf7cd6d 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_multi2.txt
@@ -1,87 +1,87 @@
 +++++
 conf1 1 0 83.5 0
 1 gpu conv fp32 1 
-2 gpu batchnorm fp16 1 
-3 gpu relu fp16 1 
-4 gpu group_conv fp16 1 
-5 gpu batchnorm fp16 1 
-6 gpu relu fp16 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
 7 gpu conv fp32 1 
-8 gpu batchnorm fp16 1 
-9 gpu relu fp16 1 
-10 gpu group_conv fp16 1 
-11 gpu batchnorm fp16 1 
-12 gpu relu fp16 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
 13 gpu conv fp32 1 
-14 gpu batchnorm fp16 1 
-15 gpu relu fp16 1 
-16 gpu group_conv fp16 1 
-17 gpu batchnorm fp16 1 
-18 gpu relu fp16 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
 19 gpu conv fp32 1 
-20 gpu batchnorm fp16 1 
-21 gpu relu fp16 1 
-22 gpu group_conv fp16 1 
-23 gpu batchnorm fp16 1 
-24 gpu relu fp16 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
 25 gpu conv fp32 1 
-26 gpu batchnorm fp16 1 
-27 gpu relu fp16 1 
-28 gpu group_conv fp16 1 
-29 gpu batchnorm fp16 1 
-30 gpu relu fp16 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
 31 gpu conv fp32 1 
-32 gpu batchnorm fp16 1 
-33 gpu relu fp16 1 
-34 gpu group_conv fp16 1 
-35 gpu batchnorm fp16 1 
-36 gpu relu fp16 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
 37 gpu conv fp32 1 
-38 gpu batchnorm fp16 1 
-39 gpu relu fp16 1 
-40 gpu group_conv fp16 1 
-41 gpu batchnorm fp16 1 
-42 gpu relu fp16 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu group_conv fp32 1 
+41 gpu batchnorm fp32 1 
+42 gpu relu fp32 1 
 43 gpu conv fp32 1 
-44 gpu batchnorm fp16 1 
-45 gpu relu fp16 1 
-46 gpu group_conv fp16 1 
-47 gpu batchnorm fp16 1 
-48 gpu relu fp16 1 
+44 gpu batchnorm fp32 1 
+45 gpu relu fp32 1 
+46 gpu group_conv fp32 1 
+47 gpu batchnorm fp32 1 
+48 gpu relu fp32 1 
 49 gpu conv fp32 1 
-50 gpu batchnorm fp16 1 
-51 gpu relu fp16 1 
-52 gpu group_conv fp16 1 
-53 gpu batchnorm fp16 1 
-54 gpu relu fp16 1 
+50 gpu batchnorm fp32 1 
+51 gpu relu fp32 1 
+52 gpu group_conv fp32 1 
+53 gpu batchnorm fp32 1 
+54 gpu relu fp32 1 
 55 gpu conv fp32 1 
-56 gpu batchnorm fp16 1 
-57 gpu relu fp16 1 
-58 gpu group_conv fp16 1 
-59 gpu batchnorm fp16 1 
-60 gpu relu fp16 1 
+56 gpu batchnorm fp32 1 
+57 gpu relu fp32 1 
+58 gpu group_conv fp32 1 
+59 gpu batchnorm fp32 1 
+60 gpu relu fp32 1 
 61 gpu conv fp32 1 
-62 gpu batchnorm fp16 1 
-63 gpu relu fp16 1 
-64 gpu group_conv fp16 1 
-65 gpu batchnorm fp16 1 
-66 gpu relu fp16 1 
+62 gpu batchnorm fp32 1 
+63 gpu relu fp32 1 
+64 gpu group_conv fp32 1 
+65 gpu batchnorm fp32 1 
+66 gpu relu fp32 1 
 67 gpu conv fp32 1 
-68 gpu batchnorm fp16 1 
-69 gpu relu fp16 1 
-70 gpu group_conv fp16 1 
-71 gpu batchnorm fp16 1 
-72 gpu relu fp16 1 
+68 gpu batchnorm fp32 1 
+69 gpu relu fp32 1 
+70 gpu group_conv fp32 1 
+71 gpu batchnorm fp32 1 
+72 gpu relu fp32 1 
 73 gpu conv fp32 1 
-74 gpu batchnorm fp16 1 
-75 gpu relu fp16 1 
-76 gpu group_conv fp16 1 
-77 gpu batchnorm fp16 1 
-78 gpu relu fp16 1 
+74 gpu batchnorm fp32 1 
+75 gpu relu fp32 1 
+76 gpu group_conv fp32 1 
+77 gpu batchnorm fp32 1 
+78 gpu relu fp32 1 
 79 gpu conv fp32 1 
-80 gpu batchnorm fp16 1 
-81 gpu relu fp16 1 
-82 gpu pool_mean fp16 1 
+80 gpu batchnorm fp32 1 
+81 gpu relu fp32 1 
+82 gpu pool_mean fp32 1 
 83 gpu mul fp32 1 add fp32 1 
 84 gpu softmax fp32 1
 -----
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt
index bf55690f22ad5f9a3de72bd16d4907d8099512a9..04d1491bc7ddcfd94ce837cc830fa0874496842e 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/data/autotuner_data/tuner_promise_confs_batch220_single.txt
@@ -1,87 +1,87 @@
 +++++
 conf1 1 0 83.5 0
 1 gpu conv fp32 1 
-2 gpu batchnorm fp16 1 
-3 gpu relu fp16 1 
-4 gpu group_conv fp16 1 
-5 gpu batchnorm fp16 1 
-6 gpu relu fp16 1 
+2 gpu batchnorm fp32 1 
+3 gpu relu fp32 1 
+4 gpu group_conv fp32 1 
+5 gpu batchnorm fp32 1 
+6 gpu relu fp32 1 
 7 gpu conv fp32 1 
-8 gpu batchnorm fp16 1 
-9 gpu relu fp16 1 
-10 gpu group_conv fp16 1 
-11 gpu batchnorm fp16 1 
-12 gpu relu fp16 1 
+8 gpu batchnorm fp32 1 
+9 gpu relu fp32 1 
+10 gpu group_conv fp32 1 
+11 gpu batchnorm fp32 1 
+12 gpu relu fp32 1 
 13 gpu conv fp32 1 
-14 gpu batchnorm fp16 1 
-15 gpu relu fp16 1 
-16 gpu group_conv fp16 1 
-17 gpu batchnorm fp16 1 
-18 gpu relu fp16 1 
+14 gpu batchnorm fp32 1 
+15 gpu relu fp32 1 
+16 gpu group_conv fp32 1 
+17 gpu batchnorm fp32 1 
+18 gpu relu fp32 1 
 19 gpu conv fp32 1 
-20 gpu batchnorm fp16 1 
-21 gpu relu fp16 1 
-22 gpu group_conv fp16 1 
-23 gpu batchnorm fp16 1 
-24 gpu relu fp16 1 
+20 gpu batchnorm fp32 1 
+21 gpu relu fp32 1 
+22 gpu group_conv fp32 1 
+23 gpu batchnorm fp32 1 
+24 gpu relu fp32 1 
 25 gpu conv fp32 1 
-26 gpu batchnorm fp16 1 
-27 gpu relu fp16 1 
-28 gpu group_conv fp16 1 
-29 gpu batchnorm fp16 1 
-30 gpu relu fp16 1 
+26 gpu batchnorm fp32 1 
+27 gpu relu fp32 1 
+28 gpu group_conv fp32 1 
+29 gpu batchnorm fp32 1 
+30 gpu relu fp32 1 
 31 gpu conv fp32 1 
-32 gpu batchnorm fp16 1 
-33 gpu relu fp16 1 
-34 gpu group_conv fp16 1 
-35 gpu batchnorm fp16 1 
-36 gpu relu fp16 1 
+32 gpu batchnorm fp32 1 
+33 gpu relu fp32 1 
+34 gpu group_conv fp32 1 
+35 gpu batchnorm fp32 1 
+36 gpu relu fp32 1 
 37 gpu conv fp32 1 
-38 gpu batchnorm fp16 1 
-39 gpu relu fp16 1 
-40 gpu group_conv fp16 1 
-41 gpu batchnorm fp16 1 
-42 gpu relu fp16 1 
+38 gpu batchnorm fp32 1 
+39 gpu relu fp32 1 
+40 gpu group_conv fp32 1 
+41 gpu batchnorm fp32 1 
+42 gpu relu fp32 1 
 43 gpu conv fp32 1 
-44 gpu batchnorm fp16 1 
-45 gpu relu fp16 1 
-46 gpu group_conv fp16 1 
-47 gpu batchnorm fp16 1 
-48 gpu relu fp16 1 
+44 gpu batchnorm fp32 1 
+45 gpu relu fp32 1 
+46 gpu group_conv fp32 1 
+47 gpu batchnorm fp32 1 
+48 gpu relu fp32 1 
 49 gpu conv fp32 1 
-50 gpu batchnorm fp16 1 
-51 gpu relu fp16 1 
-52 gpu group_conv fp16 1 
-53 gpu batchnorm fp16 1 
-54 gpu relu fp16 1 
+50 gpu batchnorm fp32 1 
+51 gpu relu fp32 1 
+52 gpu group_conv fp32 1 
+53 gpu batchnorm fp32 1 
+54 gpu relu fp32 1 
 55 gpu conv fp32 1 
-56 gpu batchnorm fp16 1 
-57 gpu relu fp16 1 
-58 gpu group_conv fp16 1 
-59 gpu batchnorm fp16 1 
-60 gpu relu fp16 1 
+56 gpu batchnorm fp32 1 
+57 gpu relu fp32 1 
+58 gpu group_conv fp32 1 
+59 gpu batchnorm fp32 1 
+60 gpu relu fp32 1 
 61 gpu conv fp32 1 
-62 gpu batchnorm fp16 1 
-63 gpu relu fp16 1 
-64 gpu group_conv fp16 1 
-65 gpu batchnorm fp16 1 
-66 gpu relu fp16 1 
+62 gpu batchnorm fp32 1 
+63 gpu relu fp32 1 
+64 gpu group_conv fp32 1 
+65 gpu batchnorm fp32 1 
+66 gpu relu fp32 1 
 67 gpu conv fp32 1 
-68 gpu batchnorm fp16 1 
-69 gpu relu fp16 1 
-70 gpu group_conv fp16 1 
-71 gpu batchnorm fp16 1 
-72 gpu relu fp16 1 
+68 gpu batchnorm fp32 1 
+69 gpu relu fp32 1 
+70 gpu group_conv fp32 1 
+71 gpu batchnorm fp32 1 
+72 gpu relu fp32 1 
 73 gpu conv fp32 1 
-74 gpu batchnorm fp16 1 
-75 gpu relu fp16 1 
-76 gpu group_conv fp16 1 
-77 gpu batchnorm fp16 1 
-78 gpu relu fp16 1 
+74 gpu batchnorm fp32 1 
+75 gpu relu fp32 1 
+76 gpu group_conv fp32 1 
+77 gpu batchnorm fp32 1 
+78 gpu relu fp32 1 
 79 gpu conv fp32 1 
-80 gpu batchnorm fp16 1 
-81 gpu relu fp16 1 
-82 gpu pool_mean fp16 1 
+80 gpu batchnorm fp32 1 
+81 gpu relu fp32 1 
+82 gpu pool_mean fp32 1 
 83 gpu mul fp32 1 add fp32 1 
 84 gpu softmax fp32 1
 -----
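
Since every hunk in this patch makes the same kind of change — rewriting the `conf1` baseline so that no fp16 or approximate knob survives in it — a small check over the patched files can confirm the invariant. A sketch reusing the hypothetical `parse_tuner_confs` from earlier in this patch:

```python
def baseline_is_pure_fp32(path):
    """Return True if conf1 in the given tuner file runs everything on
    the gpu target in fp32, with no fp16/perf/samp knobs remaining."""
    confs = parse_tuner_confs(open(path).read())
    conf1 = next(c for c in confs if c["name"] == "conf1")
    banned = {"fp16", "perf", "samp"}
    return all(layer["target"] == "gpu" and not banned & set(layer["knobs"])
               for layer in conf1["layers"])
```

Run against the mobilenet files above, this should return True after the patch and False before it; the resnet18 files that follow get the same treatment.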
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt
index 99aac992148120d1b4c9937b4d5464a137806d4a..5a0463b97eb4e36e097cfcef5383474e85ab6076 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_confs_batch220.txt
@@ -1,46 +1,46 @@
 +++++
 conf1 1 0 89.4 0
-1 gpu conv fp32 1 add fp32 1 relu fp32 1 
-2 gpu conv fp32 1 add fp32 1 relu fp32 1 
-3 gpu conv fp32 1 add fp32 1 
-4 gpu add fp16 1 
-5 gpu relu fp16 1 
-6 gpu conv fp32 1 add fp32 1 relu fp32 1 
-7 gpu conv fp32 1 add fp32 1 
-8 gpu add fp16 1 
-9 gpu relu fp16 1 
-10 gpu conv fp32 1 add fp32 1 relu fp32 1 
-11 gpu conv fp32 1 add fp32 1 
-12 gpu add fp16 1 
-13 gpu relu fp16 1 
-14 gpu conv fp32 1 add fp32 1 relu fp32 1 
-15 gpu conv fp32 1 add fp32 1 
-16 gpu conv fp32 1 add fp32 1 
-17 gpu add fp16 1 
-18 gpu relu fp16 1 
-19 gpu conv fp32 1 add fp32 1 relu fp32 1 
-20 gpu conv fp32 1 add fp32 1 
-21 gpu add fp16 1 
-22 gpu relu fp16 1 
-23 gpu conv fp32 1 add fp32 1 relu fp32 1 
-24 gpu conv fp32 1 add fp32 1 
-25 gpu add fp16 1 
-26 gpu relu fp16 1 
-27 gpu conv fp32 1 add fp32 1 relu fp32 1 
-28 gpu conv fp32 1 add fp32 1 
-29 gpu conv fp32 1 add fp32 1 
-30 gpu add fp16 1 
-31 gpu relu fp16 1 
-32 gpu conv fp32 1 add fp32 1 relu fp32 1 
-33 gpu conv fp32 1 add fp32 1 
-34 gpu add fp16 1 
-35 gpu relu fp16 1 
-36 gpu conv fp32 1 add fp32 1 relu fp32 1 
-37 gpu conv fp32 1 add fp32 1 
-38 gpu add fp16 1 
-39 gpu relu fp16 1 
-40 gpu pool_max fp16 1 
-41 gpu mul fp32 1 add fp32 1 
+1 gpu conv fp32 1 add fp32 1 relu fp32 1
+2 gpu conv fp32 1 add fp32 1 relu fp32 1
+3 gpu conv fp32 1 add fp32 1
+4 gpu add fp32 1
+5 gpu relu fp32 1
+6 gpu conv fp32 1 add fp32 1 relu fp32 1
+7 gpu conv fp32 1 add fp32 1
+8 gpu add fp32 1
+9 gpu relu fp32 1
+10 gpu conv fp32 1 add fp32 1 relu fp32 1
+11 gpu conv fp32 1 add fp32 1
+12 gpu add fp32 1
+13 gpu relu fp32 1
+14 gpu conv fp32 1 add fp32 1 relu fp32 1
+15 gpu conv fp32 1 add fp32 1
+16 gpu conv fp32 1 add fp32 1
+17 gpu add fp32 1
+18 gpu relu fp32 1
+19 gpu conv fp32 1 add fp32 1 relu fp32 1
+20 gpu conv fp32 1 add fp32 1
+21 gpu add fp32 1
+22 gpu relu fp32 1
+23 gpu conv fp32 1 add fp32 1 relu fp32 1
+24 gpu conv fp32 1 add fp32 1
+25 gpu add fp32 1
+26 gpu relu fp32 1
+27 gpu conv fp32 1 add fp32 1 relu fp32 1
+28 gpu conv fp32 1 add fp32 1
+29 gpu conv fp32 1 add fp32 1
+30 gpu add fp32 1
+31 gpu relu fp32 1
+32 gpu conv fp32 1 add fp32 1 relu fp32 1
+33 gpu conv fp32 1 add fp32 1
+34 gpu add fp32 1
+35 gpu relu fp32 1
+36 gpu conv fp32 1 add fp32 1 relu fp32 1
+37 gpu conv fp32 1 add fp32 1
+38 gpu add fp32 1
+39 gpu relu fp32 1
+40 gpu pool_max fp32 1
+41 gpu mul fp32 1 add fp32 1
 42 gpu softmax fp32 1
 -----
 +++++
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt
index eaafddc7dd76089812103a759497386dff80854c..ccc9576535cf2b1f05427fc4cb2247dbb0958c12 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_pareto_confs_batch220.txt
@@ -1,46 +1,46 @@
 +++++
 conf1 1 0 89.4 0
-1 gpu conv fp32 1 add fp32 1 relu fp32 1 
-2 gpu conv fp32 1 add fp32 1 relu fp32 1 
-3 gpu conv fp32 1 add fp32 1 
-4 gpu add fp16 1 
-5 gpu relu fp16 1 
-6 gpu conv fp32 1 add fp32 1 relu fp32 1 
-7 gpu conv fp32 1 add fp32 1 
-8 gpu add fp16 1 
-9 gpu relu fp16 1 
-10 gpu conv fp32 1 add fp32 1 relu fp32 1 
-11 gpu conv fp32 1 add fp32 1 
-12 gpu add fp16 1 
-13 gpu relu fp16 1 
-14 gpu conv fp32 1 add fp32 1 relu fp32 1 
-15 gpu conv fp32 1 add fp32 1 
-16 gpu conv fp32 1 add fp32 1 
-17 gpu add fp16 1 
-18 gpu relu fp16 1 
-19 gpu conv fp32 1 add fp32 1 relu fp32 1 
-20 gpu conv fp32 1 add fp32 1 
-21 gpu add fp16 1 
-22 gpu relu fp16 1 
-23 gpu conv fp32 1 add fp32 1 relu fp32 1 
-24 gpu conv fp32 1 add fp32 1 
-25 gpu add fp16 1 
-26 gpu relu fp16 1 
-27 gpu conv fp32 1 add fp32 1 relu fp32 1 
-28 gpu conv fp32 1 add fp32 1 
-29 gpu conv fp32 1 add fp32 1 
-30 gpu add fp16 1 
-31 gpu relu fp16 1 
-32 gpu conv fp32 1 add fp32 1 relu fp32 1 
-33 gpu conv fp32 1 add fp32 1 
-34 gpu add fp16 1 
-35 gpu relu fp16 1 
-36 gpu conv fp32 1 add fp32 1 relu fp32 1 
-37 gpu conv fp32 1 add fp32 1 
-38 gpu add fp16 1 
-39 gpu relu fp16 1 
-40 gpu pool_max fp16 1 
-41 gpu mul fp32 1 add fp32 1 
+1 gpu conv fp32 1 add fp32 1 relu fp32 1
+2 gpu conv fp32 1 add fp32 1 relu fp32 1
+3 gpu conv fp32 1 add fp32 1
+4 gpu add fp32 1
+5 gpu relu fp32 1
+6 gpu conv fp32 1 add fp32 1 relu fp32 1
+7 gpu conv fp32 1 add fp32 1
+8 gpu add fp32 1
+9 gpu relu fp32 1
+10 gpu conv fp32 1 add fp32 1 relu fp32 1
+11 gpu conv fp32 1 add fp32 1
+12 gpu add fp32 1
+13 gpu relu fp32 1
+14 gpu conv fp32 1 add fp32 1 relu fp32 1
+15 gpu conv fp32 1 add fp32 1
+16 gpu conv fp32 1 add fp32 1
+17 gpu add fp32 1
+18 gpu relu fp32 1
+19 gpu conv fp32 1 add fp32 1 relu fp32 1
+20 gpu conv fp32 1 add fp32 1
+21 gpu add fp32 1
+22 gpu relu fp32 1
+23 gpu conv fp32 1 add fp32 1 relu fp32 1
+24 gpu conv fp32 1 add fp32 1
+25 gpu add fp32 1
+26 gpu relu fp32 1
+27 gpu conv fp32 1 add fp32 1 relu fp32 1
+28 gpu conv fp32 1 add fp32 1
+29 gpu conv fp32 1 add fp32 1
+30 gpu add fp32 1
+31 gpu relu fp32 1
+32 gpu conv fp32 1 add fp32 1 relu fp32 1
+33 gpu conv fp32 1 add fp32 1
+34 gpu add fp32 1
+35 gpu relu fp32 1
+36 gpu conv fp32 1 add fp32 1 relu fp32 1
+37 gpu conv fp32 1 add fp32 1
+38 gpu add fp32 1
+39 gpu relu fp32 1
+40 gpu pool_max fp32 1
+41 gpu mul fp32 1 add fp32 1
 42 gpu softmax fp32 1
 -----
 +++++
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt
index 793a33e54ea2e49a724fb5ee75de14d6be608725..fac96ced244ed77d41ed236c60a5aa0f0cc84c30 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_multi.txt
@@ -1,46 +1,46 @@
 +++++
 conf1 1 0 89.4 0
-1 gpu conv fp32 1 add fp32 1 relu fp32 1 
-2 gpu conv fp32 1 add fp32 1 relu fp32 1 
-3 gpu conv fp32 1 add fp32 1 
-4 gpu add fp16 1 
-5 gpu relu fp16 1 
-6 gpu conv fp32 1 add fp32 1 relu fp32 1 
-7 gpu conv fp32 1 add fp32 1 
-8 gpu add fp16 1 
-9 gpu relu fp16 1 
-10 gpu conv fp32 1 add fp32 1 relu fp32 1 
-11 gpu conv fp32 1 add fp32 1 
-12 gpu add fp16 1 
-13 gpu relu fp16 1 
-14 gpu conv fp32 1 add fp32 1 relu fp32 1 
-15 gpu conv fp32 1 add fp32 1 
-16 gpu conv fp32 1 add fp32 1 
-17 gpu add fp16 1 
-18 gpu relu fp16 1 
-19 gpu conv fp32 1 add fp32 1 relu fp32 1 
-20 gpu conv fp32 1 add fp32 1 
-21 gpu add fp16 1 
-22 gpu relu fp16 1 
-23 gpu conv fp32 1 add fp32 1 relu fp32 1 
-24 gpu conv fp32 1 add fp32 1 
-25 gpu add fp16 1 
-26 gpu relu fp16 1 
-27 gpu conv fp32 1 add fp32 1 relu fp32 1 
-28 gpu conv fp32 1 add fp32 1 
-29 gpu conv fp32 1 add fp32 1 
-30 gpu add fp16 1 
-31 gpu relu fp16 1 
-32 gpu conv fp32 1 add fp32 1 relu fp32 1 
-33 gpu conv fp32 1 add fp32 1 
-34 gpu add fp16 1 
-35 gpu relu fp16 1 
-36 gpu conv fp32 1 add fp32 1 relu fp32 1 
-37 gpu conv fp32 1 add fp32 1 
-38 gpu add fp16 1 
-39 gpu relu fp16 1 
-40 gpu pool_max fp16 1 
-41 gpu mul fp32 1 add fp32 1 
+1 gpu conv fp32 1 add fp32 1 relu fp32 1
+2 gpu conv fp32 1 add fp32 1 relu fp32 1
+3 gpu conv fp32 1 add fp32 1
+4 gpu add fp32 1
+5 gpu relu fp32 1
+6 gpu conv fp32 1 add fp32 1 relu fp32 1
+7 gpu conv fp32 1 add fp32 1
+8 gpu add fp32 1
+9 gpu relu fp32 1
+10 gpu conv fp32 1 add fp32 1 relu fp32 1
+11 gpu conv fp32 1 add fp32 1
+12 gpu add fp32 1
+13 gpu relu fp32 1
+14 gpu conv fp32 1 add fp32 1 relu fp32 1
+15 gpu conv fp32 1 add fp32 1
+16 gpu conv fp32 1 add fp32 1
+17 gpu add fp32 1
+18 gpu relu fp32 1
+19 gpu conv fp32 1 add fp32 1 relu fp32 1
+20 gpu conv fp32 1 add fp32 1
+21 gpu add fp32 1
+22 gpu relu fp32 1
+23 gpu conv fp32 1 add fp32 1 relu fp32 1
+24 gpu conv fp32 1 add fp32 1
+25 gpu add fp32 1
+26 gpu relu fp32 1
+27 gpu conv fp32 1 add fp32 1 relu fp32 1
+28 gpu conv fp32 1 add fp32 1
+29 gpu conv fp32 1 add fp32 1
+30 gpu add fp32 1
+31 gpu relu fp32 1
+32 gpu conv fp32 1 add fp32 1 relu fp32 1
+33 gpu conv fp32 1 add fp32 1
+34 gpu add fp32 1
+35 gpu relu fp32 1
+36 gpu conv fp32 1 add fp32 1 relu fp32 1
+37 gpu conv fp32 1 add fp32 1
+38 gpu add fp32 1
+39 gpu relu fp32 1
+40 gpu pool_max fp32 1
+41 gpu mul fp32 1 add fp32 1
 42 gpu softmax fp32 1
 -----
 +++++
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt
index 714c965a13922470adbd5d44461c794fb3729b2f..0f0348b8f264eb606bb274cef7b5ba206e03c705 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/autotuner_data/tuner_promise_confs_batch220_single.txt
@@ -1,46 +1,46 @@
 +++++
 conf1 1 0 89.4 0
-1 gpu conv fp32 1 add fp32 1 relu fp32 1 
-2 gpu conv fp32 1 add fp32 1 relu fp32 1 
-3 gpu conv fp32 1 add fp32 1 
-4 gpu add fp16 1 
-5 gpu relu fp16 1 
-6 gpu conv fp32 1 add fp32 1 relu fp32 1 
-7 gpu conv fp32 1 add fp32 1 
-8 gpu add fp16 1 
-9 gpu relu fp16 1 
-10 gpu conv fp32 1 add fp32 1 relu fp32 1 
-11 gpu conv fp32 1 add fp32 1 
-12 gpu add fp16 1 
-13 gpu relu fp16 1 
-14 gpu conv fp32 1 add fp32 1 relu fp32 1 
-15 gpu conv fp32 1 add fp32 1 
-16 gpu conv fp32 1 add fp32 1 
-17 gpu add fp16 1 
-18 gpu relu fp16 1 
-19 gpu conv fp32 1 add fp32 1 relu fp32 1 
-20 gpu conv fp32 1 add fp32 1 
-21 gpu add fp16 1 
-22 gpu relu fp16 1 
-23 gpu conv fp32 1 add fp32 1 relu fp32 1 
-24 gpu conv fp32 1 add fp32 1 
-25 gpu add fp16 1 
-26 gpu relu fp16 1 
-27 gpu conv fp32 1 add fp32 1 relu fp32 1 
-28 gpu conv fp32 1 add fp32 1 
-29 gpu conv fp32 1 add fp32 1 
-30 gpu add fp16 1 
-31 gpu relu fp16 1 
-32 gpu conv fp32 1 add fp32 1 relu fp32 1 
-33 gpu conv fp32 1 add fp32 1 
-34 gpu add fp16 1 
-35 gpu relu fp16 1 
-36 gpu conv fp32 1 add fp32 1 relu fp32 1 
-37 gpu conv fp32 1 add fp32 1 
-38 gpu add fp16 1 
-39 gpu relu fp16 1 
-40 gpu pool_max fp16 1 
-41 gpu mul fp32 1 add fp32 1 
+1 gpu conv fp32 1 add fp32 1 relu fp32 1
+2 gpu conv fp32 1 add fp32 1 relu fp32 1
+3 gpu conv fp32 1 add fp32 1
+4 gpu add fp32 1
+5 gpu relu fp32 1
+6 gpu conv fp32 1 add fp32 1 relu fp32 1
+7 gpu conv fp32 1 add fp32 1
+8 gpu add fp32 1
+9 gpu relu fp32 1
+10 gpu conv fp32 1 add fp32 1 relu fp32 1
+11 gpu conv fp32 1 add fp32 1
+12 gpu add fp32 1
+13 gpu relu fp32 1
+14 gpu conv fp32 1 add fp32 1 relu fp32 1
+15 gpu conv fp32 1 add fp32 1
+16 gpu conv fp32 1 add fp32 1
+17 gpu add fp32 1
+18 gpu relu fp32 1
+19 gpu conv fp32 1 add fp32 1 relu fp32 1
+20 gpu conv fp32 1 add fp32 1
+21 gpu add fp32 1
+22 gpu relu fp32 1
+23 gpu conv fp32 1 add fp32 1 relu fp32 1
+24 gpu conv fp32 1 add fp32 1
+25 gpu add fp32 1
+26 gpu relu fp32 1
+27 gpu conv fp32 1 add fp32 1 relu fp32 1
+28 gpu conv fp32 1 add fp32 1
+29 gpu conv fp32 1 add fp32 1
+30 gpu add fp32 1
+31 gpu relu fp32 1
+32 gpu conv fp32 1 add fp32 1 relu fp32 1
+33 gpu conv fp32 1 add fp32 1
+34 gpu add fp32 1
+35 gpu relu fp32 1
+36 gpu conv fp32 1 add fp32 1 relu fp32 1
+37 gpu conv fp32 1 add fp32 1
+38 gpu add fp32 1
+39 gpu relu fp32 1
+40 gpu pool_max fp32 1
+41 gpu mul fp32 1 add fp32 1
 42 gpu softmax fp32 1
 -----
 +++++