diff --git a/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/alexnet/alexnet_valid_soc.txt b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/alexnet/alexnet_valid_soc.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b7aeb981c745717c52c841f99672cfbd532f7cb --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/alexnet/alexnet_valid_soc.txt @@ -0,0 +1,231 @@ +2725.121326 ++++++ +conf1 1 1 78.78 0.0 +1 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +2 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 tanh fp32 1 +4 gpu conv fp32 11 add fp32 1 tanh fp32 1 +5 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +6 gpu mul fp32 11 add fp32 1 +7 gpu softmax fp32 1 +----- ++++++ +conf2 2.1233638648528457 1.6150951710244676 78.3544 0.42560000000000286 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf3 2.051295134864554 1.6122580072322763 78.3278 0.4522000000000048 +1 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf4 2.188609573694276 1.688911612634961 78.30120000000001 0.47879999999999256 +1 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 268 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf5 2.0570505767108007 1.6000014977491621 78.2214 0.5585999999999984 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 265 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf6 2.009166522889861 1.5755494376470724 78.1948 0.5852000000000004 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf7 2.0188668300066377 1.5976556515195433 78.06179999999999 0.7182000000000102 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 268 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 266 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf8 2.1797184471932716 1.6767378001241562 78.06179999999999 0.7182000000000102 +1 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf9 2.064914192886025 1.6203964986881603 78.06179999999999 0.7182000000000102 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf10 2.2070171560926672 1.7194657877315815 78.0352 0.7447999999999979 +1 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 265 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf11 2.0161469236407057 1.5964768988685245 78.0086 0.7713999999999999 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf12 2.157846755426679 1.6765250202752133 78.0086 0.7713999999999999 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf13 2.0319664118931096 1.6183541826275754 77.98200000000001 0.7979999999999876 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf14 2.354997704376988 1.7779732164691666 77.98200000000001 0.7979999999999876 +1 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv fp16 12 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf15 2.3463673263694 1.8510470086526165 77.98200000000001 0.7979999999999876 +1 gpu conv samp_fp16 264 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf16 2.284714727579521 1.7855758235498087 77.7692 1.0108000000000033 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +5 gpu conv samp_fp16 269 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf17 2.3463673263694 1.8510470086526165 77.68939999999999 1.0906000000000091 +1 gpu conv samp_fp16 264 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf18 2.427840309027486 1.9007943438562696 77.68939999999999 1.0906000000000091 +1 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 263 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf19 2.4671009475732766 1.9246545843862224 77.47659999999999 1.3034000000000106 +1 gpu conv samp_fp16 264 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf20 2.5567127702266332 1.9773019485322874 77.2638 1.5161999999999978 +1 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf21 2.557898283218207 1.9895818051250724 77.2372 1.5427999999999997 +1 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf22 2.557898283218207 1.9895818051250724 77.21060000000001 1.5693999999999875 +1 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- ++++++ +conf23 2.6457265307759883 2.029290916760937 77.1574 1.6226000000000056 +1 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +6 gpu mul fp16 12 add fp16 12 +7 gpu softmax fp16 12 +----- diff --git a/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/alexnet2/alexnet2_valid_soc.txt b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/alexnet2/alexnet2_valid_soc.txt new file mode 100644 index 0000000000000000000000000000000000000000..a888b5ee5a50d140f60d6579a3f6bdb6aa5ddfbd --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/alexnet2/alexnet2_valid_soc.txt @@ -0,0 +1,188 @@ +1129.3450630000002 ++++++ +conf1 1 1 84.76 0.0 +1 gpu conv fp32 11 add fp32 1 tanh fp32 1 +2 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 tanh fp32 1 +4 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 tanh fp32 1 +6 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +7 gpu mul fp32 11 add fp32 1 +8 gpu softmax fp32 1 +----- ++++++ +conf2 2.2258170210610477 1.3875307929727092 84.74 0.020000000000010232 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 151 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf3 2.3673182996864846 1.4566777038051897 84.49999999999999 0.2600000000000193 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf4 2.24614762418964 1.41800542976017 84.25999999999999 0.5000000000000142 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 158 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 268 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf5 2.304084258604824 1.4284953488024343 84.228 0.5320000000000107 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 151 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 267 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf6 2.3377766277342653 1.4440340860007412 84.228 0.5320000000000107 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +6 gpu conv fp16 12 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf7 2.24614762418964 1.41800542976017 84.17479999999999 0.5852000000000146 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 158 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 268 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf8 2.3673182996864846 1.4566777038051897 84.095 0.6650000000000063 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf9 2.2463714607055545 1.417884448648111 83.8024 0.9575999999999993 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 158 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 266 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf10 2.389025803395913 1.4732901147183992 83.77579999999999 0.9842000000000155 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 268 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf11 2.288831273542033 1.435952475412438 83.61619999999999 1.143800000000013 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 158 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf12 2.288831273542033 1.435952475412438 83.58959999999999 1.170400000000015 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 158 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf13 2.389025803395913 1.4732901147183992 83.58959999999999 1.170400000000015 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 268 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf14 2.3892790238475423 1.4731595166090572 83.4566 1.3034000000000106 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 266 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf15 2.390450803781405 1.4707319718833016 83.3768 1.3832000000000022 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 266 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 157 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf16 2.4373708430335537 1.49267343110314 83.3768 1.3832000000000022 +1 gpu conv fp16 11 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- ++++++ +conf17 2.4373708430335537 1.49267343110314 83.2704 1.48960000000001 +1 gpu conv fp16 12 add fp16 12 tanh fp16 12 +2 gpu conv perf_fp16 153 add fp16 12 tanh fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 tanh fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 261 add fp16 12 tanh fp16 12 +6 gpu conv perf_fp16 160 add fp16 12 tanh fp16 12 pool_max fp16 12 +7 gpu mul fp16 12 add fp16 12 +8 gpu softmax fp16 12 +----- diff --git a/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/resnet18/resnet18_valid_soc.txt b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/resnet18/resnet18_valid_soc.txt new file mode 100644 index 0000000000000000000000000000000000000000..942789c1c4defd1139e75209ffbcb073a2b39b30 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/resnet18/resnet18_valid_soc.txt @@ -0,0 +1,1576 @@ +2593.3013975999997 ++++++ +conf1 1 1 89.42 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 +3 gpu conv fp32 11 add fp32 1 +4 gpu add fp32 11 +5 gpu relu fp32 11 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 +8 gpu add fp32 11 +9 gpu relu fp32 11 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 +11 gpu conv fp32 11 add fp32 1 +12 gpu add fp32 11 +13 gpu relu fp32 11 +14 gpu conv fp32 11 add fp32 1 relu fp32 1 +15 gpu conv fp32 11 add fp32 1 +16 gpu conv fp32 11 add fp32 1 +17 gpu add fp32 11 +18 gpu relu fp32 11 +19 gpu conv fp32 11 add fp32 1 relu fp32 1 +20 gpu conv fp32 11 add fp32 1 +21 gpu add fp32 11 +22 gpu relu fp32 11 +23 gpu conv fp32 11 add fp32 1 relu fp32 1 +24 gpu conv fp32 11 add fp32 1 +25 gpu add fp32 11 +26 gpu relu fp32 11 +27 gpu conv fp32 11 add fp32 1 relu fp32 1 +28 gpu conv fp32 11 add fp32 1 +29 gpu conv fp32 11 add fp32 1 +30 gpu add fp32 11 +31 gpu relu fp32 11 +32 gpu conv fp32 11 add fp32 1 relu fp32 1 +33 gpu conv fp32 11 add fp32 1 +34 gpu add fp32 11 +35 gpu relu fp32 11 +36 gpu conv fp32 11 add fp32 1 relu fp32 1 +37 gpu conv fp32 11 add fp32 1 +38 gpu add fp32 11 +39 gpu relu fp32 11 +40 gpu pool_mean fp32 11 +41 gpu mul fp32 11 add fp32 1 +42 gpu softmax fp32 1 +----- ++++++ +conf2 1.8227860146926984 1.3592380545823108 88.28 1.1400000000000006 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 162 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 166 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf3 1.772745264351603 1.3340968704252147 88.2 1.2199999999999989 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 166 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf4 1.831301934833889 1.3636544094268177 88.2 1.2199999999999989 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf5 1.7541385118416233 1.323200331238725 88.12 1.2999999999999972 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 166 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf6 1.750881760437994 1.3214899710791683 88.12 1.2999999999999972 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 166 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf7 1.9207420870636576 1.4105446231099241 88.1 1.3200000000000074 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv fp16 12 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf8 1.897654446584276 1.3943617562849198 88.1 1.3200000000000074 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 263 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv perf_fp16 154 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv samp_fp16 262 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf9 1.9276001243246026 1.4155139358802007 88.08 1.3400000000000034 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 168 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 155 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf10 1.8877611861107602 1.3945090937373315 88.03999999999999 1.3800000000000097 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 154 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 166 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf11 1.884015904997108 1.386748889441216 87.96000000000001 1.4599999999999937 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 263 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv perf_fp16 154 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv samp_fp16 262 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf12 1.815742308450095 1.3541765419789824 87.83999999999999 1.5800000000000125 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 262 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf13 1.928011277898605 1.414528053850526 87.83999999999999 1.5800000000000125 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 155 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf14 1.8702574116471649 1.3838796270391824 87.8 1.6200000000000045 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 269 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf15 1.9390257777318618 1.4193909923193697 87.8 1.6200000000000045 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv perf_fp16 154 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 155 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf16 1.8505712546542585 1.372601565984325 87.76 1.6599999999999966 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf17 1.931335957581042 1.4149043748735137 87.74 1.6800000000000068 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 157 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf18 1.8390656100510818 1.3668229301466752 87.68 1.7399999999999949 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf19 1.9360126662655235 1.416245073512222 87.64 1.7800000000000011 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 155 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 264 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf20 1.826739398491775 1.3609522133620269 87.62 1.7999999999999972 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 153 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv samp_fp16 262 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 165 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf21 1.8243322012642802 1.3542277148411042 87.62 1.7999999999999972 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 263 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf22 1.8245510435946863 1.3601414031759373 87.58 1.8400000000000034 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv samp_fp16 269 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf23 1.9832010015590205 1.4407797001367388 87.56 1.8599999999999994 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 261 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 155 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf24 1.831958859203629 1.3643626254848584 87.5 1.9200000000000017 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 151 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf25 1.827209961997738 1.3576190436536635 87.5 1.9200000000000017 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 263 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv perf_fp16 154 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv samp_fp16 262 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf26 1.9532893879837718 1.4253186875342474 87.5 1.9200000000000017 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 168 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 262 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf27 1.8598315807624513 1.376813374656673 87.48 1.9399999999999977 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf28 1.8545931630272876 1.3744725755811524 87.48 1.9399999999999977 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 267 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 152 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf29 1.9088935397779812 1.4033062374488858 87.44 1.980000000000004 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 163 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 267 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf30 1.8306014158563824 1.3613821654101905 87.44 1.980000000000004 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 164 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 265 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 168 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 262 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf31 1.9755297077095708 1.4378811225069261 87.44 1.980000000000004 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 159 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 159 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv perf_fp16 154 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 155 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf32 1.827200177575606 1.356175543415313 87.38 2.0400000000000063 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 156 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 264 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv perf_fp16 167 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf33 1.8517276001191023 1.3729319418960464 87.38 2.0400000000000063 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +7 gpu conv fp16 12 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv samp_fp16 269 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 157 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 12 relu fp16 12 +24 gpu conv perf_fp16 160 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv samp_fp16 268 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 12 relu fp16 12 +37 gpu conv samp_fp16 269 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf34 1.8938192956663813 1.3919348631813433 87.38 2.0400000000000063 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 263 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv fp16 11 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv samp_fp16 262 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- ++++++ +conf35 1.8989539669005067 1.3938360809175603 87.36 2.0600000000000023 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv fp16 12 add fp16 12 relu fp16 12 +3 gpu conv fp16 12 add fp16 12 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 263 add fp16 12 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 12 relu fp16 12 +11 gpu conv perf_fp16 154 add fp16 12 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 12 relu fp16 12 +15 gpu conv fp16 12 add fp16 12 +16 gpu conv fp16 11 add fp16 12 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 +20 gpu conv perf_fp16 151 add fp16 12 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 157 add fp16 12 relu fp16 12 +24 gpu conv samp_fp16 268 add fp16 12 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +28 gpu conv fp16 12 add fp16 12 +29 gpu conv perf_fp16 154 add fp16 12 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 12 relu fp16 12 +33 gpu conv fp16 12 add fp16 12 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +37 gpu conv samp_fp16 262 add fp16 12 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 12 +42 gpu softmax fp16 12 +----- diff --git a/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/vgg16_cifar10/vgg16_cifar10_valid_soc.txt b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/vgg16_cifar10/vgg16_cifar10_valid_soc.txt new file mode 100644 index 0000000000000000000000000000000000000000..789f4e21cf4a778535d1df0f9f7be22c1415d672 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/vgg16_cifar10/vgg16_cifar10_valid_soc.txt @@ -0,0 +1,1027 @@ +3994.0731450000017 ++++++ +conf1 1 1 89.22 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 +16 gpu softmax fp32 1 +----- ++++++ +conf2 2.3049904288987464 1.6887800235455193 89.14 0.0799999999999983 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv fp16 11 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf3 2.357615734902983 1.7226289827534114 89.14 0.0799999999999983 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf4 2.3831343547359976 1.7374446557158316 88.84 0.37999999999999545 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 162 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf5 2.3696393667573616 1.7284732038695636 88.8 0.4200000000000017 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 162 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 265 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf6 2.4444787116056292 1.7833916898567774 88.58 0.6400000000000006 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf7 2.40209759505425 1.7661661942711917 88.58 0.6400000000000006 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf8 2.528892013058046 1.8332619869789675 88.08 1.1400000000000006 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf9 2.5283008295291105 1.8324605771289624 88.06 1.1599999999999966 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf10 2.5562616043247313 1.847605117430125 88.03999999999999 1.1800000000000068 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf11 2.5337351216813757 1.836759334487813 88.03999999999999 1.1800000000000068 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf12 2.556171297969468 1.8482604143790797 88.03999999999999 1.1800000000000068 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf13 2.5562385363337343 1.8481145682015834 88.03999999999999 1.1800000000000068 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf14 2.556612910921585 1.8486422226408725 88.03999999999999 1.1800000000000068 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf15 2.5419253262471346 1.8395765136023223 88.02 1.2000000000000028 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 263 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf16 2.4937721600323406 1.8116328904640306 88.0 1.2199999999999989 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv perf_fp16 162 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf17 2.5545877208248187 1.8465313171321942 88.0 1.2199999999999989 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf18 2.528537397828869 1.8330988121074523 88.0 1.2199999999999989 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf19 2.531670576114998 1.8357132731685366 88.0 1.2199999999999989 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf20 2.5294693760803577 1.8335105878862015 87.98 1.2399999999999949 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 268 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf21 2.5582293136941723 1.8476583031165972 87.98 1.2399999999999949 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 156 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf22 2.556327374925176 1.8481587827658859 87.98 1.2399999999999949 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf23 2.557806470696261 1.8492020211230846 87.98 1.2399999999999949 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf24 2.5545697480449 1.8464092920718178 87.96000000000001 1.259999999999991 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 267 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf25 2.528206406642683 1.832658178797549 87.96000000000001 1.259999999999991 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf26 2.556533707152568 1.8484262997816934 87.96000000000001 1.259999999999991 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf27 2.5393059900815325 1.837123626585959 87.94 1.2800000000000011 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 265 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf28 2.5486219361262235 1.845481069177171 87.94 1.2800000000000011 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf29 2.5485321687357825 1.8461348600374907 87.94 1.2800000000000011 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf30 2.5657339222733015 1.8517901869245543 87.92 1.2999999999999972 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 263 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf31 2.581139532058275 1.860666047394923 87.92 1.2999999999999972 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf32 2.5098654459068945 1.8297655130336108 87.92 1.2999999999999972 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf33 2.528587182046725 1.8312521826965082 87.9 1.3199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 156 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf34 2.517311952294846 1.8204468250382393 87.9 1.3199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf35 2.517311952294846 1.8204468250382393 87.9 1.3199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf36 2.517311952294846 1.8204468250382393 87.9 1.3199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf37 2.517311952294846 1.8204468250382393 87.9 1.3199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf38 2.5346932948358267 1.8376287813464989 87.9 1.3199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 265 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf39 2.4914548049246 1.8095620501702707 87.86 1.3599999999999994 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv perf_fp16 162 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 268 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf40 2.5809312104420865 1.8607657818447936 87.86 1.3599999999999994 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf41 2.5120056276901925 1.824277681148882 87.83999999999999 1.3800000000000097 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 268 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf42 2.556168516896762 1.849243225747987 87.83999999999999 1.3800000000000097 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf43 2.512713457130698 1.8053797549107755 87.82 1.4000000000000057 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf44 2.509447559327321 1.8294109824358684 87.82 1.4000000000000057 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf45 2.532043246184595 1.8347717424454622 87.74 1.480000000000004 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv samp_fp16 265 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf46 2.4911011329750212 1.795311376068545 87.68 1.539999999999992 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 153 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf47 2.549746515565958 1.8283676275816687 87.66000000000001 1.559999999999988 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv fp16 12 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +9 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf48 2.51145215830771 1.8254971754777813 87.64 1.5799999999999983 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf49 2.513356522647888 1.826263067419964 87.58 1.6400000000000006 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf50 2.513356522647888 1.826263067419964 87.53999999999999 1.6800000000000068 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf51 2.4881677905203494 1.8127135485543127 87.4 1.8199999999999932 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf52 2.51145215830771 1.8254971754777813 87.36 1.8599999999999994 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf53 2.4757784613808234 1.7991027289904775 87.26 1.9599999999999937 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf54 2.5913526715019284 1.8695479088125426 87.24 1.980000000000004 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- diff --git a/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/vgg16_cifar100/vgg16_cifar100_valid_soc.txt b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/vgg16_cifar100/vgg16_cifar100_valid_soc.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef6509b99bee287bf0e3dfbaa035d51f9e3cb0ea --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/PPoPP_results/runtime_experiments/vgg16_cifar100/vgg16_cifar100_valid_soc.txt @@ -0,0 +1,210 @@ +3845.438677999999 ++++++ +conf1 1 1 68.42 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 +16 gpu softmax fp32 1 +----- ++++++ +conf2 2.4361074671227554 1.7555866253938424 67.22 1.2000000000000028 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv fp16 11 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 264 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf3 2.602684148359414 1.8286503060252126 67.10000000000001 1.3199999999999932 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 156 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv fp16 11 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf4 2.661880095451371 1.886369953641946 67.06 1.3599999999999994 +1 gpu conv fp16 12 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 156 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf5 2.5990656605003855 1.8588553950032938 66.84 1.5799999999999983 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 163 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf6 2.5884968081531485 1.8594972115815722 66.8 1.6200000000000045 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 165 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf7 2.4323231936537972 1.8028228076034056 66.8 1.6200000000000045 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf8 2.575472326184571 1.8375078883357683 66.72 1.7000000000000028 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv perf_fp16 161 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv fp16 11 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf9 2.4912510106198957 1.848807665058795 66.58 1.8400000000000034 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 266 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf10 2.4323231936537972 1.8028228076034056 66.53999999999999 1.8800000000000097 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 152 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +----- ++++++ +conf11 2.4027045398540046 1.7853827712848849 66.47999999999999 1.940000000000012 +1 gpu conv fp16 11 add fp16 12 relu fp16 12 +2 gpu conv samp_fp16 269 add fp16 12 relu fp16 12 pool_max fp16 12 +3 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +4 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +5 gpu conv fp16 12 add fp16 12 relu fp16 12 +6 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 +7 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +8 gpu conv perf_fp16 155 add fp16 12 relu fp16 12 +9 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 +10 gpu conv samp_fp16 262 add fp16 12 relu fp16 12 pool_max fp16 12 +11 gpu conv perf_fp16 160 add fp16 12 relu fp16 12 +12 gpu conv perf_fp16 151 add fp16 12 relu fp16 12 +13 gpu conv samp_fp16 261 add fp16 12 relu fp16 12 pool_max fp16 12 +14 gpu mul fp16 12 add fp16 12 relu fp16 12 +15 gpu mul fp16 12 add fp16 12 +16 gpu softmax fp16 12 +-----