diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2fc2c9493453f55cb83094373b19a24b59135d4 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt @@ -0,0 +1,6 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..063ba473d6a7fa57d7572c86dde9beac0932163d --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt @@ -0,0 +1,7 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..84103fc103c4e1e4e7cdeec40bc61e2ea4852052 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt @@ -0,0 +1,14 @@ +fp32,11 -1 1.0 tensorConvolution tensorConvolution +fp16,12 -1 1.5 tensorConvolution tensorHalfConvolution +perf,21 1,2,0 2.25 tensorConvolution tensorConvPerfCuda +perf,22 1,2,1 2.25 tensorConvolution tensorConvPerfCuda +perf,23 1,3,0 1.88 tensorConvolution tensorConvPerfCuda +perf,24 1,3,1 1.88 tensorConvolution tensorConvPerfCuda +perf,25 2,1,0 2.25 tensorConvolution tensorConvPerfCuda +perf,26 2,1,1 2.25 tensorConvolution tensorConvPerfCuda +perf,27 3,1,0 1.88 tensorConvolution tensorConvPerfCuda +perf,28 3,1,1 1.88 tensorConvolution tensorConvPerfCuda +samp,31 2,0 2.25 tensorConvolution tensorConvSampSim +samp,32 2,1 2.25 tensorConvolution tensorConvSampSim +samp,33 4,0 1.8 tensorConvolution tensorConvSampSim +samp,34 4,1 1.8 tensorConvolution tensorConvSampSim diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..be1ce58c95981535ec94a7f8badffe967cfed586 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt @@ -0,0 +1,4 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..6719acb97a58bd7f3d9fbe428f755e13df98b3d0 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt @@ -0,0 +1,15 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..719d96e48168a477d6edfee1a02b80b554612ec7 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt @@ -0,0 +1,8 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7ff033cec2b85390ce6c7667fbbb04837a7eaf9 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt @@ -0,0 +1,22 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt @@ -0,0 +1,15 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt @@ -0,0 +1,15 @@ +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12,21,22,23,24,25,26,27,28,31,32,33,34 +11,12 +11,12 diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h index d5a1e903f644c4d27477bac4d8587fb177b58021..66070f3058d840e4dbe25919e33aa8abc060b330 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h @@ -16,8 +16,7 @@ #include "op_overheads.h" #include "half_precision_api.h" #include "approx_techniques2.h" - - +#include <unordered_map> @@ -240,8 +239,6 @@ void sampleFilterElems(int N, //int local_index = row * w + col; int local_index = (ch * (h * w)) + (row * w) + col; - - //data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 1.0; if(local_index % skip_elem == skip_offset) data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0; @@ -306,6 +303,10 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); + convertToFP32(input); + convertToFP32(filter); + + // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling sampleFilter(filter, skip_rate, skip_offset); @@ -344,7 +345,7 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr, DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); Tensor* output; - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, + output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, CUDNN_TENSOR_NCHW, n, c, h, w); @@ -608,26 +609,58 @@ void* PROMISE_FC(void* input, float i_min, float i_max, // NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper //#define OLD_MODEL - #ifndef OLD_MODEL -/***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */ -void* ConvLayer_PROMISE(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ +bool isPromise(int swing){ + if(swing < 8) + return true; + else + return false; +} + + +bool isFullPrecision(int swing){ + + if(swing == 11) + return true; + else + return false; +} + + + +bool isHalfPrecision(int swing){ + + if(swing == 12) + return true; + else + return false; +} + + +bool isPerforation(int swing){ + + if(swing >= 21 && swing <= 29) + return true; + else + return false; +} + + +bool isSampling(int swing){ + + if(swing >= 31 && swing <= 39) + return true; + else + return false; +} + + +int getSwing(int swing){ - if(ONLINE_PROFILING){ - ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n"); - } - #ifdef PROMISE_TUNER_ENABLED // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime @@ -640,14 +673,142 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, swing = op_accuracies[op_counter]; op_counter++; } - + #endif + DEBUG("---- swing_value = %d \n", swing); + return swing; +} + + + + + + +class PerfParams{ + + public: + int row; + int col; + int skip_offset; + + PerfParams(){ + row = 1; + col = 1; + skip_offset = 0; + } - DEBUG("---- swing_value = %d \n", swing); + PerfParams(int row1, int col1, int skip_offset1){ + row = row1; + col = col1; + skip_offset = skip_offset1; + } + +}; - if(swing < 8){ + + +PerfParams getPerfParams(int swing){ + + std::map<int, PerfParams> perf_knob_map; + + PerfParams params21(1, 2, 0); + perf_knob_map[21] = params21; + + PerfParams params22(1, 2, 1); + perf_knob_map[22] = params22; + + PerfParams params23(1, 3, 0); + perf_knob_map[23] = params23; + + PerfParams params24(1, 3, 1); + perf_knob_map[24] = params24; + + PerfParams params25(2, 1, 0); + perf_knob_map[25] = params25; + + PerfParams params26(2, 1, 1); + perf_knob_map[26] = params26; + + PerfParams params27(3, 1, 0); + perf_knob_map[27] = params27; + + PerfParams params28(3, 1, 1); + perf_knob_map[28] = params28; + + + return perf_knob_map[swing]; + +} + + + + +class SampParams{ + + public: + int skip_rate; + int skip_offset; + + SampParams(){ + skip_rate = 1; + skip_offset = 0; + } + + SampParams(int skip_rate1, int skip_offset1){ + skip_rate = skip_rate1; + skip_offset = skip_offset1; + } + +}; + + + +SampParams getSampParams(int swing){ + + std::map<int, SampParams> samp_knob_map; + + SampParams params31(2, 0); + samp_knob_map[31] = params31; + + SampParams params32(2, 1); + samp_knob_map[32] = params32; + + SampParams params33(4, 0); + samp_knob_map[33] = params33; + + SampParams params34(4, 1); + samp_knob_map[34] = params34; + + return samp_knob_map[swing]; + +} + + + + + + +/***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */ + +void* ConvLayer_PROMISE(void* input, float i_min, float i_max, + void* filter, float w_min, float w_max, + void* bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing){ + + if(ONLINE_PROFILING){ + ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n"); + } + + + swing = getSwing(swing); + + if(isPromise(swing)){ return PROMISE_Conv(input, i_min, i_max, filter, w_min, w_max, @@ -662,47 +823,52 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, void* conv_out; - if(swing >= 21 && swing <= 29){ + if(isPerforation(swing)){ - int rows = 2; - int cols = 1; + PerfParams params = getPerfParams(swing); + DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n", + params.row, params.col, params.skip_offset); - switch(swing){ - - // Check if col/row == 1 in baseline? - case 21: rows = 2; cols = 1; break; - case 22: rows = 2; cols = 1; break; - case 23: rows = 1; cols = 2; break; - case 24: rows = 1; cols = 2; break; - //case 25: rows = 2; cols = 2; break; - //case 26: rows = 2; cols = 2; break; - //default: rows = 2; cols = 2; break; - } + conv_out = tensorConvPerfCuda(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.row, params.col, params.skip_offset); + } - // FIXIT: See if start is from 0 or 1? - int start = (swing % 2) + 1; - conv_out = tensorConvPerfCuda(input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - rows, cols, start); + if(isSampling(swing)){ + + SampParams params = getSampParams(swing); + DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", + params.skip_rate, params.skip_offset); + + conv_out = tensorConvSampSim(input, filter, + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.skip_rate, params.skip_offset); } - else if(swing == 12){ + + + if (isHalfPrecision(swing)){ + conv_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 0); } - else{ + + if (isFullPrecision(swing)){ conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, 0); } + void* conv_add; if(bias != NULL){ - if(swing >= 12){ + if( !isFullPrecision(swing) ){ conv_add = tensorHalfAdd(conv_out, bias); } else{ @@ -714,10 +880,9 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, } void* pool_out; - // NOTE: Skip pooling on negative pool sizes if(pool_size > 0){ //FIXME: Currently only using MaxPooling - pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, + pool_out = tensorHalfPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size); } else{ @@ -731,13 +896,13 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, INFO("NO Activation Function \n"); break; case 0: - activation_out = tensorTanh(pool_out); + activation_out = tensorHalfTanh(pool_out); break; case 1: - activation_out = tensorRelu(pool_out); + activation_out = tensorHalfRelu(pool_out); break; case 2: - activation_out = tensorRelu2(pool_out, out_min, out_max); + activation_out = tensorHalfRelu2(pool_out, out_min, out_max); break; default: ERROR("Activation id %d NOT supported \n", activation_out); @@ -756,25 +921,9 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu' + swing = getSwing(swing); - #ifdef PROMISE_TUNER_ENABLED - - // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime - if(!approxhpvm_runtime_mode){ - - if(op_counter >= total_ops){ - ERROR("No accuracy flag found \n"); - } - - swing = op_accuracies[op_counter]; - op_counter++; - } - - #endif - - - - if(swing < 8){ + if(isPromise(swing)){ return PROMISE_FC(input, i_min, i_max, weights, w_min, w_max, @@ -786,7 +935,7 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, void* gemm_out; - if(swing >= 12){ + if(!isFullPrecision(swing)){ gemm_out = tensorHalfGemm(input, weights); } else{ @@ -797,7 +946,7 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, void* gemmbias_out; if(bias != NULL){ // Swing 8 corresponds to FP32 - if(swing >= 12){ + if(!isFullPrecision(swing)){ gemmbias_out = tensorHalfAdd(gemm_out, bias); } else{ @@ -816,13 +965,13 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, INFO("No Activation Function \n"); break; case 0: - activation_out = tensorTanh(gemmbias_out); + activation_out = tensorHalfTanh(gemmbias_out); break; case 1: - activation_out = tensorRelu(gemmbias_out); + activation_out = tensorHalfRelu(gemmbias_out); break; case 2: - activation_out = tensorRelu2(gemmbias_out, out_min, out_max); + activation_out = tensorHalfRelu2(gemmbias_out, out_min, out_max); break; default: ERROR("Activation id %d NOT supported \n", activation_out); @@ -840,8 +989,20 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, #ifdef OLD_MODEL +#endif + +#endif + + + /************* NOTE: Outdated PROMISE routines - Used for Comparison ****/ + + + +/* + + void* ConvLayer_PROMISE(void* input, float i_min, float i_max, void* filter, float w_min, float w_max, @@ -916,6 +1077,7 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); */ +/*------------- } else if(swing == 9 || (swing >= 16 && swing <= 19) ){ //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w, @@ -948,6 +1110,8 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max, DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); */ +/*----- + } else if(swing == 10){ conv_out = tensorHalfConvolution(input, filter, @@ -1146,3 +1310,21 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, + + /*#ifdef PROMISE_TUNER_ENABLED + + // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime + if(!approxhpvm_runtime_mode){ + + if(op_counter >= total_ops){ + ERROR("No accuracy flag found \n"); + } + + swing = op_accuracies[op_counter]; + op_counter++; + } + + #endif + + */ + diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h index af0ed1e202017dde2cb96e9f8798aff1219c0695..9689c6fce91d3a4093d91b5006ef1beee969f8eb 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h @@ -819,7 +819,7 @@ __global__ void depthwise_conv4_half3(__half* const __restrict__ y, __half t1; - int total = C_dim * H_dim * W_dim; + //int total = C_dim * H_dim * W_dim; t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim + start_w + q - bstart_w]; @@ -920,7 +920,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups){ - llvm_hpvm_initTensorRt(0); INFO("*** TensorConvolution \n"); profileEvent("Conv"); @@ -935,7 +934,13 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, Tensor* output; - + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + convertToFP32(input); + convertToFP32(filter); + + if (conv_groups > 32) { // TODO: Support other cases; hostToDeviceCopy(input); @@ -949,7 +954,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device @@ -957,33 +962,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, // NOTE: Necessary to insert the above call for every output tensor - /* - if (c > 255) { - dim3 grid((n / 16), c); - dim3 block(h * w); - depthwise_conv << <grid, block >> > ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - - }*/ - - /* - dim3 grid((n / 12), c); - dim3 block(h * w); - depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - if(n % 12 > 0){ - dim3 grid2((n % 12), c); - dim3 block(h * w); - depthwise_conv <<<grid, block >>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12)); - } - */ int blockSize; blockSize = 64; @@ -994,7 +972,8 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, (float*)input->gpu_data, (float*)filter->gpu_data, input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); } else { @@ -1043,11 +1022,11 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); if (input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NHWC, n, h, w, c); } else @@ -1137,6 +1116,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, } +// FIXME: Need to properly fix the new HALF type conversion void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, @@ -1165,6 +1145,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); + convertToFP16(input); + convertToFP16(filter); + /***** CONVERSIONS from FP32 to FP16 - on the GPU */ size_t* input_dims = input->dims.dim_sizes; @@ -1209,7 +1192,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, + output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // FIXIT: more checks for data types needed output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, @@ -1797,7 +1780,7 @@ void* tensorConvPerf(void* input_ptr, void* filter_ptr, Tensor* new_output; if(input->data_format == CUDNN_TENSOR_NCHW) - new_output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); else if(input->data_format == CUDNN_TENSOR_NHWC){ DEBUG("* NHWC Format \n"); @@ -2078,3 +2061,32 @@ void* tensorConvolutionKernelSamp(void* input_ptr, void* filter_ptr, #endif return output; } + + + /* + if (c > 255) { + dim3 grid((n / 16), c); + dim3 block(h * w); + depthwise_conv << <grid, block >> > ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + + }*/ + + /* + dim3 grid((n / 12), c); + dim3 block(h * w); + depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + if(n % 12 > 0){ + dim3 grid2((n % 12), c); + dim3 block(h * w); + depthwise_conv <<<grid, block >>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12)); + } + */ diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h index f66b80d7287611c44151eb5fa02b4fed3e7d4e7b..a81ffe296233178126555bbb53babdcd4192a7bf 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -1,4 +1,6 @@ +#include "tensor_utils.cu" + //This skips every xth row @@ -173,10 +175,17 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, if (conv_groups == 0) { conv_groups = 1; } + Tensor* output; // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); + + + convertToFP32(input); + convertToFP32(filter); + + int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; c = filter->dims.dim_sizes[0]; //number of filters @@ -196,7 +205,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, Tensor *new_output; if(row > 1){ - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device @@ -231,7 +240,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); - new_output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); @@ -239,7 +248,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, //interpolate int numBlocks = (n * c * h * w + 127) / 128; approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, - (float *)output->gpu_data, (float *)new_output->gpu_data, + (float *) output->gpu_data, (float *) new_output->gpu_data, row, start); cudaDeviceSynchronize(); @@ -247,7 +256,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, cudaFree(convData); } else if(col > 1){ - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device @@ -282,7 +292,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); - new_output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); @@ -298,7 +308,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, cudaFree(convData); } else{ - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device @@ -336,19 +346,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, profileEvent("Conv_end", true); - #ifdef ERROR_INJECTION_ENABLED - if (op_counter >= total_ops) { - ERROR("No accuracy flag found \n"); - } - int op_acc = op_accuracies[op_counter]; - // Skip errorInjection if explicitly requested - if (skip_tensors.find(op_counter) != skip_tensors.end()) { - op_acc = 0; - } - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorConv", op_acc); - add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); - op_counter++; - #endif + + return new_output; }