diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
index 8f08ad9e263db5060373bb7450c185966dbea31f..1b770736bab93dd6a47cb4351dd0ad054e8eb14d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu
@@ -169,19 +169,13 @@ __global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ outp
   if(n < N) { //is thread id within bounds?
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
-        //const int ki = c * KH * KW + i;
-        //const int kj = c * KH * KW + j;
+
        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        if((filter_elem_num - skip_offset) % skip_every) {
          const int condition = (filter_elem_num < skip_offset);
          const int output_col = condition * filter_elem_num
                               + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
-                              - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-         //if(filter_elem_num % skip_every != skip_offset) {
-         //  int output_col = filter_elem_num -
-         //    (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-         //if(skip_every == 1)
-         //  output_col = filter_elem_num;
+                              - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
          const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
          //((output_col*N + n) * H_out + h) * W_out + w;
          if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
@@ -213,22 +207,16 @@ __global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ out
   if(n < N) { //is thread id within bounds?
     for(int i = 0; i < KH; i++) {
       for(int j = 0; j < KW; j++) {
-        //const int ki = c * KH * KW + i;
-        //const int kj = c * KH * KW + j;
-        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
+
+        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        if((filter_elem_num - skip_offset) % skip_every) {
          const int condition = (filter_elem_num < skip_offset);
          const int output_col = condition * filter_elem_num
                               + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
                               - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
-         //if(filter_elem_num % skip_every != skip_offset) {
-         //  int output_col = filter_elem_num -
-         //    (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
-         //if(skip_every == 1)
-         //  output_col = filter_elem_num;
+
          const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
-         //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
-         //((output_col*N + n) * H_out + h) * W_out + w
+
          if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
            output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
          else
@@ -288,15 +276,13 @@ __global__ void convToGemmPerfRow(float * const __restrict__ output,
   }
   const int inH = h_index * V_stride - V_pad;
   const int inW = w * H_stride - H_pad; //input width index (col number)
-  //#pragma unroll
-  //for (int ki = 0; ki < KH * KW; ki++) {
-  //  int i = ki / KW;
-  //  int j = ki % KW;
+
   for(int i = 0; i < KH; i++) {
     for(int j = 0; j < KW; j++) {
      const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
-     const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-     if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+     const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
+
+     if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
      else
        output[out_index] = 0;
@@ -357,11 +343,7 @@ __global__ void convToGemmPerfCol(float * const __restrict__ output,
   }
   const int inW = w_index * H_stride - H_pad;
   const int inH = h * V_stride - V_pad; //input height index (row number)
-  //#pragma unroll
-  //for (int ki = 0; ki < KH * KW; ki++) {
-  //  int i = ki / KW;
-  //  int j = ki % KW;
-
+
   for(int i = 0; i < KH; i++) {
     for(int j = 0; j < KW; j++) {
      const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
@@ -427,11 +409,8 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
   }
   const int inH = h_index * V_stride - V_pad;
   const int inW = w * H_stride - H_pad; //input width index (col number)
-  // #pragma unroll
-  //for (int ki = 0; ki < KH * KW; ki++) {
-  //  int i = ki / KW;
-  //  int j = ki % KW;
-
+
+
   for(int i = 0; i < KH; i++) {
     for(int j = 0; j < KW; j++) {
      const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
@@ -465,38 +444,31 @@ __global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
   }
   const int inH = h_index * V_stride - V_pad;
   const int inW = w * H_stride - H_pad; //input width index (col number)
-  // #pragma unroll
-  //for (int ki = 0; ki < KH * KW; ki++) {
-  //  int i = ki / KW;
-  //  int j = ki % KW;
-  for(int i = 0; i < KH; i++) {
-    for(int j = 0; j < KW; j++) {
-      const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
-      const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
-      //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
-      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-      else
-        output[out_index] = 0;
-    }
-  }
+
+
+  for(int i = 0; i < KH; i++) {
+    for(int j = 0; j < KW; j++) {
+      const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
+      const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
+
+      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
+        output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
+      else
+        output[out_index] = 0;
+
+    }
+  }
+  }
 }

 __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
                                          int w, __half *old_data, __half *new_data, int x, int start) {
-  //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int n = tx / (c * h * w); //output image number
-  //const int stride = blockDim.x * gridDim.x;
+
   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
   const int n = tx / (c * h * w); //output image number

   if(n < N) {
-    //for(int i = index; i < N; i += stride){
-    //const int col = ((i % (c * h * w)) % (h * w)) % w;
-    //const int row = ((i % (c * h * w)) % (h * w)) / w;
-    //const int ch = (i % (c * h * w)) / (h * w);
-    // const int n = i / (c * h * w);

    const int ch = tx % (c * h * w) / (h * w); //filter number
    const int row = tx % (h * w) / w; //output height index (row number)
@@ -527,17 +499,9 @@ __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,

 __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
                                           int w, __half *old_data, __half *new_data, int x, int start) {
-  //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int n = tx / (c * h * w); //output image numbe
-  //const int stride = blockDim.x * gridDim.x;
+
   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
   const int n = tx / (c * h * w); //output image number

   if(n < N) {
-    //for(int i = index; i < N; i += stride){
-    //const int col = ((i % (c * h * w)) % (h * w)) % w;
-    //const int row = ((i % (c * h * w)) % (h * w)) / w;
-    //const int ch = (i % (c * h * w)) / (h * w);
-    //const int n = i / (c * h * w);

    const int ch = tx % (c * h * w) / (h * w); //filter number
    const int row = tx % (h * w) / w; //output height index (row number)
@@ -554,13 +518,11 @@ __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
    } else if((row - start) % x == 0) {
      const int row_index = row - ((row + 1 - start) / x);
      const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-     //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
    } else {
      const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
      const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
-     //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
    }
  }
@@ -587,11 +549,7 @@ __global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
   }
   const int inW = w_index * H_stride - H_pad;
   const int inH = h * V_stride - V_pad; //input height index (row number)
-  //#pragma unroll
-  // for (int ki = 0; ki < KH * KW; ki++) {
-  //   int i = ki / KW;
-  //   int j = ki % KW;
-
+
   for(int i = 0; i < KH; i++) {
     for(int j = 0; j < KW; j++) {
      const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
@@ -626,10 +584,8 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
   }
   const int inW = w_index * H_stride - H_pad;
   const int inH = h * V_stride - V_pad; //input height index (row number)
-  //#pragma unroll
-  // for (int ki = 0; ki < KH * KW; ki++) {
-  //   int i = ki / KW;
-  //   int j = ki % KW;
+
+
   for(int i = 0; i < KH; i++) {
     for(int j = 0; j < KW; j++) {
      const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
@@ -647,15 +603,6 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,

 __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
                                          int w, __half *old_data, __half *new_data, int x, int start) {
-  //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int stride = blockDim.x * gridDim.x;
-
-  //for(int i = index; i < N; i += stride){
-  //  const int col = ((i % (c * h * w)) % (h * w)) % w;
-  //  const int row = ((i % (c * h * w)) % (h * w)) / w;
-  //  const int ch = (i % (c * h * w)) / (h * w);
-  //  const int n = i / (c * h * w);
-
   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
   const int n = tx / (c * h * w); //output image number
   if(n < N) {
@@ -688,14 +635,6 @@ __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,

 __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
                                           int w, __half *old_data, __half *new_data, int x, int start) {
-  //const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int stride = blockDim.x * gridDim.x;
-
-  // for(int i = index; i < N; i += stride){
-  //   const int col = ((i % (c * h * w)) % (h * w)) % w;
-  //   const int row = ((i % (c * h * w)) % (h * w)) / w;
-  //   const int ch = (i % (c * h * w)) / (h * w);
-  //   const int n = i / (c * h * w);
   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
   const int n = tx / (c * h * w); //output image number
   if(n < N) {
@@ -705,25 +644,23 @@ __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
    if(col < start) {
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
-     //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
+
    } else if(col == w - 1) {
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
-     //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
+
    } else if (col == 0) {
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
-     //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
+
    } else if((col - start) % x == 0) {
      const int col_index = col - ((col + 1 - start) / x);
      const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-     //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
        __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
    } else {
      const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
      const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
-     //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
      new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
    }
  }
@@ -759,6 +696,7 @@ __global__ void convToGemmFullInputRegular(float * const __restrict__ output,
        in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
                   + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
      }
+
      const int i = (in_index % (KW * KH)) / KW;
      const int j = in_index % KW;
      const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
@@ -809,13 +747,15 @@ __global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
      }
    }
  }
+
+
 }

 __global__ void createReducedFiltersFullRegular(float * output,
-        const float * const __restrict input, const int NF,
-        const int num_filter_elem, const int reduced_filter_elem,
-        const int channels,
-        const int skip_every, const int skip_offset, const float fac) {
+                const float * const __restrict input, const int NF,
+                const int num_filter_elem, const int reduced_filter_elem,
+                const int channels,
+                const int skip_every, const int skip_offset, const float fac) {

   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
   const int fIdx = tx / reduced_filter_elem; //filter index
@@ -826,11 +766,13 @@ __global__ void createReducedFiltersFullRegular(float * output,
    int in_index;
    if(offset < channel_offset) {
      in_index = offset;
-   } else {
+   }
+   else {
      in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
                 + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-   }
-   output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
+   }
+
+   output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
  }
 }
@@ -873,30 +815,23 @@ __global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
    const int inH = h * V_stride - V_pad; //input height index (row number)
    const int inW = w * H_stride - H_pad; //input width index (col number)
-   #pragma unroll
-   //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-   //const int ch = (fi * C) / reduced_filter_elem;
+   #pragma unroll
    for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-     const int fi = ch * (reduced_filter_elem / C) + ki;
-     const int offset = (skip_offset + ch) % skip_every;
-     //int in_index;
+     const int fi = ch * (reduced_filter_elem / C) + ki;
+     const int offset = (skip_offset + ch) % skip_every;
+
      const bool condition = (fi < offset);
      const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
                           + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-     //if(fi < offset) {
-     //  in_index = fi;
-     //} else {
-     //  in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-     //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-     // }
-     const int i = (in_index % (KW * KH)) / KW;
-     const int j = in_index % KW;
-     const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
-     if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
-       output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-     } else {
+
+     const int i = (in_index % (KW * KH)) / KW;
+     const int j = in_index % KW;
+     const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
+     if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
+       output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
+     } else {
        output[out_index] = 0;
-     }
+     }
    }
  }
 }
@@ -922,26 +857,20 @@ __global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
    #pragma unroll
    for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
-     const int fi = ch * (reduced_filter_elem / C) + ki;
-     //for(int fi = 0; fi < reduced_filter_elem; fi++) {
-     //  const int ch = (fi * C) / reduced_filter_elem;
+
+     const int fi = ch * (reduced_filter_elem / C) + ki;
      const int offset = (skip_offset + ch) % skip_every;
      const int condition = (fi < offset);
-     const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
+     const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
                           + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
-     // int in_index;
-     //if(fi < offset) {
-     //  in_index = fi;
-     //} else {
-     //  in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
-     //             + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
-     // }
+
      const int i = (in_index % (KW * KH)) / KW;
      const int j = in_index % KW;
      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
        output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-     } else {
+     }
+     else {
        output[out_index] = 0;
      }
    }
@@ -971,20 +900,15 @@ __global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
      const int condition = (fi < skip_offset);
      const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
                           + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-     //int in_index;
-     //if(fi < skip_offset) {
-     //  in_index = fi;
-     //} else {
-     //  in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-     //             + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-     // }
-     const int ch = in_index / (KW * KH);
+
+     const int ch = in_index / (KW * KH);
      const int i = (in_index % (KW * KH)) / KW;
      const int j = in_index % KW;
      const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
        output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
-     } else {
+     }
+     else {
        output[out_index] = 0;
      }
    }
@@ -1013,18 +937,11 @@ __global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output
      const int condition = (fi < skip_offset);
      const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
                           + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-     // int in_index;
-     // if(fi < skip_offset) {
-     //   in_index = fi;
-     // } else {
-     //   in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
-     //              + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
-     // }
+
      const int ch = in_index / (KW * KH);
      const int i = (in_index % (KW * KH)) / KW;
      const int j = in_index % KW;
      const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
-     //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
      if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
        output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
      } else {
@@ -1042,11 +959,8 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
                const int skip_every, const int skip_offset, const float fac) {

   const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int stride = blockDim.x * gridDim.x;
-
-  //#pragma unroll
-  //for (int i = tx; i < NF; i += stride) {
-  const int fIdx = tx / reduced_filter_elem; //filter index
+
+  const int fIdx = tx / reduced_filter_elem; //filter index
   if(fIdx < NF) {
    const int offset = tx % reduced_filter_elem; //offset within filter
    const int ch = (offset * channels) / reduced_filter_elem;
@@ -1055,15 +969,9 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
    const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
                         + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
-   // int in_index;
-   // if(offset < channel_offset) {
-   //   in_index = offset;
-   //} else {
-   //   in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
-   //              + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
-   // }
    output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
  }
+
 }

 __global__ void createReducedFiltersHalfIrregular(__half * output,
@@ -1071,21 +979,20 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
                const int num_filter_elem, const int reduced_filter_elem,
                const int skip_every, const int skip_offset, const float fac) {

-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
-  //const int stride = blockDim.x * gridDim.x;
-  //#pragma unroll
-  //for (int i = tx; i < NF; i += stride) {
+  const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
+  const int fIdx = tx / reduced_filter_elem; //filter index

-  const int fIdx = tx / reduced_filter_elem; //filter index
-  if(fIdx < NF) {
-    const int offset = tx % reduced_filter_elem; //offset within filter
-    const int condition = (offset < skip_offset);
-    int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
+  if(fIdx < NF) {
+
+    const int offset = tx % reduced_filter_elem; //offset within filter
+    const int condition = (offset < skip_offset);
+
+    int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
                   + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
-    //}
-    output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
-    //}
+
+    output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
  }
+
 }
@@ -1112,7 +1019,7 @@ __global__ void convToGemmApprox(float * const __restrict__ output,
      for(int j = 0; j < KW; j++) {
        const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
        if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
-         const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account
+         const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account
          if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
            output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
          else
@@ -1130,8 +1037,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
                         int horizontal_stride, int conv_mode,
                         int conv_groups, int row, int col, int start){
-  //////INFO("*** TensorConvolution (output perforation) \n");
-  //Event("Conv");
   Tensor* input = (Tensor*)input_ptr;
   Tensor* filter = (Tensor*)filter_ptr;
   //FIXME: Current hack to preserve backward compatibilty
@@ -1144,10 +1049,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);

-  //Event("H2F_start");
   convertToFP32(input);
   convertToFP32(filter);
-  //Event("H2F_end");

   long int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
@@ -1221,14 +1124,14 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
    freeTensor(output);
    cudaFree(convData);
-  } else if(col > 1){
+  }
+  else if(col > 1){
    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
                                     CUDNN_TENSOR_NCHW, n, c, h, w_eff);

    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output, DEVICE);
-   // NOTE: Necessary to insert the above call for every output tensor
-   //total number of filter elem
+
    const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];

    float * convData;
@@ -1550,7 +1453,8 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
    cudaFree(convData);
    cudaFree(reducedFilter);
  } else {
-   INFO("FP32 BASELINE\n");
+
+   //INFO("FP32 BASELINE\n");
    Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
                                             CUDNN_TENSOR_NCHW, n, c, h, w);
    changeTensorPlacement(output, DEVICE);
@@ -1996,14 +1900,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
    freeTensor(output);
    cudaFree(convData);
  }
-// INFO("CONV DONE\n");
+
  profileEvent("H2F_start");
  convertToFP32_offline(new_output);
- //convertToFP32(input);
- //convertToFP32(filter);
+
  profileEvent("H2F_end");
- //profileEvent("#Conv_end");
- //INFO("CONVOLUTION END\n");
+
  return new_output;
 }