diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu index ed59139f71043d03591f78c2e5d682e580b6264a..4f51e38205315686d391eab113a13f4be5006e10 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu @@ -63,25 +63,29 @@ void convToGemmPerfRow(float * const __restrict__ output, const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) const int w = tx % W_out; //output width index (col number) - int past_start = (h % (x - 1) >= (x - 1 - start)); - const int inH = (h / (x - 1) * x + h % (x-1) + - past_start) * V_stride - V_pad; //input height index (row number) + + int h_index; + if(h < start) { + h_index = h; + } else { + h_index = ((h - start + 1) * x) / (x - 1) + ((h - start + 1) * x) % (x - 1) + start - 1; + } + const int inH = h_index * V_stride - V_pad; const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? for(int i = 0; i < KH; i++) { for(int j = 0; j < KW; j++) { const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w] = 0; + output[out_index] = 0; } } } - } @@ -89,40 +93,88 @@ void convToGemmPerfRow(float * const __restrict__ output, //Interpolates every xth row starting from x - 1 - start //N is total number of elements in final output array __global__ -void approxInterpolateRow(int N, int old_h, int n, int c, int h, int w, +void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w, float *old_data, float *new_data, int x, int start){ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - int past_start = ((row % x) >= (x - 1 - start)); - - if(row == h-1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - else if (row == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - else if(row % x == x - 1 - start){ - int past_startO = ((row - 1) % x) > (x - 1 - start); - int oldIdx1 = n * (c * old_h * w) + ch * (old_h * w) + - ((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col; - - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[oldIdx1] + old_data[oldIdx1 + 1 * w]) / 2; + //int index = blockIdx.x * blockDim.x + threadIdx.x; + // int stride = blockDim.x * gridDim.x; + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (c * h * w); //output image number + const int ch = tx % (c * h * w) / (h * w); //output chan number + const int row = tx % (h * w) / w; //output height index (row number) + const int col = tx % w; //output width index (col number) + + //for(int i = index; i < N; i += stride) { + // int col = ((i % (c * h * w)) % (h * w)) % w; + // int row = ((i % (c * h * w)) % (h * w)) / w; + //int ch = (i % (c * h * w)) / (h * w); + //int n = i / (c * h * w); + + if(row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if(row == h-1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - w]) / 2; + } else { + int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; } - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + - ((x-1) * (row / x) + row % x - past_start ) * (w) + col]; + //} +} - } +//For use in tensorConvPerfCuda +//Interpolates every xth row starting from x - 1 - start +//N is total number of elements in final output array +__global__ +void approxInterpolateRow2(int N, int old_h, int j, int c, int h, int w, + float *old_data, float *new_data, int x, int start){ + + // int index = blockIdx.x * blockDim.x + threadIdx.x; + // int stride = blockDim.x * gridDim.x; + // for(int i = index; i < N; i += stride) { + // int col = ((i % (c * h * w)) % (h * w)) % w; + // int row = ((i % (c * h * w)) % (h * w)) / w; + // int ch = (i % (c * h * w)) / (h * w); + // int n = i / (c * h * w); + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (c * h * w); //output image number + const int ch = tx % (c * h * w) / (h * w); //output chan number + const int row = tx % (h * w) / w; //output height index (row number) + const int col = tx % w; //output width index (col number) + + + if(row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if(row == h-1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - w]) / 2; + } else { + int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; + } + // } } @@ -141,10 +193,14 @@ void convToGemmPerfCol(float * const __restrict__ output, const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) const int w = tx % W_eff; //output width index (col number) - int past_start = (w % (x - 1)) >= (x - 1 - start); + int w_index; + if(w < start) { + w_index = w; + } else { + w_index = ((w - start + 1) * x) / (x - 1) + ((w - start + 1) * x) % (x - 1) + start - 1; + } + const int inW = w_index * H_stride; const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = (w / (x - 1) * x + w % (x-1) + - past_start) * H_stride - H_pad; //input width index (col number) if(n < N) { //is thread id within bounds? for(int i = 0; i < KH; i++) { for(int j = 0; j < KW; j++) { @@ -167,45 +223,46 @@ void convToGemmPerfCol(float * const __restrict__ output, //Interpolates every xth col starting from x - 1 - start //N is total number of elements in final output array __global__ -void approxInterpolateCol(int N, int old_w, int n, int c, int h, int w, +void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w, float *old_data, float *new_data, int x, int start){ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - int past_start = ((col % x) >= (x - 1 - start)); - - if(col == w-1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - else if (col == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - else if(col % x == x - 1 - start){ - int past_startO = ((col - 1) % x) > (x - 1 - start); - int oldIdx1 = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + - ((x-1) * ((col - 1) / x) + (col-1) % x - past_startO); - - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[oldIdx1] + old_data[oldIdx1 + 1]) / 2; + // int index = blockIdx.x * blockDim.x + threadIdx.x; +// int stride = blockDim.x * gridDim.x; + +// for(int i = index; i < N; i += stride){ + // int col = ((i % (c * h * w)) % (h * w)) % w; + // int row = ((i % (c * h * w)) % (h * w)) / w; + // int ch = (i % (c * h * w)) / (h * w); + // int n = i / (c * h * w); + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (c * h * w); //output image number + const int ch = tx % (c * h * w) / (h * w); //output chan number + const int row = tx % (h * w) / w; //output height index (row number) + const int col = tx % w; //output width index (col number) + + if(col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] + = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if(col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - 1]) / 2; + } else { + int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; } - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + - ((x-1) * (col / x) + col % x - past_start)]; - - } - + // } } - - __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, const __half * const __restrict input, @@ -233,18 +290,17 @@ void convToGemmPerfRowHalf(__half * const __restrict__ output, if(n < N) { //is thread id within bounds? for(int i = 0; i < KH; i++) { for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0; - + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + } else { + output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0; + } } } } - } @@ -260,36 +316,46 @@ void approxInterpolateRowHalf(int N, int x, int start){ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (c * h * w); //output image number + const int ch = tx % (c * h * w) / (h * w); //output chan number + const int row = tx % (h * w) / w; //output height index (row number) + const int col = tx % w; //output width index (col number) + + // int index = blockIdx.x * blockDim.x + threadIdx.x; + //int stride = blockDim.x * gridDim.x; - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); + // for(int i = index; i < N; i += stride){ + // int col = ((i % (c * h * w)) % (h * w)) % w; + // int row = ((i % (c * h * w)) % (h * w)) / w; + // int ch = (i % (c * h * w)) / (h * w); + // int n = i / (c * h * w); if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[ch * (b * old_h * w) + n * (old_h * w) + (row * w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col]; + //old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if(row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; + //old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + //old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if((row - start) % x == 0) { int row_index = row - ((row + 1 - start) / x); - int output_index = ch * (b * old_h * w) + n * (old_h * w) + (row_index * w) + col; + int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; + //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = ch * (b * old_h * w) + n * (old_h * w) + (row_index * w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } - } - + int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; + //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; + } + // } } @@ -312,15 +378,18 @@ void convToGemmPerfColHalf(__half * const __restrict__ output, const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) const int w = tx % W_eff; //output width index (col number) - int past_start = (w % (x - 1)) >= (x - 1 - start); + int w_index; + if(w < start) { + w_index = w; + } else { + w_index = ((w - start + 1) * x) / (x - 1) + ((w - start + 1) * x) % (x - 1) + start - 1; + } + const int inW = w_index * H_stride - H_pad; const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = (w / (x - 1) * x + w % (x-1) + - past_start) * H_stride - H_pad; //input width index (col number) if(n < N) { //is thread id within bounds? for(int i = 0; i < KH; i++) { for(int j = 0; j < KW; j++) { const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; @@ -342,36 +411,46 @@ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, __half *old_data, __half *new_data, int x, int start){ - int index = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; + // int index = blockIdx.x * blockDim.x + threadIdx.x; + // int stride = blockDim.x * gridDim.x; - for(int i = index; i < N; i += stride){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n = i / (c * h * w); - int past_start = ((col % x) >= (x - 1 - start)); - - if(col == w-1) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; - else if (col == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; - else if(col % x == x - 1 - start){ - int past_startO = ((col - 1) % x) > (x - 1 - start); - int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + - ((x-1) * ((col - 1) / x) + (col-1) % x - past_startO); - - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2); - } - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + - ((x-1) * (col / x) + col % x - past_start)]; + // for(int i = index; i < N; i += stride){ + // int col = ((i % (c * h * w)) % (h * w)) % w; + // int row = ((i % (c * h * w)) % (h * w)) / w; + // int ch = (i % (c * h * w)) / (h * w); + // int n = i / (c * h * w); - } + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (c * h * w); //output image number + const int ch = tx % (c * h * w) / (h * w); //output chan number + const int row = tx % (h * w) / w; //output height index (row number) + const int col = tx % w; //output width index (col number) + + if(col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + col]; + // old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if(col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; + //old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; + //old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + col_index; + //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + col_index; + //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; + } + // } } __global__ @@ -544,18 +623,23 @@ __global__ void convToGemmFullInput(float * const __restrict__ output, for(int i = 0; i < KH; i++) { for(int j = 0; j < KW; j++) { const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - - if(filter_elem_num % skip_every != skip_every-1-skip_offset) { - int output_col = filter_elem_num - - ((filter_elem_num + skip_every)/skip_every); - if(skip_every == 1) - output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } + int output_col; + if((filter_elem_num - skip_offset) % skip_every != 0) { + output_col = filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - ((filter_elem_num + 1 - skip_offset) % skip_every > 0); + } else if (filter_elem_num < skip_offset) { + output_col = filter_elem_num; + } else { + continue; + } + const int output_index = h * W_out + w; + const int out_index = n * reduced_filter_elem * W_out * H_out + output_index * reduced_filter_elem + output_col; + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + //output[((output_col*N + n) * H_out + h) * W_out + w] = + output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + //output[((output_col*N + n) * H_out + h) * W_out + w] = 0; + } } } @@ -609,12 +693,22 @@ __global__ void createReducedFiltersFull(float * output, const int fIdx = tx / num_filter_elem; //filter index const int offset = tx % num_filter_elem; //offset within filter if(fIdx < NF) { //is thread id within bounds? + //f * reduced_num_filter_elem + kernel_width * h + w; + if(offset < skip_offset) { + output[fIdx * reduced_filter_elem + offset] = input[num_filter_elem * fIdx + offset]; + } else { + int red_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + ((offset - skip_offset + 1) * skip_every) % (skip_every - 1) + skip_offset - 1; + output[fIdx * reduced_filter_elem + offset] = (skip_every * 1.0 / (skip_every - 1)) * input[num_filter_elem * fIdx + red_index]; + } + /* if(offset % skip_every != skip_every-1-skip_offset) { //are we including this filter element? const int output_row = offset - ((offset + skip_every)/skip_every); //correct for skip_every = 2 output[fIdx*reduced_filter_elem + output_row] = (skip_every * 1.0 / (skip_every - 1)) * input[tx]; } + */ } } @@ -672,15 +766,12 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - long int h_eff = h - h / row; - if(h % row > row - 1 - start) - h_eff = h_eff - 1; - + int rem_row = (h - start) % row > 0; + int h_eff = h - ((h - start) / row) - rem_row; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - long int w_eff = w - w / col; - if(w % col > col - 1 - start) - w_eff = w_eff - 1; - + int rem_col = (w - start) % col > 0; + int w_eff = w - ((w - start) / col) - rem_col; Tensor* new_output; if(row > 1){ @@ -855,9 +946,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, } - - - /*** NOTE: tensorConvApprox is the FP32 Baseline routine ***/ @@ -893,15 +981,12 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - long int h_eff = h - h / row; - if(h % row > row - 1 - offset) - h_eff = h_eff - 1; - + int rem_row = (h - offset) % row > 0; + int h_eff = h - ((h - offset) / row) - rem_row; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - long int w_eff = w - w / col; - if(w % col > col - 1 - offset) - w_eff = w_eff - 1; - + int rem_col = (w - offset) % col > 0; + int w_eff = w - ((w - offset) / col) - rem_col; Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, CUDNN_TENSOR_NCHW, n, c, h, w); @@ -936,17 +1021,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, - n)); + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h_eff * w, c, num_filter_elem, + &alpha, + convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, + &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, + n)); + new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); @@ -954,8 +1040,9 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, changeTensorPlacement(new_output, DEVICE); //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, + int blocksize = 128; + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, (float *) output->gpu_data, (float *) new_output->gpu_data, row, offset); @@ -965,7 +1052,6 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, cudaFree(convData); } else if(col > 1){ - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); @@ -1043,7 +1129,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, float* reducedFilter; checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); const int filtBlockSize = 128; - const int filtGridSize = (c * num_filter_elem + filtBlockSize - 1) / filtBlockSize; + const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; if(offset != skip_every) createReducedFiltersFull<<<filtGridSize, filtBlockSize>>>(reducedFilter, (float *)filter->gpu_data, @@ -1055,7 +1141,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, const int blockSize = 128; const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; if(skip_every == 2){ - convToGemmFullInput2<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, + convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], @@ -1104,7 +1190,7 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, int numBlocks = (n * c * h * w + 255) / 256; switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (float *)output->gpu_data, + (float *)output->gpu_data, (float *)new_output->gpu_data); checkCudaErrors(cudaDeviceSynchronize()); @@ -1368,7 +1454,7 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, freeTensor(output_half); cudaFree(convData); - } +} else if(col > 1){ Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);