diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu index 78c48d06045fdc62c6ff7681d689f55398f1d05f..ed59139f71043d03591f78c2e5d682e580b6264a 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2.cu @@ -222,9 +222,13 @@ void convToGemmPerfRowHalf(__half * const __restrict__ output, const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) const int w = tx % W_out; //output width index (col number) - int past_start = (h % (x - 1) >= (x - 1 - start)); - const int inH = (h / (x - 1) * x + h % (x-1) + - past_start) * V_stride - V_pad; //input height index (row number) + int h_index; + if(h < start) { + h_index = h; + } else { + h_index = ((h - start + 1) * x) / (x - 1) + ((h - start + 1) * x) % (x - 1) + start - 1; + } + const int inH = h_index * V_stride - V_pad; const int inW = w * H_stride - H_pad; //input width index (col number) if(n < N) { //is thread id within bounds? for(int i = 0; i < KH; i++) { @@ -264,28 +268,26 @@ void approxInterpolateRowHalf(int N, int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); int n = i / (c * h * w); - int past_start = ((row % x) >= (x - 1 - start)); - if(row == h-1) + if(row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] + = old_data[ch * (b * old_h * w) + n * (old_h * w) + (row * w) + col]; + } else if(row == h-1) { new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; - else if (row == 0) - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - else if(row % x == x - 1 - start){ - int past_startO = ((row - 1) % x) > (x - 1 - start); - int oldIdx1 = ch * (b * old_h * w) + n * (old_h * w) + - ((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col; - + old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; + } else if (row == 0) { new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2); + old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + } else if((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = ch * (b * old_h * w) + n * (old_h * w) + (row_index * w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = ch * (b * old_h * w) + n * (old_h * w) + (row_index * w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; } - else - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + - ((x-1) * (row / x) + row % x - past_start ) * (w) + col]; - - } } @@ -1293,15 +1295,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - long int h_eff = h - h / row; - if(h % row > row - 1 - offset) - h_eff = h_eff - 1; + int rem_row = (h - offset) % row > 0; + int h_eff = h - ((h - offset) / row) - rem_row; w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - long int w_eff = w - w / col; - if(w % col > col - 1 - offset) - w_eff = w_eff - 1; - + int rem_col = (w - offset) % col > 0; + int w_eff = w - ((w - offset) / col) - rem_col; INFO("input: %d %d %d %d\n", input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3]);