diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
deleted file mode 100644
index bdcfb2c5684d1584e1a520194066fc20e3724632..0000000000000000000000000000000000000000
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu
+++ /dev/null
@@ -1,2049 +0,0 @@
-
-#include "tensor_utils.h"
-#include "approx_utils.h"
-#include "debug.h"
-#include "global_data.h"
-#include "fp16_gemm.h"
-#include "fp16_conversion.h"
-#include "profiling.h"
-
-extern "C" {
-
-__global__ void convToGemm(float *const __restrict__ output,
-                           const float *const __restrict input, const int N,
-                           const int C, const int H, const int W, const int KH,
-                           const int KW, const int V_pad, const int H_pad,
-                           const int H_out, const int W_out, const int V_stride,
-                           const int H_stride, const int num_filter_elem) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
-  const int n = tx / (C * H_out * W_out);               // output image number
-  if (n < N) {
-    const int c =
-        tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
-    const int h =
-        tx % (H_out * W_out) / W_out; // output height index (row number)
-    const int w = tx % W_out;         // output width index (col number)
-    const int inH = h * V_stride - V_pad;
-    const int inW = w * H_stride - H_pad;
-    for (int i = 0; i < KH; i++) {
-      for (int j = 0; j < KW; j++) {
-        const int filter_elem_num =
-            (c * KH + i) * KW + j; // index of this filter element
-        const int out_index =
-            ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w;
-        if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-          output[out_index] =
-              input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-        else
-          output[out_index] = 0;
-      }
-    }
-  }
-}
-
-__global__ void convToGemmFullInput(
-    float *const __restrict__ output, const float *const __restrict input,
-    const int N, const int C, const int H, const int W, const int KH,
-    const int KW, const int V_pad, const int H_pad, const int H_out,
-    const int W_out, const int V_stride, const int H_stride,
-    const int skip_every, const int skip_offset) {
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
-  const int n = tx / (C * H_out * W_out);               // output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
-                                                            // number
-  const int h =
-      tx % (H_out * W_out) / W_out; // output height index (row number)
-  const int w = tx % W_out;         // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) { // is thread id within bounds?
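-    // The loop below performs a sampled im2col: filter positions whose
-    // index falls in the residue class (skip_every - 1 - skip_offset)
-    // mod skip_every are dropped (roughly 1/skip_every of the C*KH*KW
-    // window taps), and the survivors are renumbered through output_col
-    // so that the GEMM consuming `output` shrinks by the same fraction.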
-    for (int i = 0; i < KH; i++) {
-      for (int j = 0; j < KW; j++) {
-        const int filter_elem_num =
-            (c * KH + i) * KW + j; // index of this filter element
-        if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) {
-          int output_col =
-              filter_elem_num - ((filter_elem_num + skip_every) / skip_every);
-          if (skip_every == 1)
-            output_col = filter_elem_num;
-          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[((output_col * N + n) * H_out + h) * W_out + w] =
-                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-          else
-            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
-        }
-      }
-    }
-  }
-}
-
-__global__ void
-convToGemmHalfInputNew(__half *const __restrict__ output,
-                       const __half *const __restrict input, const int N,
-                       const int C, const int H, const int W, const int KH,
-                       const int KW, const int V_pad, const int H_pad,
-                       const int H_out, const int W_out, const int V_stride,
-                       const int H_stride, const int reduced_filter_elem,
-                       const int skip_every, const int skip_offset) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
-  const int n = tx / (C * H_out * W_out);               // output image number
-  const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan
-                                                            // number
-  const int h = tx % (H_out * W_out) / W_out; // output height index (row
-                                              // number)
-  const int w = tx % W_out;             // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) { // is thread id within bounds?
-    for (int i = 0; i < KH; i++) {
-      for (int j = 0; j < KW; j++) {
-        const int filter_elem_num =
-            (c * KH + i) * KW + j; // index of this filter element
-        if (filter_elem_num % skip_every != skip_offset) {
-          int output_col =
-              filter_elem_num - (filter_elem_num / skip_every +
-                                 (filter_elem_num % skip_every > skip_offset));
-          if (skip_every == 1)
-            output_col = filter_elem_num;
-          if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
-            output[((output_col * N + n) * H_out + h) * W_out + w] =
-                input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
-          else
-            output[((output_col * N + n) * H_out + h) * W_out + w] = 0;
-        }
-      }
-    }
-  }
-}
-
-__global__ void convToGemmHalf(__half *const __restrict__ output,
-                               const __half *const __restrict input,
-                               const int N, const int C, const int H,
-                               const int W, const int KH, const int KW,
-                               const int V_pad, const int H_pad,
-                               const int H_out, const int W_out,
-                               const int V_stride, const int H_stride) {
-
-  const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id
-  const int n = tx / (C * H_out * W_out);               // output image number
-  const int c =
-      tx % (C * H_out * W_out) / (H_out * W_out); // output chan number
-  const int h =
-      tx % (H_out * W_out) / W_out; // output height index (row number)
-  const int w = tx % W_out;         // output width index (col number)
-  const int inH = h * V_stride - V_pad; // input height index (row number)
-  const int inW = w * H_stride - H_pad; // input width index (col number)
-  if (n < N) { // is thread id within bounds?
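-    // Dense FP16 im2col: every window tap is kept. The write index
-    // ((filter_elem_num * N + n) * H_out + h) * W_out + w lays the matrix
-    // out with filter elements as rows and (image, pixel) pairs as
-    // columns; taps that fall outside the input become zeros, which is
-    // what implements the zero-padding.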
- for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - (c * KH + i) * KW + j; // index of this filter element - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - } else { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; - } - } - } - } -} - -__global__ void convToGemmHalfInputNewIrregular( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int reduced_filter_elem, const int skip_every, - const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_out * W_out); // output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan - // number - const int h = tx % (H_out * W_out) / W_out; // output height index (row - // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - // const int ki = c * KH * KW + i; - // const int kj = c * KH * KW + j; - const int filter_elem_num = - (c * KH + i) * KW + j; // index of this filter element - if ((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = - condition * filter_elem_num + - (!condition) * - (filter_elem_num - - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - // if(filter_elem_num % skip_every != skip_offset) { - // int output_col = filter_elem_num - - // (filter_elem_num/skip_every + (filter_elem_num % skip_every > - // skip_offset)); - // if(skip_every == 1) - // output_col = filter_elem_num; - const int out_index = - ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } - } -} - -__global__ void convToGemmHalfInputNewIrregular2( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int reduced_filter_elem, const int skip_every, - const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_out * W_out); // output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan - // number - const int h = tx % (H_out * W_out) / W_out; // output height index (row - // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? 
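-    // Same skip pattern as convToGemmHalfInputNewIrregular above, but the
-    // result is written in the transposed ((output_col * N + n) * H_out ...)
-    // layout. The `condition` arithmetic below computes the packed column
-    // index without a data-dependent branch.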
- for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - // const int ki = c * KH * KW + i; - // const int kj = c * KH * KW + j; - const int filter_elem_num = - (c * KH + i) * KW + j; // index of this filter element - if ((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = - condition * filter_elem_num + - (!condition) * - (filter_elem_num - - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - // if(filter_elem_num % skip_every != skip_offset) { - // int output_col = filter_elem_num - - // (filter_elem_num/skip_every + (filter_elem_num % skip_every > - // skip_offset)); - // if(skip_every == 1) - // output_col = filter_elem_num; - const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; - //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } - } -} - -__global__ void convToGemmHalf2(__half *const __restrict__ output, - const __half *const __restrict input, - const int N, const int C, const int H, - const int W, const int KH, const int KW, - const int V_pad, const int H_pad, - const int H_out, const int W_out, - const int V_stride, const int H_stride, - const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_out * W_out); // output image number - if (n < N) { - const int c = - tx % (C * H_out * W_out) / (H_out * W_out); // output chan number - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; - const int inW = w * H_stride - H_pad; - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - (c * KH + i) * KW + j; // index of this filter element - const int out_index = - ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } -} - -__global__ void -convToGemmPerfRow(float *const __restrict__ output, - const float *const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, - const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int x, const int start, const int H_eff) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_eff * W_out); // output image number - if (n < N) { - const int c = - tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number - const int h = - tx % (H_eff * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - int h_index; - if (h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + - (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; // input width index (col number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for (int i = 0; 
i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - c * KH * KW + i * KW + j; // index of this filter element - const int out_index = - ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } -} - -__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, - int w, float *old_data, float *new_data, - int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (c * h * w); // output image number - if (n < N) { - const int ch = tx % (c * h * w) / (h * w); // filter number - const int row = tx % (h * w) / w; // output height index (row number) - const int col = tx % w; // output width index (col number) - - if (row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if (row == h - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + - col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if ((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = - n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - w]) / 2; - } else { - int row_index = - row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = - n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[output_index]; - } - } -} - -__global__ void -convToGemmPerfCol(float *const __restrict__ output, - const float *const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, - const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int x, const int start, const int W_eff) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_out * W_eff); // output image number - if (n < N) { - const int c = - tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number - const int h = - tx % (H_out * W_eff) / W_eff; // output height index (row number) - const int w = tx % W_eff; // output width index (col number) - int w_index; - if (w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + - (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; // input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - c * KH * KW + i * KW + j; // index of this filter element - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + - w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + - w] = 0; - } - } - } -} - 
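-// Worked example of the perforation bookkeeping shared by the
-// convToGemmPerf{Row,Col} and approxInterpolate{Row,Col} kernels
-// (numbers for illustration): with x = 3, start = 1, and h = 8 output
-// rows, h_eff = 8 - (8 - 1) / 3 - 1 = 5. Rows {0, 2, 3, 5, 6} are
-// computed, rows {1, 4, 7} are perforated. The interpolation kernel then
-// averages the two vertical neighbours for rows 1 and 4, clamps row 7 to
-// row 6, and recovers the compact index of every copied row via
-//   row_index = row - (row + 1 - start) / x - ((row + 1 - start) % x > 0).
-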
-__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, - int w, float *old_data, float *new_data, - int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (c * h * w); // output image number - if (n < N) { - const int ch = tx % (c * h * w) / (h * w); // output chan number - const int row = tx % (h * w) / w; // output height index (row number) - const int col = tx % w; // output width index (col number) - - if (col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if (col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + - old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if ((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = - n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - 1]) / 2; - } else { - int col_index = - col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = - n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[output_index]; - } - } -} - -__global__ void convToGemmPerfRowHalf( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, const int x, - const int start, const int H_eff) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_eff * W_out); // output image number - if (n < N) { - const int c = - tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number - const int h = - tx % (H_eff * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - int h_index; - if (h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + - (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; // input width index (col number) - // #pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - c * KH * KW + i * KW + j; // index of this filter element - const int out_index = - ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } -} - -__global__ void convToGemmPerfRowHalf2( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, const int x, - const int start, const int H_eff) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // 
thread id - const int n = tx / (C * H_eff * W_out); // output image numbe - if (n < N) { - const int c = - tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number - const int h = - tx % (H_eff * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - int h_index; - if (h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + - (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; // input width index (col number) - // #pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - c * KH * KW + i * KW + j; // index of this filter element - const int out_index = - ((filter_elem_num * N + n) * H_eff + h) * W_out + w; - //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } -} - -__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, - int w, __half *old_data, - __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id - // const int n = tx / (c * h * w); //output image number - const int stride = blockDim.x * gridDim.x; - // if(n < N) { - for (int i = index; i < N; i += stride) { - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - - // const int ch = tx % (c * h * w) / (h * w); //filter number - // const int row = tx % (h * w) / w; //output height index (row number) - // const int col = tx % w; //output width index (col number) - - if (row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if (row == h - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + - col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if ((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = - n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - int row_index = - row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = - n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[output_index]; - } - } -} - -__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, - int w, __half *old_data, - __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id - // const int n = tx / (c * h * w); //output image numbe - const int stride = blockDim.x * gridDim.x; - // if(n < N) { - for (int i = index; i < N; i += stride) { - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - 
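-    // finish decomposing the flattened index i into (n, ch, row, col);
-    // together these invert i = ((n * c + ch) * h + row) * w + col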
const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - - // const int ch = tx % (c * h * w) / (h * w); //filter number - // const int row = tx % (h * w) / w; //output height index (row number) - // const int col = tx % w; //output width index (col number - if (row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col]; - } else if (row == h - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + - col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - } else if ((row - start) % x == 0) { - const int row_index = row - ((row + 1 - start) / x); - const int output_index = - ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; - // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - const int row_index = - row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - const int output_index = - ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; - // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[output_index]; - } - } -} - -__global__ void convToGemmPerfColHalf( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, const int x, - const int start, const int W_eff) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_out * W_eff); // output image number - if (n < N) { - const int c = - tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number - const int h = - tx % (H_out * W_eff) / W_eff; // output height index (row number) - const int w = tx % W_eff; // output width index (col number) - int w_index; - if (w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + - (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; // input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - c * KH * KW + i * KW + j; // index of this filter element - - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + - w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + - w] = 0; - } - } - } -} - -__global__ void convToGemmPerfColHalf2( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, const int x, - const int start, const int W_eff) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = 
tx / (C * H_out * W_eff); // output image number - if (n < N) { - const int c = - tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number - const int h = - tx % (H_out * W_eff) / W_eff; // output height index (row number) - const int w = tx % W_eff; // output width index (col number) - int w_index; - if (w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + - (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; // input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for (int i = 0; i < KH; i++) { - for (int j = 0; j < KW; j++) { - const int filter_elem_num = - c * KH * KW + i * KW + j; // index of this filter elemen - const int out_index = - ((filter_elem_num * N + n) * H_out + h) * W_eff + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } - } -} - -__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, - int w, __half *old_data, - __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int stride = blockDim.x * gridDim.x; - - for (int i = index; i < N; i += stride) { - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - - // const int n = tx / (c * h * w); //output image number - // if(n < N) { - // const int ch = tx % (c * h * w) / (h * w); //output chan number - // const int row = tx % (h * w) / w; //output height index (row number) - // const int col = tx % w; //output width index (col number) - - if (col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if (col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + - old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if ((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = - n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - int col_index = - col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = - n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[output_index]; - } - } -} - -__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, - int w, __half *old_data, - __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int stride = blockDim.x * gridDim.x; - - for (int i = index; i < N; i += stride) { - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - // const int n = tx / (c * h * w); //output image number - // if(n < N) { 
- // const int ch = tx % (c * h * w) / (h * w); //output chan number - // const int row = tx % (h * w) / w; //output height index (row number) - // const int col = tx % w; //output width index (col number) - if (col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col]; - // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if (col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + - old_w - 1]; - // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)]; - // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if ((col - start) % x == 0) { - const int col_index = col - ((col + 1 - start) / x); - const int output_index = - ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; - // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - const int col_index = - col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - const int output_index = - ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; - // const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * - // old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[output_index]; - } - } -} - -__global__ void -convToGemmFullInputRegular(float *const __restrict__ output, - const float *const __restrict input, const int N, - const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, - const int H_out, const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (H_out * W_out); // output image number - if (n < N) { - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - -#pragma unroll - for (int fi = 0; fi < reduced_filter_elem; fi++) { - const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - int in_index; - if (fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + - (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + - offset - 1; - } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = - ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = - input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } - } -} - -__global__ void convToGemmFullInputIrregular( - float *const __restrict__ output, const float *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int 
V_stride, const int H_stride, - const int reduced_filter_elem, const int skip_every, - const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (H_out * W_out); // output image number - if (n < N) { - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - -#pragma unroll - for (int fi = 0; fi < reduced_filter_elem; fi++) { - int in_index; - if (fi < skip_offset) { - in_index = fi; - } else { - in_index = - ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + - (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + - skip_offset - 1; - } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = - ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = - input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } - } -} - -__global__ void createReducedFiltersFullRegular( - float *output, const float *const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, const int skip_every, const int skip_offset, - const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int fIdx = tx / reduced_filter_elem; // filter index - if (fIdx < NF) { - const int offset = tx % reduced_filter_elem; // offset within filter - const int ch = (offset * channels) / reduced_filter_elem; - const int channel_offset = (skip_offset + ch) % skip_every; - int in_index; - if (offset < channel_offset) { - in_index = offset; - } else { - in_index = - ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + - (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > - 0) + - channel_offset - 1; - } - output[fIdx * reduced_filter_elem + offset] = - fac * input[num_filter_elem * fIdx + in_index]; - } -} - -__global__ void createReducedFiltersFullIrregular( - float *output, const float *const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int fIdx = tx / reduced_filter_elem; // filter index - if (fIdx < NF) { - const int offset = tx % reduced_filter_elem; // offset within filter - int in_index; - if (offset < skip_offset) { - in_index = offset; - } else { - in_index = - ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + - (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + - skip_offset - 1; - } - output[fIdx * reduced_filter_elem + offset] = - fac * input[num_filter_elem * fIdx + in_index]; - } -} - -__global__ void -convToGemmHalfInputRegular(__half *const __restrict__ output, - const __half *const __restrict input, const int N, - const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, - const int H_out, const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / 
(C * H_out * W_out); // output image number - if (n < N) { - const int ch = - tx % (C * H_out * W_out) / (H_out * W_out); // output chan number - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - -#pragma unroll - // for(int fi = 0; fi < reduced_filter_elem; fi++) { - // const int ch = (fi * C) / reduced_filter_elem; - for (int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - // int in_index; - const bool condition = (fi < offset); - const int in_index = - condition * fi + - (!condition) * - (((fi - offset + 1) * skip_every) / (skip_every - 1) + - (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + - offset - 1); - // if(fi < offset) { - // in_index = fi; - //} else { - // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) - // + offset - 1; - // } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = - ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = - input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } - } -} - -__global__ void convToGemmHalfInputRegular2( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int reduced_filter_elem, const int skip_every, - const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (C * H_out * W_out); // output image number - if (n < N) { - const int ch = - tx % (C * H_out * W_out) / (H_out * W_out); // output chan number - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - -#pragma unroll - for (int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - // for(int fi = 0; fi < reduced_filter_elem; fi++) { - // const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - const int condition = (fi < offset); - const int in_index = - condition * fi + - (!condition) * - (((fi - offset + 1) * skip_every) / (skip_every - 1) + - (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + - offset - 1); - // int in_index; - // if(fi < offset) { - // in_index = fi; - //} else { - // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > - // 0) + offset - 1; - // } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = - input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - 
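-        // this window tap lies in the zero-padding halo: store explicit 0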
output[out_index] = 0; - } - } - } -} - -__global__ void convToGemmHalfInputIrregular( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int reduced_filter_elem, const int skip_every, - const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (H_out * W_out); // output image number - if (n < N) { - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - -#pragma unroll - for (int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = - condition * fi + - (!condition) * - (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + - (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + - skip_offset - 1); - // int in_index; - // if(fi < skip_offset) { - // in_index = fi; - //} else { - // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - skip_offset + 1) * skip_every) % (skip_every - - // 1) > 0) + skip_offset - 1; - // } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = - ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = - input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } - } -} - -__global__ void convToGemmHalfInputIrregular2( - __half *const __restrict__ output, const __half *const __restrict input, - const int N, const int C, const int H, const int W, const int KH, - const int KW, const int V_pad, const int H_pad, const int H_out, - const int W_out, const int V_stride, const int H_stride, - const int reduced_filter_elem, const int skip_every, - const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int n = tx / (H_out * W_out); // output image number - if (n < N) { - const int h = - tx % (H_out * W_out) / W_out; // output height index (row number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) -#pragma unroll - for (int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = - condition * fi + - (!condition) * - (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + - (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + - skip_offset - 1); - // int in_index; - // if(fi < skip_offset) { - // in_index = fi; - // } else { - // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) - // > 0) + skip_offset - 1; - // } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - // const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * - // W_out + w; - if (inH 
+ i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = - input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } - } -} - -__global__ void createReducedFiltersHalfRegular( - __half *output, const __half *const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, const int skip_every, const int skip_offset, - const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int stride = blockDim.x * gridDim.x; - - //#pragma unroll - for (int i = tx; i < NF; i += stride) { - const int fIdx = i / reduced_filter_elem; // filter index - // if(fIdx < NF) { - const int offset = i % reduced_filter_elem; // offset within filter - const int ch = (offset * channels) / reduced_filter_elem; - const int channel_offset = (skip_offset + ch) % skip_every; - const int condition = (offset < channel_offset); - const int in_index = - condition * offset + - (!condition) * - (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + - (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > - 0) + - channel_offset - 1); - - // int in_index; - // if(offset < channel_offset) { - // in_index = offset; - //} else { - // in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - - // 1) - // + (((offset - channel_offset + 1) * skip_every) % (skip_every - - // 1) > 0) + channel_offset -1; - // } - output[fIdx * reduced_filter_elem + offset] = - __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); - } -} - -__global__ void createReducedFiltersHalfIrregular( - __half *output, const __half *const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int stride = blockDim.x * gridDim.x; - //#pragma unroll - for (int i = tx; i < NF; i += stride) { - - const int fIdx = i / reduced_filter_elem; // filter index - // if(fIdx < NF) { - const int offset = i % reduced_filter_elem; // offset within filter - const int condition = (offset < skip_offset); - int in_index = - condition * offset + - (!condition) * - (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + - (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > - 0) + - skip_offset - 1); - //} - output[fIdx * reduced_filter_elem + offset] = - __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); - //} - } -} - -void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int start) { - - //////INFO("*** TensorConvolution (output perforation) \n"); - // Event("Conv"); - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - // FIXME: Current hack to preserve backward compatibilty - if (conv_groups == 0) { - conv_groups = 1; - } - - Tensor *output; - // TODO: Support other cases; - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - // Event("H2F_start"); - convertToFP32(input); - convertToFP32(filter); - // Event("H2F_end"); - - long int n, c, h, w; // output dimensions - n = input->dims.dim_sizes[0]; - c = filter->dims.dim_sizes[0]; // number of filters - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - - h = (2 * vertical_pad + 
input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - int rem_row = (h - start) % row > 0; - int h_eff = h - ((h - start) / row) - rem_row; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + - 1; - int rem_col = (w - start) % col > 0; - int w_eff = w - ((w - start) / col) - rem_col; - - Tensor *new_output; - if (row > 1) { - output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - // total number of filter elem - const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - - float *convData; - long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 128; - const int gridSize = - (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - convToGemmPerfRow<<<gridSize, blockSize>>>( - convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - row, start, h_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched( - cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, - &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); - - new_output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(new_output, DEVICE); - - // interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateRow<<<numBlocks, 128>>>( - n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, - (float *)new_output->gpu_data, row, start); - cudaDeviceSynchronize(); - - freeTensor(output); - cudaFree(convData); - } else if (col > 1) { - output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - // total number of filter elem - const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - - float *convData; - long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 128; - const int gridSize = - (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>( - convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - col, start, w_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched( - cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, - &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, num_filter_elem, 0, &beta, - (float *)output->gpu_data, 
h * w_eff, c * h * w_eff, n)); - - new_output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(new_output, DEVICE); - - // interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateCol<<<numBlocks, 128>>>( - n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, - (float *)new_output->gpu_data, col, start); - cudaDeviceSynchronize(); - - freeTensor(output); - cudaFree(convData); - } else { - output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - // total number of filter elem - const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - - float *convData; - long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 128; - const int gridSize = - (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - convToGemmApprox<<<gridSize, blockSize>>>( - convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - num_filter_elem, c * h * w); - checkCudaErrors(cudaDeviceSynchronize()); - // Do the matrix multiplication - // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched( - cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem, - &alpha, convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, &beta, - (float *)output->gpu_data, h * w, c * h * w, n)); - - new_output = output; - cudaFree(convData); - } - - // Event("Conv_end"); //, true); - return new_output; -} - -__global__ void switchMatrixFull(int N, int n, int c, int h, int w, - float *old_data, float *new_data) { - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N) { - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row) * w + col] = - old_data[((ch * n + n_new) * h + row) * w + col]; - } -} - -void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset) { - - //////INFO("*** TensorConvolution approximation \n"); - // Event("Conv"); - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - // FIXME: Current hack to preserve backward compatibilty - if (conv_groups == 0) { - conv_groups = 1; - } - - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - ////Event("H2F_start"); - convertToFP32(input); - convertToFP32(filter); - ////Event("H2F_end"); - - const int n = input->dims.dim_sizes[0]; - const int c = filter->dims.dim_sizes[0]; // number of filters - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - const int h = - (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = - (2 * horizontal_pad + 
input->dims.dim_sizes[3] - KW) / horizontal_stride + - 1; - const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - - Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(new_output, DEVICE); - ////INFO("batch: %d\n", n); - ////INFO("channels: %d\n", input->dims.dim_sizes[1]); - ////INFO("num_filters: %d\n", c); - ////INFO("kernel height: %d\n", KH); - ////INFO("kernel width: %d\n", KW); - ////INFO("num_filter_elem: %d\n", num_filter_elem); - ////INFO("vertical_stride: %d\n", vertical_stride); - ////INFO("horizontal_stride: %d\n", horizontal_stride); - ////INFO("output height: %d\n", h); - ////INFO("output width: %d\n", w); - if (row > 1) { - const int rem_row = (h - offset) % row > 0; - const int h_eff = h - ((h - offset) / row) - rem_row; - - Tensor *output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - - float *convData; - long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * - /// input->dims.dim_sizes[1] * h_eff * w)); - const int gridSize = - (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - convToGemmPerfRow<<<gridSize, blockSize>>>( - convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - row, offset, h_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched( - cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, - &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); - // interpolate - int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateRow<<<numBlocks, blocksize>>>( - n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, - (float *)new_output->gpu_data, row, offset); - cudaDeviceSynchronize(); - - freeTensor(output); - cudaFree(convData); - } else if (col > 1) { - const int rem_col = (w - offset) % col > 0; - const int w_eff = w - ((w - offset) / col) - rem_col; - - Tensor *output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - - float *convData; - long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * - /// input->dims.dim_sizes[1] * h * w_eff)); - const int gridSize = - (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>( - convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - col, offset, 
  } else if (col > 1) {
    const int rem_col = (w - offset) % col > 0;
    const int w_eff = w - ((w - offset) / col) - rem_col;

    Tensor *output = (Tensor *)create4DTensor(
        (cudnnDataType_t)float_type, // input->data_type,
        CUDNN_TENSOR_NCHW, n, c, h, w_eff);

    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output, DEVICE);

    float *convData;
    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));

    const int blockSize = 128;
    // INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n",
    //      (n * input->dims.dim_sizes[1] * h * w_eff));
    const int gridSize =
        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;

    convToGemmPerfCol<<<gridSize, blockSize>>>(
        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
        col, offset, w_eff);
    checkCudaErrors(cudaDeviceSynchronize());

    float alpha = 1.0f, beta = 0.0f;
    checkCudaErrors(cublasSgemmStridedBatched(
        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
        &alpha, convData, h * w_eff, num_filter_elem * h * w_eff,
        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
        (float *)output->gpu_data, h * w_eff, c * h * w_eff, n));

    // interpolate the missing columns back into the full-size output
    int blocksize = 128;
    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
    approxInterpolateCol<<<numBlocks, blocksize>>>(
        n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data,
        (float *)new_output->gpu_data, col, offset);
    cudaDeviceSynchronize();

    freeTensor(output);
    cudaFree(convData);
  } else if (skip_every > 1) {
    // reduced number of filter elements after skipping
    const int remainder = ((num_filter_elem - offset) % skip_every > 0);
    const int reduced_filter_elem =
        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;

    float *convData;
    size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    float *reducedFilter;
    checkCudaErrors(
        cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem));

    const int filtBlockSize = 128;
    // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));
    const int filtGridSize =
        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
    const float fac = ((float)skip_every) / ((float)skip_every - 1);
    // INFO("fac: %f\n", fac);
    const int blockSize = 128;
    // INFO("n * h * w : %d\n", (n * h * w));
    const int gridSize = (n * h * w + blockSize - 1) / blockSize;
    if (!(KH * KW % skip_every)) {
      // INFO("REGULAR FILTERING\n");
      createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(
          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
          reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
          fac);
      checkCudaErrors(cudaDeviceSynchronize());
      convToGemmFullInputRegular<<<gridSize, blockSize>>>(
          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
          vertical_pad, horizontal_pad, h, w, vertical_stride,
          horizontal_stride, reduced_filter_elem, skip_every, offset);
    } else {
      // INFO("IRREGULAR FILTERING\n");
      createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(
          reducedFilter, (float *)filter->gpu_data, c, num_filter_elem,
          reduced_filter_elem, skip_every, offset, fac);
      checkCudaErrors(cudaDeviceSynchronize());
      convToGemmFullInputIrregular<<<gridSize, blockSize>>>(
          convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
          vertical_pad, horizontal_pad, h, w, vertical_stride,
          horizontal_stride, reduced_filter_elem, skip_every, offset);
    }
    checkCudaErrors(cudaDeviceSynchronize());

    const float alpha = 1.0;
    const float beta = 0.0;
    checkCudaErrors(cublasSgemmStridedBatched(
        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
        &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter,
        reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w,
        c * h * w, n));
    cudaFree(convData);
    cudaFree(reducedFilter);
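    /* Editor's note, with a worked example of the filter-sampling math:
     * one of every `skip_every` filter elements is dropped. For a 3x3x3
     * filter (num_filter_elem = 27) with skip_every = 3 and offset = 0:
     * (27 - 0) / 3 = 9 elements dropped, remainder = 0, so
     * reduced_filter_elem = 27 - 9 = 18. The surviving weights are scaled
     * by fac = 3 / (3 - 1) = 1.5 inside createReducedFiltersFull* so the
     * dot product keeps roughly the same expected magnitude. */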
  } else {
    // INFO("FP32 BASELINE\n");
    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type,
                                              CUDNN_TENSOR_NCHW, n, c, h, w);
    changeTensorPlacement(output, DEVICE);

    float *convData;
    long int convDataSize = sizeof(float) * n * num_filter_elem * h * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));

    const int blockSize = 128;
    const int gridSize =
        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
    // INFO("n * input->dims.dim_sizes[1] * h * w: %d\n",
    //      (n * input->dims.dim_sizes[1] * h * w));
    convToGemmFullInput<<<gridSize, blockSize>>>(
        convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1],
        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
        skip_every, offset);
    checkCudaErrors(cudaDeviceSynchronize());

    float alpha = 1.0f, beta = 0.0f;
    /*
    checkCudaErrors(cublasSgemmStridedBatched(
        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem,
        &alpha, convData, h * w, num_filter_elem * h * w,
        (float *)filter->gpu_data, num_filter_elem, 0, &beta,
        (float *)new_output->gpu_data, h * w, c * h * w, n));
    */
    checkCudaErrors(cublasGemmEx(
        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
        &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data,
        CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data,
        CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    // Permute the fused-GEMM result back into NCHW layout
    const int numBlocks = (n * c * h * w + 255) / 256;
    switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
                                         (float *)output->gpu_data,
                                         (float *)new_output->gpu_data);

    checkCudaErrors(cudaDeviceSynchronize());
    freeTensor(output);
    cudaFree(convData);
  }

  // Event("Conv_end");
  return new_output;
}

__global__ void switchMatrixHalf(int N, int n, int c, int h, int w,
                                 __half *old_data, __half *new_data) {

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < N) {
    int col = ((i % (c * h * w)) % (h * w)) % w;
    int row = ((i % (c * h * w)) % (h * w)) / w;
    int ch = (i % (c * h * w)) / (h * w);
    int n_new = i / (c * h * w);

    // Source is laid out [ch][n][h][w]; destination is standard NCHW.
    new_data[((n_new * c + ch) * h + row) * w + col] =
        old_data[((ch * n + n_new) * h + row) * w + col];
  }
}
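/* Editor's note on the fused GEMM used by the baseline paths: cuBLAS is
 * column-major, so a call of the form
 *
 *   cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
 *                n * h * w, c, K, ...);
 *
 * computes an (n*h*w x c) product out = convData * filter, where convData
 * is (n*h*w x K) with one im2col patch per row and filter->gpu_data is
 * read as a (K x c) matrix. Read back row-major, the result is laid out
 * [c][n][h][w], which is exactly why switchMatrixFull / switchMatrixHalf
 * must permute it into NCHW afterwards. */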
void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr,
                            int vertical_pad, int horizontal_pad,
                            int vertical_stride, int horizontal_stride,
                            int conv_mode, int conv_groups, int row, int col,
                            int skip_every, int offset) {

  // INFO("*** TensorConvolution half approximation \n");
  // profileEvent("#Conv");

  Tensor *input = (Tensor *)input_ptr;
  Tensor *filter = (Tensor *)filter_ptr;
  // FIXME: Current hack to preserve backward compatibility
  if (conv_groups == 0) {
    conv_groups = 1;
  }

  hostToDeviceCopy(input);
  hostToDeviceCopy(filter);

  // INFO("CONVERT\n");
  profileEvent("F2H_start");
  convertToFP16(input);
  convertToFP16(filter);
  profileEvent("F2H_end");
  // INFO("CONVERTED\n");

  const long int n = input->dims.dim_sizes[0];
  const long int c = filter->dims.dim_sizes[0]; // number of filters
  const int KH = filter->dims.dim_sizes[2];
  const int KW = filter->dims.dim_sizes[3];
  const long int h =
      (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
  const long int w =
      (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride +
      1;
  const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];

  Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
                                                CUDNN_TENSOR_NCHW, n, c, h, w);
  // NOTE: Changing output tensor placement from host to device
  changeTensorPlacement(new_output, DEVICE);
  // INFO("batch: %d\n", n);
  // INFO("channels: %d\n", input->dims.dim_sizes[1]);
  // INFO("num_filters: %d\n", c);
  // INFO("kernel height: %d\n", KH);
  // INFO("kernel width: %d\n", KW);
  // INFO("num_filter_elem: %d\n", num_filter_elem);
  // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem);
  // INFO("vertical_stride: %d\n", vertical_stride);
  // INFO("horizontal_stride: %d\n", horizontal_stride);
  // INFO("output height: %d\n", h);
  // INFO("output width: %d\n", w);
  // INFO("skip_every: %d\n", skip_every);

  if (row > 1) {
    const int rem_row = (h - offset) % row > 0;
    const int h_eff = h - ((h - offset) / row) - rem_row;

    Tensor *output_half = (Tensor *)create4DTensor(
        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w);

    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output_half, DEVICE);

    __half *convData;
    long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    // INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n",
    //      (n * input->dims.dim_sizes[1] * h_eff * w));
    const int blockSize = 256;
    const int gridSize =
        (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize;

    if (h * w <= 64) {
      convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(
          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
          vertical_pad, horizontal_pad, h, w, vertical_stride,
          horizontal_stride, row, offset, h_eff);
    } else {
      convToGemmPerfRowHalf<<<gridSize, blockSize>>>(
          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
          vertical_pad, horizontal_pad, h, w, vertical_stride,
          horizontal_stride, row, offset, h_eff);
    }
    checkCudaErrors(cudaDeviceSynchronize());

    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    if (h * w <= 64) {
      checkCudaErrors(cublasGemmEx(
          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c,
          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w,
          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
          n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
      checkCudaErrors(cublasHgemmStridedBatched(
          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem,
          alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w,
          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
          (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n));
    }

    // interpolate the missing rows back into the full-size output
    int blocksize = 256;
    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
    if (h * w <= 64) {
      approxInterpolateRowHalf2<<<numBlocks, blocksize>>>(
          n * c * h * w, h_eff, n, c, h, w,
          (__half *)output_half->gpu_half_data,
          (__half *)new_output->gpu_half_data, row, offset);
    } else {
      approxInterpolateRowHalf<<<numBlocks, blocksize>>>(
          n * c * h * w, h_eff, n, c, h, w,
          (__half *)output_half->gpu_half_data,
          (__half *)new_output->gpu_half_data, row, offset);
    }
    checkCudaErrors(cudaDeviceSynchronize());

    freeTensor(output_half);
    cudaFree(convData);
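    /* Editor's note: both branches above build the same im2col matrix,
     * just in different layouts. When the spatial output h * w is at most
     * 64 elements, the per-image GEMMs of the strided-batched form are
     * presumably too small to keep the GPU busy, so the "Half2" layout
     * fuses the whole batch into a single (n * h_eff * w) x c cublasGemmEx
     * call; larger outputs use cublasHgemmStridedBatched, whose result
     * needs no layout fix-up. */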
  } else if (col > 1) {
    const int rem_col = (w - offset) % col > 0;
    const int w_eff = w - ((w - offset) / col) - rem_col;

    Tensor *output_half = (Tensor *)create4DTensor(
        (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff);

    // NOTE: Changing output tensor placement from host to device
    changeTensorPlacement(output_half, DEVICE);

    __half *convData;
    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    // INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n",
    //      (n * input->dims.dim_sizes[1] * h * w_eff));
    const int blockSize = 256;
    const int gridSize =
        (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize;
    if (h * w <= 64) {
      convToGemmPerfColHalf2<<<gridSize, blockSize>>>(
          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
          vertical_pad, horizontal_pad, h, w, vertical_stride,
          horizontal_stride, col, offset, w_eff);
    } else {
      convToGemmPerfColHalf<<<gridSize, blockSize>>>(
          convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
          input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
          vertical_pad, horizontal_pad, h, w, vertical_stride,
          horizontal_stride, col, offset, w_eff);
    }
    checkCudaErrors(cudaDeviceSynchronize());

    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    if (h * w <= 64) {
      checkCudaErrors(cublasGemmEx(
          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c,
          num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff,
          (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem,
          beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F,
          n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
      checkCudaErrors(cublasHgemmStridedBatched(
          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem,
          alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff,
          (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half,
          (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n));
    }

    // interpolate the missing columns back into the full-size output
    int blocksize = 256;
    int numBlocks = (n * c * h * w + blocksize - 1) / blocksize;
    if (h * w <= 64) {
      approxInterpolateColHalf2<<<numBlocks, blocksize>>>(
          n * c * h * w, w_eff, n, c, h, w,
          (__half *)output_half->gpu_half_data,
          (__half *)new_output->gpu_half_data, col, offset);
    } else {
      approxInterpolateColHalf<<<numBlocks, blocksize>>>(
          n * c * h * w, w_eff, n, c, h, w,
          (__half *)output_half->gpu_half_data,
          (__half *)new_output->gpu_half_data, col, offset);
    }
    checkCudaErrors(cudaDeviceSynchronize());

    freeTensor(output_half);
    cudaFree(convData);
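    /* Editor's note: the perforated result is only w_eff columns wide, so
     * approxInterpolateColHalf* must widen it back to w columns. The exact
     * reconstruction lives in those kernels (defined earlier in this
     * file); a plausible scheme, shown only as an assumption, fills each
     * skipped column from its computed horizontal neighbors:
     *
     *   out[j] = __hmul(__float2half(0.5f),
     *                   __hadd(out[j - 1], out[j + 1]));  // clamp at edges
     */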
  } else if (skip_every > 1) {
    const int remainder = ((num_filter_elem - offset) % skip_every > 0);
    const int reduced_filter_elem =
        num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder;

    __half *convData;
    size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));
    __half *reducedFilter;
    checkCudaErrors(
        cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem));

    const int filtBlockSize = 256;
    const int filtGridSize =
        (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize;
    const float fac = ((float)skip_every) / ((float)skip_every - 1);
    const int blockSize = 256;
    // const int gridSize = (n * h * w + blockSize - 1) / blockSize;
    // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem));
    // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem));

    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    // Tuned threshold; earlier values tried: 250 and
    // c * reduced_filter_elem < 150000.
    if (c * num_filter_elem < 500000) {
      if (!(KH * KW % skip_every)) {
        // INFO("REGULAR FILTERING\n");
        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
            fac);
        checkCudaErrors(cudaDeviceSynchronize());

        const int gridSize =
            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
        convToGemmHalfInputRegular<<<gridSize, blockSize>>>(
            convData, (__half *)input->gpu_half_data, n,
            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
            w, vertical_stride, horizontal_stride, reduced_filter_elem,
            skip_every, offset);
      } else {
        // INFO("IRREGULAR FILTERING\n");
        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
            reduced_filter_elem, skip_every, offset, fac);
        checkCudaErrors(cudaDeviceSynchronize());

        const int gridSize =
            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
        // convToGemmHalfInputIrregular
        convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(
            convData, (__half *)input->gpu_half_data, n,
            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
            w, vertical_stride, horizontal_stride, reduced_filter_elem,
            skip_every, offset);
      }
      checkCudaErrors(cudaDeviceSynchronize());

      checkCudaErrors(cublasHgemmStridedBatched(
          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem,
          alpha_half, convData, h * w, reduced_filter_elem * h * w,
          reducedFilter, reduced_filter_elem, 0, beta_half,
          (__half *)new_output->gpu_half_data, h * w, c * h * w, n));
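      /* Editor's note: this branch point trades GEMM shape against layout
       * work. For smaller filters (c * num_filter_elem < 500000) the
       * per-image strided-batched Hgemm above writes NCHW output directly.
       * The else-path below instead fuses the batch into one large
       * cublasGemmEx, which utilizes the GPU better for big filters but
       * produces a [c][n][h][w] result that switchMatrixHalf has to
       * permute back into new_output. */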
    } else {
      Tensor *output_half = (Tensor *)create4DTensor(
          (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w);
      changeTensorPlacement(output_half, DEVICE);

      if (!(KH * KW % skip_every)) {
        // INFO("REGULAR FILTERING\n");
        createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(
            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
            reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset,
            fac);
        checkCudaErrors(cudaDeviceSynchronize());

        const int gridSize =
            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
        convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(
            convData, (__half *)input->gpu_half_data, n,
            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
            w, vertical_stride, horizontal_stride, reduced_filter_elem,
            skip_every, offset);
      } else {
        // INFO("IRREGULAR FILTERING\n");
        createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(
            reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem,
            reduced_filter_elem, skip_every, offset, fac);
        checkCudaErrors(cudaDeviceSynchronize());

        const int gridSize =
            (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize;
        convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(
            convData, (__half *)input->gpu_half_data, n,
            input->dims.dim_sizes[1], input->dims.dim_sizes[2],
            input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h,
            w, vertical_stride, horizontal_stride, reduced_filter_elem,
            skip_every, offset);
      }
      checkCudaErrors(cudaDeviceSynchronize());

      checkCudaErrors(cublasGemmEx(
          cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c,
          reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w,
          reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half,
          (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w,
          CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP));

      // Permute the fused-GEMM result back into NCHW layout
      int numBlocks = (n * c * h * w + 255) / 256;
      switchMatrixHalf<<<numBlocks, 256>>>(
          n * c * h * w, n, c, h, w, (__half *)output_half->gpu_half_data,
          (__half *)new_output->gpu_half_data);
      checkCudaErrors(cudaDeviceSynchronize());

      freeTensor(output_half);
    }

    cudaFree(convData);
    cudaFree(reducedFilter);
  } else {
    // INFO("BASELINE\n");
    Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type,
                                              CUDNN_TENSOR_NCHW, n, c, h, w);
    changeTensorPlacement(output, DEVICE);

    __half *convData;
    long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w;
    checkCudaErrors(cudaMalloc(&convData, convDataSize));

    const int blockSize = 256;
    const int gridSize =
        (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize;
    // convToGemmHalf
    convToGemmHalfInputNew<<<gridSize, blockSize>>>(
        convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1],
        input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
        vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride,
        num_filter_elem, skip_every, offset);
    checkCudaErrors(cudaDeviceSynchronize());

    const __half alf = approx_float_to_half(1.0);
    const __half bet = approx_float_to_half(0.0);
    const __half *alpha_half = &alf;
    const __half *beta_half = &bet;
    checkCudaErrors(cublasGemmEx(
        cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem,
        alpha_half, convData, CUDA_R_16F, n * h * w,
        (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half,
        (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F,
        CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    // Permute the fused-GEMM result back into NCHW layout
    const int numBlocks = (n * c * h * w + 255) / 256;
    switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w,
                                         (__half *)output->gpu_half_data,
                                         (__half *)new_output->gpu_half_data);
    checkCudaErrors(cudaDeviceSynchronize());

    freeTensor(output);
    cudaFree(convData);
  }

  // INFO("CONV DONE\n");
  profileEvent("H2F_start");
  // Downstream ops consume FP32, so convert the result back
  convertToFP32_offline(new_output);
  // convertToFP32(input);
  // convertToFP32(filter);
  profileEvent("H2F_end");
  // profileEvent("#Conv_end");
  // INFO("CONVOLUTION END\n");
  return new_output;
}

} // end of extern "C"
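/* Editor's note: a hypothetical call site for the half-precision entry
 * point above. The tensors and the perforation settings here are
 * illustrative only, not taken from the original file:
 *
 *   // 3x3 convolution, stride 1, pad 1; perforate every 2nd output row
 *   // starting at row 1; column perforation and filter sampling disabled.
 *   void *out = tensorConvApproxHalf2(input, filter,
 *                                     1, 1,  // vertical/horizontal pad
 *                                     1, 1,  // vertical/horizontal stride
 *                                     1, 1,  // conv_mode, conv_groups
 *                                     2,     // row (> 1 enables perforation)
 *                                     1,     // col (<= 1: disabled)
 *                                     1,     // skip_every (<= 1: disabled)
 *                                     1);    // offset
 */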