diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h index 987627e5fb8227e853f66b6ae8c8d33c1b40a638..14f1b058a194cd7c971eaedb9bab0573d21ac64b 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h @@ -2,72 +2,7 @@ -__global__ void depthwise_conv8(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ - - - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - - const int num = 8; - - const int b = blockIdx.x * num; - const int m = blockIdx.y; //current filter/channel - - const int tx = threadIdx.x; - - const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; - const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; - - const float* weights = &w[m * KH * KW]; - float c0 = 0; - float c1 = 0; - float c2 = 0; - float c3 = 0; - float c4 = 0; - float c5 = 0; - float c6 = 0; - float c7 = 0; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - - } - } - - y4d(b, m, 0, tx) = c0; - y4d(b + 1, m, 0, tx) = c1; - y4d(b + 2, m, 0, tx) = c2; - y4d(b + 3, m, 0, tx) = c3; - y4d(b + 4, m, 0, tx) = c4; - y4d(b + 5, m, 0, tx) = c5; - y4d(b + 6, m, 0, tx) = c6; - y4d(b + 7, m, 0, tx) = c7; - - #undef y4d - #undef x4d -} __global__ void depthwise_conv(float* const __restrict__ y, @@ -124,60 +59,6 @@ __global__ void depthwise_conv(float* const __restrict__ y, #undef x4d } -__global__ void depthwise_conv12(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ - - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - - const int num = 12; - - const int b = num * blockIdx.x; - const int m = blockIdx.y; //current filter/channel - - const int tx = threadIdx.x; - - const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; - const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; - - float C[num] = { 0 }; - - const float* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - #pragma unroll - 
for (int i = 0; i < num; i++) { - //if(b + i < B) - C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; - } - - } - } - - #pragma unroll - for (int i = 0; i < num; i++) { - //if(b + i < B) - y4d(b + i, m, 0, tx) = C[i]; - - } - - -#undef y4d -#undef x4d -} __global__ void depthwise_convNew(float* const __restrict__ y, const float* const __restrict__ x, @@ -400,6 +281,165 @@ __global__ void depthwise_convNew8_half(__half* const __restrict__ y, #undef x4d } +__global__ void depthwise_convNew8_half1(__half* const __restrict__ y, + const __half* const __restrict__ x, + const __half* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) +{ + + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + + const int num = 8; + + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + __half c0 = 0; + __half c1 = 0; + __half c2 = 0; + __half c3 = 0; + __half c4 = 0; + __half c5 = 0; + __half c6 = 0; + __half c7 = 0; + + const __half* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0); + } + } + + if(b + 1 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1); + } + } + } + + if(b + 2 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2); + } + } + } + + if(b + 3 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3); + } + } + } + + if(b + 4 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4); + } + } + } + + if(b + 5 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5); + } + } + } + + if(b + 6 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6); + } + } + } + + if(b + 7 < B){ + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < 
W) { + + c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7); + } + } + } + + + + y4d(b, m, 0, tx) = c0; + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = c1; + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = c2; + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = c3; + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = c4; + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = c5; + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = c6; + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = c7; + } + + #undef y4d + #undef x4d +} __global__ void depthwise_convNew12(float* const __restrict__ y, @@ -507,6 +547,374 @@ __global__ void depthwise_convNew12(float* const __restrict__ y, } +__global__ void depthwise_convNew12_half(__half* const __restrict__ y, + const __half* const __restrict__ x, + const __half* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) +{ + + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + + const int num = 12; + + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + __half c0 = 0; + __half c1 = 0; + __half c2 = 0; + __half c3 = 0; + __half c4 = 0; + __half c5 = 0; + __half c6 = 0; + __half c7 = 0; + __half c8 = 0; + __half c9 = 0; + __half c10 = 0; + __half c11 = 0; + + const __half* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0); + if(b + 1 < B) + c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1); + if(b + 2 < B) + c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2); + if(b + 3 < B) + c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3); + if(b + 4 < B) + c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4); + if(b + 5 < B) + c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5); + if(b + 6 < B) + c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6); + if(b + 7 < B) + c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7); + if(b + 8 < B) + c8 = __hfma(x4d(b + 8, m, start_h + p, start_w + q), weights[k], c8); + if(b + 9 < B) + c9 = __hfma(x4d(b + 9, m, start_h + p, start_w + q), weights[k], c9); + if(b + 10 < B) + c10 = __hfma(x4d(b + 10, m, start_h + p, start_w + q), weights[k], c10); + if(b + 11 < B) + c11 = __hfma(x4d(b + 11, m, start_h + p, start_w + q), weights[k], c11); + + + } + } + + y4d(b, m, 0, tx) = c0; + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = c1; + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = c2; + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = c3; + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = c4; + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = c5; + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = c6; + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = c7; + if(b + 8 < B) + y4d(b + 8, m, 0, tx) = c8; + if(b + 9 < B) + y4d(b + 9, m, 0, tx) = c9; + if(b + 10 < B) + y4d(b + 10, m, 0, tx) = c10; + if(b + 11 < B) + y4d(b + 11, m, 0, tx) = c11; + + } + + #undef y4d + #undef x4d +} + + 
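+// Batched __half2 variant of depthwise_convNew8_half: each thread computes one
+// output position of channel m for up to 8 images of the batch, packing pairs
+// of batch elements into __half2 so every __hfma2 performs two fused
+// multiply-adds on the same filter weight. The if/else ladder below handles a
+// partial final batch, and results are unpacked with __high2half / __low2half
+// before each write, guarded by b + i < B checks.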
+__global__ void depthwise_convNew8_half2(__half* const __restrict__ y, + const __half* const __restrict__ x, + const __half* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) +{ + + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + + const int num = 8; + + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + __half2 c0 = __half2half2(0); + __half2 c1 = __half2half2(0); + __half2 c2 = __half2half2(0); + __half2 c3 = __half2half2(0); + + const __half* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + + __half2 t1; + __half2 t2; + __half2 t3; + __half2 t4; + if(b + 7 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q)); + } + else if(b + 6 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); + + } + else if(b + 5 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); + } + else if(b + 4 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); + + } + else if(b + 3 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + } + else if(b + 2 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); + + } + else if(b + 1 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + } + else{ + t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); + + } + + + c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); + c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); + c2 = __hfma2(t3, 
__halves2half2(weights[k], weights[k]), c2);
+	  c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3);
+
+	}
+      }
+
+      y4d(b, m, 0, tx) = __high2half(c0);
+      if(b + 1 < B)
+	y4d(b + 1, m, 0, tx) = __low2half(c0);
+      if(b + 2 < B)
+	y4d(b + 2, m, 0, tx) = __high2half(c1);
+      if(b + 3 < B)
+	y4d(b + 3, m, 0, tx) = __low2half(c1);
+      if(b + 4 < B)
+	y4d(b + 4, m, 0, tx) = __high2half(c2);
+      if(b + 5 < B)
+	y4d(b + 5, m, 0, tx) = __low2half(c2);
+      if(b + 6 < B)
+	y4d(b + 6, m, 0, tx) = __high2half(c3);
+      if(b + 7 < B)
+	y4d(b + 7, m, 0, tx) = __low2half(c3);
+  }
+
+  #undef y4d
+  #undef x4d
+}
+
+
+// When stride is 1
+__global__ void depthwise_conv4_half3(__half* const __restrict__ y,
+	const __half* const __restrict__ x,
+	const __half* const __restrict__ w,
+	const int B, const int M,
+	const int H, const int W, const int KH,
+	const int KW, const int H_out, const int W_out,
+	const int H_pad, const int W_pad,
+	const int C_dim, const int H_dim, const int W_dim)
+{
+
+#define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+#define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 1;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+  if (m < M) {
+    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) - H_pad;
+    const int start_w = (tx % W_out) - W_pad;
+
+    const int bstart_h = (blockIdx.y * blockDim.x % (H_out * W_out)) / W_out - H_pad;
+    const int bstart_w = (blockIdx.y * blockDim.x % (H_out * W_out)) % W_out - W_pad;
+    const int bstartm = (blockIdx.y * blockDim.x / (H_out * W_out));
+
+    extern __shared__ __half xdata[];
+
+    // Cooperative load: the block's input tile is staged in shared memory,
+    // with each thread striding over the tile by blockDim.x.
+    for (int i = threadIdx.x; i < C_dim * H_dim * W_dim; i += blockDim.x) {
+      if (i / (H_dim * W_dim) + bstartm < M && (i % (H_dim * W_dim)) / W_dim + bstart_h > -1 &&
+          (i % (H_dim * W_dim)) / W_dim + bstart_h < H && (i % (H_dim * W_dim)) % W_dim + bstart_w > -1 &&
+          (i % (H_dim * W_dim)) % W_dim + bstart_w < W) {
+        xdata[i] = x4d(b, i / (H_dim * W_dim) + bstartm, (i % (H_dim * W_dim)) / W_dim + bstart_h,
+                       (i % (H_dim * W_dim)) % W_dim + bstart_w);
+      }
+    }
+    __syncthreads();
+
+    __half c0 = 0;
+    const __half* weights = &w[m * KH * KW];
+
+    for (int k = 0; k < KH * KW; k++) {
+      int p = k / KW;
+      int q = k % KW;
+      if (start_h + p > -1 && start_h + p < H &&
+          start_w + q > -1 && start_w + q < W) {
+
+        __half t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim +
+                          start_w + q - bstart_w];
+
+        c0 = __hfma(t1, weights[k], c0);
+      }
+    }
+
+    y4d(b, m, 0, tx) = c0;
+
+  }
+
+#undef y4d
+#undef x4d
+}
+
+
+__global__ void depthwise_convNew4_half2(__half* const __restrict__ y,
+	const __half* const __restrict__ x,
+	const __half* const __restrict__ w,
+	const int B, const int M,
+	const int H, const int W, const int KH,
+	const int KW, const int H_out, const int W_out,
+	const int H_pad, const int W_pad,
+	const int H_stride, const int W_stride)
+{
+
+  #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0]
+  #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0]
+
+  const int num = 4;
+
+  const int b = num * blockIdx.x;
+  const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out);
+
+  if(m < M){
+    const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out);
+
+    const int start_h = (tx / W_out) * H_stride - 
H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + __half2 c0 = __half2half2(0); + __half2 c1 = __half2half2(0); + + const __half* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + + __half2 t1; + __half2 t2; + if(b + 3 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); + } + else if(b + 2 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); + + } + else if(b + 1 < B){ + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); + } + else{ + t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); + + } + + + c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); + c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); + + } + } + + y4d(b, m, 0, tx) = __high2half(c0); + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = __low2half(c0); + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = __high2half(c1); + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = __low2half(c1); + + } + + #undef y4d + #undef x4d +} + + void* tensorConvCutlass(void* input_ptr, void* filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, @@ -578,7 +986,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, */ int blockSize; - blockSize = 128; + blockSize = 64; dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); dim3 block(blockSize); @@ -797,10 +1205,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, const int KW = filter->dims.dim_sizes[3]; int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - + output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, CUDNN_TENSOR_NCHW, n, c, h, w); @@ -816,10 +1223,10 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, int blockSize; blockSize = 128; - + dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); dim3 block(blockSize); - depthwise_convNew8_half<<<grid, block>>> ((__half*)output_half->gpu_data, + depthwise_convNew8_half2<<<grid, block>>> ((__half*)output_half->gpu_data, (__half*)input_half->gpu_data, (__half*)filter_half->gpu_data, input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], @@ -934,6 +1341,213 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, } +void* tensorHalfConvCutlass2(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups){ + + INFO("*** TensorHConvolution \n"); + profileEvent("#Conv"); + + Tensor* input = (Tensor*)input_ptr; + Tensor* filter = (Tensor*)filter_ptr; + + cudnnConvolutionDescriptor_t convDesc; + cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionMode_t mode; + if (conv_mode == 0) + mode = CUDNN_CONVOLUTION; + else if (conv_mode == 1) + mode = CUDNN_CROSS_CORRELATION; + + // FIXIT: Need to be more aware of the implications of alpha and beta + float alpha = 1.0f, beta = 0.0f; + // NOTE: compute in half precision + 
cudnnDataType_t computeType = CUDNN_DATA_HALF; + + // NOTE: Moving inputs to GPU global memory + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + + /***** CONVERSIONS from FP32 to FP16 - on the GPU */ + size_t* input_dims = input->dims.dim_sizes; + size_t* filter_dims = filter->dims.dim_sizes; + + + profileEvent("F2H_start"); + + Tensor* input_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, + input_dims[0], input_dims[1], + input_dims[2], input_dims[3]); + + + changeTensorPlacement(input_half, DEVICE); + Tensor* filter_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, + filter_dims[0], filter_dims[1], + filter_dims[2], filter_dims[3]); + + + changeTensorPlacement(filter_half, DEVICE); + + + f2h((float*)input->gpu_data, input->num_elems, (half*)input_half->gpu_data); + f2h((float*)filter->gpu_data, filter->num_elems, (half*)filter_half->gpu_data); + + + /******* END OF INPUT DATA CONVERSIONS*/ + profileEvent("F2H_end"); + + Tensor* output; + Tensor* output_half; + + + if (conv_groups > 1 && horizontal_stride == 1 && vertical_stride == 1) { + int n = input->dims.dim_sizes[0]; + int c = input->dims.dim_sizes[1]; + const int KH = filter->dims.dim_sizes[2]; + const int KW = filter->dims.dim_sizes[3]; + int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + + + output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + // FIXIT: more checks for data types needed + output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, + CUDNN_TENSOR_NCHW, n, c, h, w); + + + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + int blockSize; + blockSize = 128; + + dim3 grid(((n + 3) / 4), (c * h * w + blockSize - 1) / blockSize); + dim3 block(blockSize); + int C_dim = blockSize / (h * w) + 1 + 1; + int H_dim = blockSize % (h * w) / w + 1 + KH + 1; + int W_dim = blockSize % (h * w) % w + 1 + KW + 1; + depthwise_conv4_half3 << <grid, block, sizeof(__half)* C_dim* H_dim* W_dim >> > ((__half*)output_half->gpu_data, + (__half*)input_half->gpu_data, (__half*)filter_half->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, + vertical_pad, horizontal_pad, C_dim, H_dim, W_dim); + cudaDeviceSynchronize(); + + + } + else { + checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); + + //FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { + conv_groups = 1; + } + + // NOTE: Adding support for grouped convolution + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); + + + // FIXIT: Think if upscaling values need to be configurable? + // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? 
+ checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, + input->tensor_desc, + filter->filter_desc, + &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + + + output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + // FIXIT: more checks for data types needed + output_half = (Tensor*)create4DTensor(CUDNN_DATA_HALF, + CUDNN_TENSOR_NCHW, n, c, h, w); + + + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], + output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! \n"); + + + // NOTE: The following algo works with TRUE half precision + convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + + + size_t workspace_size; + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, + input_half->tensor_desc, + filter_half->filter_desc, + convDesc, + output_half->tensor_desc, + convAlgo, + &workspace_size)); + + // Allocating memory for the convolution workspace + DEBUG("workspace size = %d \n", workspace_size); + void* workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + + + + + checkCUDNN(cudnnConvolutionForward(cudnnHandle, + &alpha, + input_half->tensor_desc, + input_half->gpu_data, + filter_half->filter_desc, + filter_half->gpu_data, + convDesc, convAlgo, workspace, workspace_size, + &beta, + output_half->tensor_desc, + output_half->gpu_data)); + + } + + profileEvent("H2F_start"); + + // NOTE: Transforming half precision output to single precision + h2f((half*)output_half->gpu_data, output->num_elems, (float*)output->gpu_data); + + profileEvent("H2F_end"); + + profileEvent("#Conv_end"); + + + freeTensor(input_half); + freeTensor(filter_half); + freeTensor(output_half); + + return output; + +} + // Perforated Tensor Conv with 'perforation_rate' parameter void* tensorConvPerf(void* input, void* filter,