diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h index e81a78860bc1cdb412b823f5fff58cea4c5048fa..987627e5fb8227e853f66b6ae8c8d33c1b40a638 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h @@ -3,507 +3,507 @@ __global__ void depthwise_conv8(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 8; - - const int b = blockIdx.x * num; - const int m = blockIdx.y; //current filter/channel - - const int tx = threadIdx.x; - - const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; - const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; - - - const float* weights = &w[m * KH * KW]; - float c0 = 0; - float c1 = 0; - float c2 = 0; - float c3 = 0; - float c4 = 0; - float c5 = 0; - float c6 = 0; - float c7 = 0; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - - } - } - - y4d(b, m, 0, tx) = c0; - y4d(b + 1, m, 0, tx) = c1; - y4d(b + 2, m, 0, tx) = c2; - y4d(b + 3, m, 0, tx) = c3; - y4d(b + 4, m, 0, tx) = c4; - y4d(b + 5, m, 0, tx) = c5; - y4d(b + 6, m, 0, tx) = c6; - y4d(b + 7, m, 0, tx) = c7; - - #undef y4d - #undef x4d + const int num = 8; + + const int b = blockIdx.x * num; + const int m = blockIdx.y; //current filter/channel + + const int tx = threadIdx.x; + + const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; + const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; + + + const float* weights = &w[m * KH * KW]; + float c0 = 0; + float c1 = 0; + float c2 = 0; + float c3 = 0; + float c4 = 0; + float c5 = 0; + float c6 = 0; + float c7 = 0; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; + + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { + + c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; + c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; + c2 += x4d(b + 2, m, 
start_h + p, start_w + q) * weights[k]; + c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; + c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; + c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; + c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; + c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; + + } + } + + y4d(b, m, 0, tx) = c0; + y4d(b + 1, m, 0, tx) = c1; + y4d(b + 2, m, 0, tx) = c2; + y4d(b + 3, m, 0, tx) = c3; + y4d(b + 4, m, 0, tx) = c4; + y4d(b + 5, m, 0, tx) = c5; + y4d(b + 6, m, 0, tx) = c6; + y4d(b + 7, m, 0, tx) = c7; + + #undef y4d + #undef x4d } __global__ void depthwise_conv(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, const int H_stride, const int W_stride, const int start_batch) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 1; + const int num = 1; - const int b = num * blockIdx.x + start_batch; - const int m = blockIdx.y; //current filter/channel + const int b = num * blockIdx.x + start_batch; + const int m = blockIdx.y; //current filter/channel - const int tx = threadIdx.x; + const int tx = threadIdx.x; - const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; - const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; + const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; + const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; - float C[num] = { 0 }; + float C[num] = { 0 }; - const float* weights = &w[m * KH * KW]; + const float* weights = &w[m * KH * KW]; - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; - #pragma unroll - for (int i = 0; i < num; i++) { - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { + #pragma unroll + for (int i = 0; i < num; i++) { + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { - C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; - } + C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; + } - } - } + } + } - #pragma unroll - for (int i = 0; i < num; i++) { - if(b + i < B) - y4d(b + i, m, 0, tx) = C[i]; + #pragma unroll + for (int i = 0; i < num; i++) { + if(b + i < B) + y4d(b + i, m, 0, tx) = C[i]; - } + } - #undef y4d - #undef x4d + #undef y4d + #undef x4d } __global__ void depthwise_conv12(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + 
const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 12; + const int num = 12; - const int b = num * blockIdx.x; - const int m = blockIdx.y; //current filter/channel + const int b = num * blockIdx.x; + const int m = blockIdx.y; //current filter/channel - const int tx = threadIdx.x; + const int tx = threadIdx.x; - const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; - const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; + const int start_h = (threadIdx.x / W_out) * H_stride - H_pad; + const int start_w = (threadIdx.x % W_out) * W_stride - W_pad; - float C[num] = { 0 }; + float C[num] = { 0 }; - const float* weights = &w[m * KH * KW]; + const float* weights = &w[m * KH * KW]; - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { - #pragma unroll - for (int i = 0; i < num; i++) { - //if(b + i < B) - C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; - } + #pragma unroll + for (int i = 0; i < num; i++) { + //if(b + i < B) + C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; + } - } - } + } + } - #pragma unroll - for (int i = 0; i < num; i++) { - //if(b + i < B) - y4d(b + i, m, 0, tx) = C[i]; + #pragma unroll + for (int i = 0; i < num; i++) { + //if(b + i < B) + y4d(b + i, m, 0, tx) = C[i]; - } + } - #undef y4d - #undef x4d +#undef y4d +#undef x4d } __global__ void depthwise_convNew(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 12; + const int num = 12; - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - const int start_h = (tx / W_out) * H_stride - 
H_pad; - const int start_w = (tx % W_out) * W_stride - W_pad; + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; - float C[num] = { 0 }; + float C[num] = { 0 }; - const float* weights = &w[m * KH * KW]; + const float* weights = &w[m * KH * KW]; - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { - #pragma unroll - for (int i = 0; i < num; i++) { - if(b + i < B) - C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; - } + #pragma unroll + for (int i = 0; i < num; i++) { + if(b + i < B) + C[i] += x4d(b + i, m, start_h + p, start_w + q) * weights[k]; + } - } - } + } + } - #pragma unroll - for (int i = 0; i < num; i++) { - if(b + i < B) - y4d(b + i, m, 0, tx) = C[i]; + #pragma unroll + for (int i = 0; i < num; i++) { + if(b + i < B) + y4d(b + i, m, 0, tx) = C[i]; - } + } - #undef y4d - #undef x4d + #undef y4d + #undef x4d } __global__ void depthwise_convNew8(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 8; + const int num = 8; - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - - const int start_h = (tx / W_out) * H_stride - H_pad; - const int start_w = (tx % W_out) * W_stride - W_pad; - - float c0 = 0; - float c1 = 0; - float c2 = 0; - float c3 = 0; - float c4 = 0; - float c5 = 0; - float c6 = 0; - float c7 = 0; + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + float c0 = 0; + float c1 = 0; + float c2 = 0; + float c3 = 0; + float c4 = 0; + float c5 = 0; + float c6 = 0; + float c7 = 0; - const float* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - if(b + 1 < B) - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - if(b + 2 < B) - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - if(b + 3 < B) - c3 += x4d(b + 3, m, start_h 
+ p, start_w + q) * weights[k]; - if(b + 4 < B) - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - if(b + 5 < B) - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - if(b + 6 < B) - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - if(b + 7 < B) - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - + const float* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; - } - } + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { - y4d(b, m, 0, tx) = c0; - if(b + 1 < B) - y4d(b + 1, m, 0, tx) = c1; + c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; + if(b + 1 < B) + c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; if(b + 2 < B) - y4d(b + 2, m, 0, tx) = c2; + c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; if(b + 3 < B) - y4d(b + 3, m, 0, tx) = c3; + c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; if(b + 4 < B) - y4d(b + 4, m, 0, tx) = c4; + c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; if(b + 5 < B) - y4d(b + 5, m, 0, tx) = c5; + c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; if(b + 6 < B) - y4d(b + 6, m, 0, tx) = c6; + c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; if(b + 7 < B) - y4d(b + 7, m, 0, tx) = c7; - } + c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; + + + } + } + + y4d(b, m, 0, tx) = c0; + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = c1; + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = c2; + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = c3; + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = c4; + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = c5; + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = c6; + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = c7; + } - #undef y4d - #undef x4d + #undef y4d + #undef x4d } __global__ void depthwise_convNew8_half(__half* const __restrict__ y, - const __half* const __restrict__ x, - const __half* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) + const __half* const __restrict__ x, + const __half* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 8; + const int num = 8; - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - - const int start_h = (tx / W_out) * H_stride - H_pad; - const int start_w = (tx % W_out) * W_stride - W_pad; - - __half c0 = 0; - __half c1 = 0; - __half c2 = 0; - __half c3 = 0; - __half c4 = 0; - __half c5 = 0; - __half c6 = 0; - __half c7 = 0; + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - 
H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + __half c0 = 0; + __half c1 = 0; + __half c2 = 0; + __half c3 = 0; + __half c4 = 0; + __half c5 = 0; + __half c6 = 0; + __half c7 = 0; - const __half* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0); - if(b + 1 < B) - c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1); - if(b + 2 < B) - c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2); - if(b + 3 < B) - c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3); - if(b + 4 < B) - c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4); - if(b + 5 < B) - c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5); - if(b + 6 < B) - c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6); - if(b + 7 < B) - c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7); - + const __half* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; - } - } + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { - y4d(b, m, 0, tx) = c0; - if(b + 1 < B) - y4d(b + 1, m, 0, tx) = c1; + c0 = __hfma(x4d(b, m, start_h + p, start_w + q), weights[k], c0); + if(b + 1 < B) + c1 = __hfma(x4d(b + 1, m, start_h + p, start_w + q), weights[k], c1); if(b + 2 < B) - y4d(b + 2, m, 0, tx) = c2; + c2 = __hfma(x4d(b + 2, m, start_h + p, start_w + q), weights[k], c2); if(b + 3 < B) - y4d(b + 3, m, 0, tx) = c3; + c3 = __hfma(x4d(b + 3, m, start_h + p, start_w + q), weights[k], c3); if(b + 4 < B) - y4d(b + 4, m, 0, tx) = c4; + c4 = __hfma(x4d(b + 4, m, start_h + p, start_w + q), weights[k], c4); if(b + 5 < B) - y4d(b + 5, m, 0, tx) = c5; + c5 = __hfma(x4d(b + 5, m, start_h + p, start_w + q), weights[k], c5); if(b + 6 < B) - y4d(b + 6, m, 0, tx) = c6; + c6 = __hfma(x4d(b + 6, m, start_h + p, start_w + q), weights[k], c6); if(b + 7 < B) - y4d(b + 7, m, 0, tx) = c7; - } + c7 = __hfma(x4d(b + 7, m, start_h + p, start_w + q), weights[k], c7); + + + } + } + + y4d(b, m, 0, tx) = c0; + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = c1; + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = c2; + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = c3; + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = c4; + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = c5; + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = c6; + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = c7; + } - #undef y4d - #undef x4d + #undef y4d + #undef x4d } __global__ void depthwise_convNew12(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) + const float* const __restrict__ x, + const float* const __restrict__ w, + const int B, const int M, + const int H, const int W, const int KH, + const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, + const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] + #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] + 
#define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] - const int num = 12; + const int num = 12; - const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); + const int b = num * blockIdx.x; + const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); - - const int start_h = (tx / W_out) * H_stride - H_pad; - const int start_w = (tx % W_out) * W_stride - W_pad; - - float c0 = 0; - float c1 = 0; - float c2 = 0; - float c3 = 0; - float c4 = 0; - float c5 = 0; - float c6 = 0; - float c7 = 0; - float c8 = 0; - float c9 = 0; - float c10 = 0; - float c11 = 0; + if(m < M){ + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + + const int start_h = (tx / W_out) * H_stride - H_pad; + const int start_w = (tx % W_out) * W_stride - W_pad; + + float c0 = 0; + float c1 = 0; + float c2 = 0; + float c3 = 0; + float c4 = 0; + float c5 = 0; + float c6 = 0; + float c7 = 0; + float c8 = 0; + float c9 = 0; + float c10 = 0; + float c11 = 0; - const float* weights = &w[m * KH * KW]; - - for (int k = 0; k < KH * KW; k++) { - int p = k / KW; - int q = k % KW; - - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - if(b + 1 < B) - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - if(b + 2 < B) - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - if(b + 3 < B) - c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; - if(b + 4 < B) - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - if(b + 5 < B) - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - if(b + 6 < B) - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - if(b + 7 < B) - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - if(b + 8 < B) - c8 += x4d(b + 8, m, start_h + p, start_w + q) * weights[k]; - if(b + 9 < B) - c9 += x4d(b + 9, m, start_h + p, start_w + q) * weights[k]; - if(b + 10 < B) - c10 += x4d(b + 10, m, start_h + p, start_w + q) * weights[k]; - if(b + 11 < B) - c11 += x4d(b + 11, m, start_h + p, start_w + q) * weights[k]; - + const float* weights = &w[m * KH * KW]; + + for (int k = 0; k < KH * KW; k++) { + int p = k / KW; + int q = k % KW; - } - } + if (start_h + p > -1 && start_h + p < H && + start_w + q > -1 && start_w + q < W) { - y4d(b, m, 0, tx) = c0; - if(b + 1 < B) - y4d(b + 1, m, 0, tx) = c1; + c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; + if(b + 1 < B) + c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; if(b + 2 < B) - y4d(b + 2, m, 0, tx) = c2; + c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; if(b + 3 < B) - y4d(b + 3, m, 0, tx) = c3; + c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; if(b + 4 < B) - y4d(b + 4, m, 0, tx) = c4; + c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; if(b + 5 < B) - y4d(b + 5, m, 0, tx) = c5; + c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; if(b + 6 < B) - y4d(b + 6, m, 0, tx) = c6; + c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; if(b + 7 < B) - y4d(b + 7, m, 0, tx) = c7; + c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; if(b + 8 < B) - y4d(b + 8, m, 0, tx) = c8; + c8 += x4d(b + 8, m, start_h + p, start_w + q) * weights[k]; if(b + 9 < B) - y4d(b + 9, m, 0, tx) = c9; + c9 += x4d(b + 9, m, start_h + p, start_w + q) * weights[k]; if(b + 10 < B) - y4d(b + 
10, m, 0, tx) = c10; + c10 += x4d(b + 10, m, start_h + p, start_w + q) * weights[k]; if(b + 11 < B) - y4d(b + 11, m, 0, tx) = c11; + c11 += x4d(b + 11, m, start_h + p, start_w + q) * weights[k]; + + + } + } + + y4d(b, m, 0, tx) = c0; + if(b + 1 < B) + y4d(b + 1, m, 0, tx) = c1; + if(b + 2 < B) + y4d(b + 2, m, 0, tx) = c2; + if(b + 3 < B) + y4d(b + 3, m, 0, tx) = c3; + if(b + 4 < B) + y4d(b + 4, m, 0, tx) = c4; + if(b + 5 < B) + y4d(b + 5, m, 0, tx) = c5; + if(b + 6 < B) + y4d(b + 6, m, 0, tx) = c6; + if(b + 7 < B) + y4d(b + 7, m, 0, tx) = c7; + if(b + 8 < B) + y4d(b + 8, m, 0, tx) = c8; + if(b + 9 < B) + y4d(b + 9, m, 0, tx) = c9; + if(b + 10 < B) + y4d(b + 10, m, 0, tx) = c10; + if(b + 11 < B) + y4d(b + 11, m, 0, tx) = c11; - } + } - #undef y4d - #undef x4d + #undef y4d + #undef x4d } @@ -512,218 +512,219 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups){ - llvm_hpvm_initTensorRt(0); + llvm_hpvm_initTensorRt(0); - INFO("*** TensorConvolution \n"); - profileEvent("Conv"); + INFO("*** TensorConvolution \n"); + profileEvent("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; + Tensor* input = (Tensor*)input_ptr; + Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty - if (conv_groups == 0) { - conv_groups = 1; - } + //FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { + conv_groups = 1; + } - Tensor* output; + Tensor* output; - if (conv_groups > 32) { - // TODO: Support other cases; - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - int n, c, h, w; // output dimensions - n = input->dims.dim_sizes[0]; - c = input->dims.dim_sizes[1]; - const int KH = filter->dims.dim_sizes[2]; - const int KW = filter->dims.dim_sizes[3]; - h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor - - - /* - if (c > 255) { - dim3 grid((n / 16), c); - dim3 block(h * w); - depthwise_conv << <grid, block >> > ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - - }*/ - - /* - dim3 grid((n / 12), c); - dim3 block(h * w); - depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - if(n % 12 > 0){ - dim3 grid2((n % 12), c); - dim3 block(h * w); - depthwise_conv <<<grid, block >>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12)); - } - */ + if (conv_groups > 32) { + // TODO: Support other cases; + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + 
int n, c, h, w; // output dimensions + n = input->dims.dim_sizes[0]; + c = input->dims.dim_sizes[1]; + const int KH = filter->dims.dim_sizes[2]; + const int KW = filter->dims.dim_sizes[3]; + h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + + + /* + if (c > 255) { + dim3 grid((n / 16), c); + dim3 block(h * w); + depthwise_conv << <grid, block >> > ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + + }*/ + + /* + dim3 grid((n / 12), c); + dim3 block(h * w); + depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + if(n % 12 > 0){ + dim3 grid2((n % 12), c); + dim3 block(h * w); + depthwise_conv <<<grid, block >>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12)); + } + */ - int blockSize; - blockSize = 128; + int blockSize; + blockSize = 128; - dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); - dim3 block(blockSize); - depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - - } - else { - - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; - cudnnConvolutionMode_t mode; - if (conv_mode == 0) - mode = CUDNN_CONVOLUTION; - else if (conv_mode == 1) - mode = CUDNN_CROSS_CORRELATION; - - // FIXIT: Need to be more aware of the implications of alpha and beta - float alpha = 1.0f, beta = 0.0f; - - // TODO: Support other cases; - hostToDeviceCopy(input); - hostToDeviceCopy(filter); - - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); - - checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - - // NOTE: Adding support for grouped convolution - checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - - - cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? 
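// Mapping used by the batch-unrolled depthwise kernels above
// (depthwise_convNew8 / depthwise_convNew8_half / depthwise_convNew12):
// the flattened index blockIdx.y * blockDim.x + threadIdx.x is split into a
// channel m and an output pixel tx in [0, H_out * W_out), and each thread
// accumulates that pixel for up to `num` consecutive batch images starting at
// b = num * blockIdx.x, so every weights[k] load is reused `num` times.
// With h = (2 * pad + H - KH) / stride + 1 (and likewise for w), the
// depthwise_convNew8 launch in this function therefore uses ceil(n / 8)
// blocks in x and ceil(c * h * w / blockSize) blocks in y.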
- checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision + dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); + dim3 block(blockSize); + depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data, + (float*)input->gpu_data, (float*)filter->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); + } + else { + cudnnConvolutionDescriptor_t convDesc; + cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionMode_t mode; + if (conv_mode == 0) + mode = CUDNN_CONVOLUTION; + else if (conv_mode == 1) + mode = CUDNN_CROSS_CORRELATION; - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + // FIXIT: Need to be more aware of the implications of alpha and beta + float alpha = 1.0f, beta = 0.0f; - if (input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - else if (input->data_format == CUDNN_TENSOR_NHWC) { - DEBUG("* NHWC Format \n"); - output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else - ERROR("Unsupported Tensor Type"); + // TODO: Support other cases; + hostToDeviceCopy(input); + hostToDeviceCopy(filter); - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - if (convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! \n"); + // NOTE: Adding support for grouped convolution + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); + cudnnDataType_t computeType = CUDNN_DATA_FLOAT; + // FIXIT: Think if upscaling values need to be configurable? + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? 
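// Note on the descriptor call below: in the cuDNN 6+ API the two
// "upscaling values" passed after the strides to
// cudnnSetConvolution2dDescriptor are the dilation factors
// (dilation_h, dilation_w); passing 1, 1 keeps the convolution non-dilated.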
+ checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, + input->tensor_desc, + filter->filter_desc, + &n, &c, &h, &w)); - DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - // FIXIT: Algo shouldn't be hardcoded - convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + if (input->data_format == CUDNN_TENSOR_NCHW) + output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + else if (input->data_format == CUDNN_TENSOR_NHWC) { + DEBUG("* NHWC Format \n"); + output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } + else + ERROR("Unsupported Tensor Type"); - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor - // Allocating memory for the convolution workspace - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - DEBUG("workspace size = %d \n", workspace_size); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], + output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! 
\n"); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - } - cudaDeviceSynchronize(); - profileEvent("Conv_end", true); + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, + input->tensor_desc, + filter->filter_desc, + convDesc, + output->tensor_desc, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, + &convAlgo)); + + + DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); + + // FIXIT: Algo shouldn't be hardcoded + convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - #ifdef ERROR_INJECTION_ENABLED + size_t workspace_size; + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, + input->tensor_desc, + filter->filter_desc, + convDesc, + output->tensor_desc, + convAlgo, + &workspace_size)); - if (op_counter >= total_ops) { - ERROR("No accuracy flag found \n"); - } + // Allocating memory for the convolution workspace + void* workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + DEBUG("workspace size = %d \n", workspace_size); - int op_acc = op_accuracies[op_counter]; - // Skip errorInjection if explicitly requested - if (skip_tensors.find(op_counter) != skip_tensors.end()) { - op_acc = 0; - } + checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, + input->gpu_data, filter->filter_desc, filter->gpu_data, + convDesc, convAlgo, workspace, workspace_size, + &beta, output->tensor_desc, output->gpu_data)); + } + + cudaDeviceSynchronize(); + profileEvent("Conv_end", true); - void* error_norms = tensorAddError(output, op_acc); - add_norms(error_norms, "tensorConv", op_acc); - add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); - op_counter++; + #ifdef ERROR_INJECTION_ENABLED - #endif + if (op_counter >= total_ops) { + ERROR("No accuracy flag found \n"); + } - return output; + int op_acc = op_accuracies[op_counter]; + + // Skip errorInjection if explicitly requested + if (skip_tensors.find(op_counter) != skip_tensors.end()) { + op_acc = 0; + } + + void* error_norms = tensorAddError(output, op_acc); + add_norms(error_norms, "tensorConv", op_acc); + add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); + + op_counter++; + + #endif + + return output; } @@ -819,99 +820,102 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); dim3 block(blockSize); depthwise_convNew8_half<<<grid, block>>> ((__half*)output_half->gpu_data, - (__half*)input_half->gpu_data, (__half*)filter_half->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + (__half*)input_half->gpu_data, (__half*)filter_half->gpu_data, + input->dims.dim_sizes[0], input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], + KH, KW, h, w, + vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); cudaDeviceSynchronize(); } - else{ - checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); + else{ + checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - 
//FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ - conv_groups = 1; - } + //FIXME: Current hack to preserve backward compatibilty + if(conv_groups == 0){ + conv_groups = 1; + } - // NOTE: Adding support for grouped convolution - checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); + // NOTE: Adding support for grouped convolution + checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision - - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + // FIXIT: Think if upscaling values need to be configurable? + // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? + checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, + vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, + input->tensor_desc, + filter->filter_desc, + &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - // FIXIT: more checks for data types needed - output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + // FIXIT: more checks for data types needed + output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF, + CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - // NOTE: Necessary to insert the above call for every output tensor + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], + output->dims.dim_sizes[2], output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! 
\n"); + if(convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! \n"); - // NOTE: The following algo works with TRUE half precision - convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + // NOTE: The following algo works with TRUE half precision + convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input_half->tensor_desc, - filter_half->filter_desc, - convDesc, - output_half->tensor_desc, - convAlgo, - &workspace_size)); - - // Allocating memory for the convolution workspace - DEBUG("workspace size = %d \n", workspace_size); - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input_half->tensor_desc, - input_half->gpu_data, - filter_half->filter_desc, - filter_half->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, - output_half->tensor_desc, - output_half->gpu_data)); + size_t workspace_size; + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, + input_half->tensor_desc, + filter_half->filter_desc, + convDesc, + output_half->tensor_desc, + convAlgo, + &workspace_size)); + + // Allocating memory for the convolution workspace + DEBUG("workspace size = %d \n", workspace_size); + void* workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + + + + + checkCUDNN(cudnnConvolutionForward(cudnnHandle, + &alpha, + input_half->tensor_desc, + input_half->gpu_data, + filter_half->filter_desc, + filter_half->gpu_data, + convDesc, convAlgo, workspace, workspace_size, + &beta, + output_half->tensor_desc, + output_half->gpu_data)); } + profileEvent("H2F_start"); // NOTE: Transforming half precision output to single precision @@ -919,7 +923,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, profileEvent("H2F_end"); - profileEvent("#tensorHalfConv_end"); + profileEvent("#Conv_end"); freeTensor(input_half);