Skip to content
Snippets Groups Projects
Commit 16585eb6 authored by Hashim Sharif's avatar Hashim Sharif
Browse files

Removing commented-out code from approx_techniques.cu (hpvm-tensor-rt)

parent a42f98e9
No related branches found
No related tags found
No related merge requests found
...@@ -169,19 +169,13 @@ __global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ outp ...@@ -169,19 +169,13 @@ __global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ outp
if(n < N) { //is thread id within bounds? if(n < N) { //is thread id within bounds?
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
//const int ki = c * KH * KW + i;
//const int kj = c * KH * KW + j;
const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
if((filter_elem_num - skip_offset) % skip_every) { if((filter_elem_num - skip_offset) % skip_every) {
const int condition = (filter_elem_num < skip_offset); const int condition = (filter_elem_num < skip_offset);
const int output_col = condition * filter_elem_num const int output_col = condition * filter_elem_num
+ (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
- ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
//if(filter_elem_num % skip_every != skip_offset) {
// int output_col = filter_elem_num -
// (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
//if(skip_every == 1)
// output_col = filter_elem_num;
const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
//((output_col*N + n) * H_out + h) * W_out + w; //((output_col*N + n) * H_out + h) * W_out + w;
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
...@@ -213,22 +207,16 @@ __global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ out ...@@ -213,22 +207,16 @@ __global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ out
if(n < N) { //is thread id within bounds? if(n < N) { //is thread id within bounds?
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
//const int ki = c * KH * KW + i;
//const int kj = c * KH * KW + j; const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
if((filter_elem_num - skip_offset) % skip_every) { if((filter_elem_num - skip_offset) % skip_every) {
const int condition = (filter_elem_num < skip_offset); const int condition = (filter_elem_num < skip_offset);
const int output_col = condition * filter_elem_num const int output_col = condition * filter_elem_num
+ (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every)
- ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - ((filter_elem_num + 1 - skip_offset) % skip_every > 0));
//if(filter_elem_num % skip_every != skip_offset) {
// int output_col = filter_elem_num -
// (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset));
//if(skip_every == 1)
// output_col = filter_elem_num;
const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; const int out_index = ((output_col * N + n) * H_out + h) * W_out + w;
//((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;
//((output_col*N + n) * H_out + h) * W_out + w
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
else else
...@@ -288,15 +276,13 @@ __global__ void convToGemmPerfRow(float * const __restrict__ output, ...@@ -288,15 +276,13 @@ __global__ void convToGemmPerfRow(float * const __restrict__ output,
} }
const int inH = h_index * V_stride - V_pad; const int inH = h_index * V_stride - V_pad;
const int inW = w * H_stride - H_pad; //input width index (col number) const int inW = w * H_stride - H_pad; //input width index (col number)
//#pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element
const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w;
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
else else
output[out_index] = 0; output[out_index] = 0;
...@@ -357,11 +343,7 @@ __global__ void convToGemmPerfCol(float * const __restrict__ output, ...@@ -357,11 +343,7 @@ __global__ void convToGemmPerfCol(float * const __restrict__ output,
} }
const int inW = w_index * H_stride - H_pad; const int inW = w_index * H_stride - H_pad;
const int inH = h * V_stride - V_pad; //input height index (row number) const int inH = h * V_stride - V_pad; //input height index (row number)
//#pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
...@@ -427,11 +409,8 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, ...@@ -427,11 +409,8 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output,
} }
const int inH = h_index * V_stride - V_pad; const int inH = h_index * V_stride - V_pad;
const int inW = w * H_stride - H_pad; //input width index (col number) const int inW = w * H_stride - H_pad; //input width index (col number)
// #pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
...@@ -465,38 +444,31 @@ __global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output, ...@@ -465,38 +444,31 @@ __global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output,
} }
const int inH = h_index * V_stride - V_pad; const int inH = h_index * V_stride - V_pad;
const int inW = w * H_stride - H_pad; //input width index (col number) const int inW = w * H_stride - H_pad; //input width index (col number)
// #pragma unroll
//for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW; for(int i = 0; i < KH; i++) {
// int j = ki % KW; for(int j = 0; j < KW; j++) {
for(int i = 0; i < KH; i++) { const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
for(int j = 0; j < KW; j++) { const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w;
const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w; if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
//((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) else
output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; output[out_index] = 0;
else
output[out_index] = 0; }
} }
}
} }
} }
__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w, __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w,
__half *old_data, __half *new_data, int x, int start) { __half *old_data, __half *new_data, int x, int start) {
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int n = tx / (c * h * w); //output image number
//const int stride = blockDim.x * gridDim.x;
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
const int n = tx / (c * h * w); //output image number const int n = tx / (c * h * w); //output image number
if(n < N) { if(n < N) {
//for(int i = index; i < N; i += stride){
//const int col = ((i % (c * h * w)) % (h * w)) % w;
//const int row = ((i % (c * h * w)) % (h * w)) / w;
//const int ch = (i % (c * h * w)) / (h * w);
// const int n = i / (c * h * w);
const int ch = tx % (c * h * w) / (h * w); //filter number const int ch = tx % (c * h * w) / (h * w); //filter number
const int row = tx % (h * w) / w; //output height index (row number) const int row = tx % (h * w) / w; //output height index (row number)
...@@ -527,17 +499,9 @@ __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, ...@@ -527,17 +499,9 @@ __global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h,
__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w, __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w,
__half *old_data, __half *new_data, int x, int start) { __half *old_data, __half *new_data, int x, int start) {
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int n = tx / (c * h * w); //output image numbe
//const int stride = blockDim.x * gridDim.x;
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
const int n = tx / (c * h * w); //output image number const int n = tx / (c * h * w); //output image number
if(n < N) { if(n < N) {
//for(int i = index; i < N; i += stride){
//const int col = ((i % (c * h * w)) % (h * w)) % w;
//const int row = ((i % (c * h * w)) % (h * w)) / w;
//const int ch = (i % (c * h * w)) / (h * w);
//const int n = i / (c * h * w);
const int ch = tx % (c * h * w) / (h * w); //filter number const int ch = tx % (c * h * w) / (h * w); //filter number
const int row = tx % (h * w) / w; //output height index (row number) const int row = tx % (h * w) / w; //output height index (row number)
...@@ -554,13 +518,11 @@ __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, ...@@ -554,13 +518,11 @@ __global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h,
} else if((row - start) % x == 0) { } else if((row - start) % x == 0) {
const int row_index = row - ((row + 1 - start) / x); const int row_index = row - ((row + 1 - start) / x);
const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
//n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
__hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2);
} else { } else {
const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0);
const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col;
//n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col;
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
} }
} }
...@@ -587,11 +549,7 @@ __global__ void convToGemmPerfColHalf(__half * const __restrict__ output, ...@@ -587,11 +549,7 @@ __global__ void convToGemmPerfColHalf(__half * const __restrict__ output,
} }
const int inW = w_index * H_stride - H_pad; const int inW = w_index * H_stride - H_pad;
const int inH = h * V_stride - V_pad; //input height index (row number) const int inH = h * V_stride - V_pad; //input height index (row number)
//#pragma unroll
// for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element
...@@ -626,10 +584,8 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output, ...@@ -626,10 +584,8 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
} }
const int inW = w_index * H_stride - H_pad; const int inW = w_index * H_stride - H_pad;
const int inH = h * V_stride - V_pad; //input height index (row number) const int inH = h * V_stride - V_pad; //input height index (row number)
//#pragma unroll
// for (int ki = 0; ki < KH * KW; ki++) {
// int i = ki / KW;
// int j = ki % KW;
for(int i = 0; i < KH; i++) { for(int i = 0; i < KH; i++) {
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen
...@@ -647,15 +603,6 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output, ...@@ -647,15 +603,6 @@ __global__ void convToGemmPerfColHalf2(__half * const __restrict__ output,
__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w,
__half *old_data, __half *new_data, int x, int start) { __half *old_data, __half *new_data, int x, int start) {
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int stride = blockDim.x * gridDim.x;
//for(int i = index; i < N; i += stride){
// const int col = ((i % (c * h * w)) % (h * w)) % w;
// const int row = ((i % (c * h * w)) % (h * w)) / w;
// const int ch = (i % (c * h * w)) / (h * w);
// const int n = i / (c * h * w);
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
const int n = tx / (c * h * w); //output image number const int n = tx / (c * h * w); //output image number
if(n < N) { if(n < N) {
...@@ -688,14 +635,6 @@ __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, ...@@ -688,14 +635,6 @@ __global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h,
__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w, __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w,
__half *old_data, __half *new_data, int x, int start) { __half *old_data, __half *new_data, int x, int start) {
//const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int stride = blockDim.x * gridDim.x;
// for(int i = index; i < N; i += stride){
// const int col = ((i % (c * h * w)) % (h * w)) % w;
// const int row = ((i % (c * h * w)) % (h * w)) / w;
// const int ch = (i % (c * h * w)) / (h * w);
// const int n = i / (c * h * w);
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
const int n = tx / (c * h * w); //output image number const int n = tx / (c * h * w); //output image number
if(n < N) { if(n < N) {
...@@ -705,25 +644,23 @@ __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, ...@@ -705,25 +644,23 @@ __global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h,
if(col < start) { if(col < start) {
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col]
= old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col]; = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col];
//n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col];
} else if(col == w - 1) { } else if(col == w - 1) {
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1];
//n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1];
} else if (col == 0) { } else if (col == 0) {
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)];
//n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)];
} else if((col - start) % x == 0) { } else if((col - start) % x == 0) {
const int col_index = col - ((col + 1 - start) / x); const int col_index = col - ((col + 1 - start) / x);
const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
//n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] =
__hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2);
} else { } else {
const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0);
const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index;
//const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index;
new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index];
} }
} }
...@@ -759,6 +696,7 @@ __global__ void convToGemmFullInputRegular(float * const __restrict__ output, ...@@ -759,6 +696,7 @@ __global__ void convToGemmFullInputRegular(float * const __restrict__ output,
in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
} }
const int i = (in_index % (KW * KH)) / KW; const int i = (in_index % (KW * KH)) / KW;
const int j = in_index % KW; const int j = in_index % KW;
const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
...@@ -809,13 +747,15 @@ __global__ void convToGemmFullInputIrregular(float * const __restrict__ output, ...@@ -809,13 +747,15 @@ __global__ void convToGemmFullInputIrregular(float * const __restrict__ output,
} }
} }
} }
} }
__global__ void createReducedFiltersFullRegular(float * output, __global__ void createReducedFiltersFullRegular(float * output,
const float * const __restrict input, const int NF, const float * const __restrict input, const int NF,
const int num_filter_elem, const int reduced_filter_elem, const int num_filter_elem, const int reduced_filter_elem,
const int channels, const int channels,
const int skip_every, const int skip_offset, const float fac) { const int skip_every, const int skip_offset, const float fac) {
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
const int fIdx = tx / reduced_filter_elem; //filter index const int fIdx = tx / reduced_filter_elem; //filter index
...@@ -826,11 +766,13 @@ __global__ void createReducedFiltersFullRegular(float * output, ...@@ -826,11 +766,13 @@ __global__ void createReducedFiltersFullRegular(float * output,
int in_index; int in_index;
if(offset < channel_offset) { if(offset < channel_offset) {
in_index = offset; in_index = offset;
} else { }
else {
in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
+ (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
} }
output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index];
} }
} }
...@@ -873,30 +815,23 @@ __global__ void convToGemmHalfInputRegular(__half * const __restrict__ output, ...@@ -873,30 +815,23 @@ __global__ void convToGemmHalfInputRegular(__half * const __restrict__ output,
const int inH = h * V_stride - V_pad; //input height index (row number) const int inH = h * V_stride - V_pad; //input height index (row number)
const int inW = w * H_stride - H_pad; //input width index (col number) const int inW = w * H_stride - H_pad; //input width index (col number)
#pragma unroll #pragma unroll
//for(int fi = 0; fi < reduced_filter_elem; fi++) {
//const int ch = (fi * C) / reduced_filter_elem;
for(int ki = 0; ki < reduced_filter_elem / C; ki++) { for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
const int fi = ch * (reduced_filter_elem / C) + ki; const int fi = ch * (reduced_filter_elem / C) + ki;
const int offset = (skip_offset + ch) % skip_every; const int offset = (skip_offset + ch) % skip_every;
//int in_index;
const bool condition = (fi < offset); const bool condition = (fi < offset);
const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
//if(fi < offset) {
// in_index = fi; const int i = (in_index % (KW * KH)) / KW;
//} else { const int j = in_index % KW;
// in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
// + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
// } output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
const int i = (in_index % (KW * KH)) / KW; } else {
const int j = in_index % KW;
const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
} else {
output[out_index] = 0; output[out_index] = 0;
} }
} }
} }
} }
...@@ -922,26 +857,20 @@ __global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output, ...@@ -922,26 +857,20 @@ __global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output,
#pragma unroll #pragma unroll
for(int ki = 0; ki < reduced_filter_elem / C; ki++) { for(int ki = 0; ki < reduced_filter_elem / C; ki++) {
const int fi = ch * (reduced_filter_elem / C) + ki;
//for(int fi = 0; fi < reduced_filter_elem; fi++) { const int fi = ch * (reduced_filter_elem / C) + ki;
// const int ch = (fi * C) / reduced_filter_elem;
const int offset = (skip_offset + ch) % skip_every; const int offset = (skip_offset + ch) % skip_every;
const int condition = (fi < offset); const int condition = (fi < offset);
const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1);
// int in_index;
//if(fi < offset) {
// in_index = fi;
//} else {
// in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
// + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
// }
const int i = (in_index % (KW * KH)) / KW; const int i = (in_index % (KW * KH)) / KW;
const int j = in_index % KW; const int j = in_index % KW;
const int out_index = ((fi * N + n) * H_out + h) * W_out + w; const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
} else { }
else {
output[out_index] = 0; output[out_index] = 0;
} }
} }
...@@ -971,20 +900,15 @@ __global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output, ...@@ -971,20 +900,15 @@ __global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output,
const int condition = (fi < skip_offset); const int condition = (fi < skip_offset);
const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
//int in_index;
//if(fi < skip_offset) { const int ch = in_index / (KW * KH);
// in_index = fi;
//} else {
// in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
// + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
// }
const int ch = in_index / (KW * KH);
const int i = (in_index % (KW * KH)) / KW; const int i = (in_index % (KW * KH)) / KW;
const int j = in_index % KW; const int j = in_index % KW;
const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
} else { }
else {
output[out_index] = 0; output[out_index] = 0;
} }
} }
...@@ -1013,18 +937,11 @@ __global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output ...@@ -1013,18 +937,11 @@ __global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output
const int condition = (fi < skip_offset); const int condition = (fi < skip_offset);
const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
// int in_index;
// if(fi < skip_offset) {
// in_index = fi;
// } else {
// in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1)
// + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1;
// }
const int ch = in_index / (KW * KH); const int ch = in_index / (KW * KH);
const int i = (in_index % (KW * KH)) / KW; const int i = (in_index % (KW * KH)) / KW;
const int j = in_index % KW; const int j = in_index % KW;
const int out_index = ((fi * N + n) * H_out + h) * W_out + w; const int out_index = ((fi * N + n) * H_out + h) * W_out + w;
//const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) {
output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)];
} else { } else {
...@@ -1042,11 +959,8 @@ __global__ void createReducedFiltersHalfRegular(__half * output, ...@@ -1042,11 +959,8 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
const int skip_every, const int skip_offset, const float fac) { const int skip_every, const int skip_offset, const float fac) {
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int stride = blockDim.x * gridDim.x;
const int fIdx = tx / reduced_filter_elem; //filter index
//#pragma unroll
//for (int i = tx; i < NF; i += stride) {
const int fIdx = tx / reduced_filter_elem; //filter index
if(fIdx < NF) { if(fIdx < NF) {
const int offset = tx % reduced_filter_elem; //offset within filter const int offset = tx % reduced_filter_elem; //offset within filter
const int ch = (offset * channels) / reduced_filter_elem; const int ch = (offset * channels) / reduced_filter_elem;
...@@ -1055,15 +969,9 @@ __global__ void createReducedFiltersHalfRegular(__half * output, ...@@ -1055,15 +969,9 @@ __global__ void createReducedFiltersHalfRegular(__half * output,
const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
+ (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1); + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1);
// int in_index;
// if(offset < channel_offset) {
// in_index = offset;
//} else {
// in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1)
// + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1;
// }
output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
} }
} }
__global__ void createReducedFiltersHalfIrregular(__half * output, __global__ void createReducedFiltersHalfIrregular(__half * output,
...@@ -1071,21 +979,20 @@ __global__ void createReducedFiltersHalfIrregular(__half * output, ...@@ -1071,21 +979,20 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
const int num_filter_elem, const int reduced_filter_elem, const int num_filter_elem, const int reduced_filter_elem,
const int skip_every, const int skip_offset, const float fac) { const int skip_every, const int skip_offset, const float fac) {
const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id
//const int stride = blockDim.x * gridDim.x; const int fIdx = tx / reduced_filter_elem; //filter index
//#pragma unroll
//for (int i = tx; i < NF; i += stride) {
const int fIdx = tx / reduced_filter_elem; //filter index if(fIdx < NF) {
if(fIdx < NF) {
const int offset = tx % reduced_filter_elem; //offset within filter const int offset = tx % reduced_filter_elem; //offset within filter
const int condition = (offset < skip_offset); const int condition = (offset < skip_offset);
int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1)
+ (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1);
//}
output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]);
//}
} }
} }
...@@ -1112,7 +1019,7 @@ __global__ void convToGemmApprox(float * const __restrict__ output, ...@@ -1112,7 +1019,7 @@ __global__ void convToGemmApprox(float * const __restrict__ output,
for(int j = 0; j < KW; j++) { for(int j = 0; j < KW; j++) {
const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element? if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element?
const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account
if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W)
output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)];
else else
...@@ -1130,8 +1037,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, ...@@ -1130,8 +1037,6 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
int horizontal_stride, int conv_mode, int conv_groups, int horizontal_stride, int conv_mode, int conv_groups,
int row, int col, int start){ int row, int col, int start){
//////INFO("*** TensorConvolution (output perforation) \n");
//Event("Conv");
Tensor* input = (Tensor*)input_ptr; Tensor* input = (Tensor*)input_ptr;
Tensor* filter = (Tensor*)filter_ptr; Tensor* filter = (Tensor*)filter_ptr;
//FIXME: Current hack to preserve backward compatibilty //FIXME: Current hack to preserve backward compatibilty
...@@ -1144,10 +1049,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, ...@@ -1144,10 +1049,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
hostToDeviceCopy(input); hostToDeviceCopy(input);
hostToDeviceCopy(filter); hostToDeviceCopy(filter);
//Event("H2F_start");
convertToFP32(input); convertToFP32(input);
convertToFP32(filter); convertToFP32(filter);
//Event("H2F_end");
long int n, c, h, w; // output dimensions long int n, c, h, w; // output dimensions
n = input->dims.dim_sizes[0]; n = input->dims.dim_sizes[0];
...@@ -1221,14 +1124,14 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, ...@@ -1221,14 +1124,14 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
freeTensor(output); freeTensor(output);
cudaFree(convData); cudaFree(convData);
} else if(col > 1){ }
else if(col > 1){
output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
CUDNN_TENSOR_NCHW, n, c, h, w_eff); CUDNN_TENSOR_NCHW, n, c, h, w_eff);
// NOTE: Changing output tensor placement from host to device // NOTE: Changing output tensor placement from host to device
changeTensorPlacement(output, DEVICE); changeTensorPlacement(output, DEVICE);
// NOTE: Necessary to insert the above call for every output tensor
//total number of filter elem
const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1];
float * convData; float * convData;
...@@ -1550,7 +1453,8 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ...@@ -1550,7 +1453,8 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr,
cudaFree(convData); cudaFree(convData);
cudaFree(reducedFilter); cudaFree(reducedFilter);
} else { } else {
INFO("FP32 BASELINE\n");
//INFO("FP32 BASELINE\n");
Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, n, c, h, w); CUDNN_TENSOR_NCHW, n, c, h, w);
changeTensorPlacement(output, DEVICE); changeTensorPlacement(output, DEVICE);
...@@ -1996,14 +1900,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, ...@@ -1996,14 +1900,12 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr,
freeTensor(output); freeTensor(output);
cudaFree(convData); cudaFree(convData);
} }
// INFO("CONV DONE\n");
profileEvent("H2F_start"); profileEvent("H2F_start");
convertToFP32_offline(new_output); convertToFP32_offline(new_output);
//convertToFP32(input);
//convertToFP32(filter);
profileEvent("H2F_end"); profileEvent("H2F_end");
//profileEvent("#Conv_end");
//INFO("CONVOLUTION END\n");
return new_output; return new_output;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment