Skip to content
Snippets Groups Projects
Commit 6e70be1c authored by Hashim Sharif's avatar Hashim Sharif
Browse files

Separate the group convolution functions into their own source file

parent 3b0e3f0b
No related branches found
No related tags found
No related merge requests found
......@@ -54,7 +54,7 @@ link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64 $ENV{CUDNN_PATH} $ENV{CUDNN_PATH
set(
RUNTIME_SRCS_FILENAME
approx_simulation.cu
approx_techniques.cu
group_conv.cu
approx_techniques2.cu
common.cpp
configuration.cpp
......
......@@ -1078,6 +1078,43 @@ __global__ void createReducedFiltersHalfIrregular(__half * output,
}
}
// Unrolls convolution input patches into GEMM operands while dropping a
// regular subset of the filter elements.
//
// Produces N COL MAJOR matrices with H_out*W_out rows and reduced_filter_elem
// cols: element (h, w) of column `col` of image n is written at
//   output[((n * reduced_filter_elem + col) * H_out + h) * W_out + w].
//
// Launch layout: 1-D grid of 1-D blocks (only the .x components are read),
// one thread per (n, c, h, w) output position, i.e. at least
// N * C * H_out * W_out threads. Surplus threads exit without touching memory.
//
// Parameters:
//   output              - destination buffer, indexed as shown above.
//   input               - source tensor, indexed as
//                         ((n * C + c) * H + row) * W + col.
//   N, C, H, W          - input image count, channels, height and width.
//   KH, KW              - filter height and width.
//   V_pad, H_pad        - padding applied along the height / width axes.
//   H_out, W_out        - output height and width (must be non-zero: they are
//                         used as divisors when decoding the thread id).
//   V_stride, H_stride  - stride along the height / width axes.
//   reduced_filter_elem - number of columns per output matrix; presumably the
//                         count of filter elements that survive skipping --
//                         TODO confirm against the caller.
//   skip_every          - filter elements whose flat index f satisfies
//                         f % skip_every == skip_every - 1 are dropped.
//                         Must be positive (it is used as a divisor).
//
// All offsets are computed in 64-bit arithmetic: the output offset grows with
// N * reduced_filter_elem * H_out * W_out, which can exceed the range of a
// 32-bit int even when the number of launched threads does not.
__global__ void convToGemmApprox(float * const __restrict__ output,
		       const float * const __restrict__ input, const int N, const int C,
		       const int H, const int W,
		       const int KH, const int KW, const int V_pad,
		       const int H_pad, const int H_out,
		       const int W_out, const int V_stride,
		       const int H_stride, const int reduced_filter_elem,
		       const int skip_every) {
  // Widen before multiplying so the global thread id can neither wrap nor
  // turn negative (a negative id would slip past the bounds check below).
  const long long tx = (long long)blockDim.x * blockIdx.x + threadIdx.x; //thread id
  const long long out_plane = (long long)H_out * W_out; //positions per output channel
  const long long per_image = (long long)C * out_plane; //positions per image

  const long long n = tx / per_image; //output image number
  if(n >= N) //is thread id out of bounds?
    return;

  const int c = (int)(tx % per_image / out_plane); //output chan number
  const int h = (int)(tx % out_plane / W_out); //output height index (row number)
  const int w = (int)(tx % W_out); //output width index (col number)
  const int inH = h * V_stride - V_pad; //input height index of the patch's first row
  const int inW = w * H_stride - H_pad; //input width index of the patch's first col

  for(int i = 0; i < KH; i++) {
    const int in_row = inH + i;
    const bool row_in_bounds = (in_row >= 0) && (in_row < H); //same for every j
    for(int j = 0; j < KW; j++) {
      const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element
      if(filter_elem_num % skip_every == skip_every - 1) //this filter element is dropped
        continue;

      //one element in every skip_every preceding ones was dropped, so the
      //surviving elements shift left by that amount
      const int output_col = filter_elem_num - (filter_elem_num / skip_every);
      const int in_col = inW + j;
      const long long out_idx =
        ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w;

      if(row_in_bounds && in_col >= 0 && in_col < W)
        output[out_idx] = input[((n * C + c) * H + in_row) * W + in_col];
      else
        output[out_idx] = 0.0f; //position falls into the zero padding
    }
  }
}
void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
int vertical_pad, int horizontal_pad, int vertical_stride,
int horizontal_stride, int conv_mode, int conv_groups,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment