From 51626467b6c3f4ad837aae971c6b0a7c80d93372 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Mon, 1 Jul 2019 19:06:16 -0500
Subject: [PATCH] Adding inline attributes for all non-tensor API functions

---
 .../tensor_runtime/src/tensor_cpu_runtime.cc | 680 +++++++++---------
 1 file changed, 346 insertions(+), 334 deletions(-)

diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index f6e2010844..e217955b98 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -27,420 +27,432 @@ #include "../include/tensor_cpu.h" #include "../include/tensor_cpu_runtime.h" -void llvm_hpvm_initTensorRt(int gpuid) { - // NOTE: Do Nothing -} -void llvm_hpvm_cleanupTensorRt() { - // NOTE: Do Nothing -} +extern "C"{ -void hpvm_request_tensor(void *tensor, int destination) { - // NOTE: Do Nothing -} + void llvm_hpvm_initTensorRt(int gpuid) { + // NOTE: Do Nothing + } -// Returns the size of the target cudnn datatype -int getTypeSize(int data_type) { - // Float/Int data type - Full Precision - if (data_type == 0) - return 4; - // Half data type - if (data_type == 1) - return 2; + void llvm_hpvm_cleanupTensorRt() { + // NOTE: Do Nothing + } - return 1; -} + void hpvm_request_tensor(void *tensor, int destination) { + // NOTE: Do Nothing + } -void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) { - int type_size = getTypeSize(data_type); - size_t size_in_bytes = type_size * num_elems; - tensor->size_in_bytes = size_in_bytes; -} + // Returns the size of the target cudnn datatype + int getTypeSize(int data_type) __attribute__((always_inline)); + inline int getTypeSize(int data_type) { + // Float/Int data type - Full Precision + if (data_type == 0) + return 4; + // Half data type + if (data_type == 1) + return 2; + + return 1; + } -void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) { - setSizeInBytes(tensor, data_type, num_elems); - tensor->data_type = data_type; - tensor->num_elems = num_elems; - tensor->host_data = + void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); + inline void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) { + int type_size = getTypeSize(data_type); + size_t size_in_bytes = type_size * num_elems; + tensor->size_in_bytes = size_in_bytes; + } + + void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); + inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) { + setSizeInBytes(tensor, data_type, num_elems); + tensor->data_type = data_type; + tensor->num_elems = num_elems; + tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host -} + } -void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { + void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { - Tensor *tensor = (Tensor *)tensor_ptr; - if (tensor->size_in_bytes != size_in_bytes) { - printf("The destination and source sizes don't match"); + Tensor *tensor = (Tensor *)tensor_ptr; + if (tensor->size_in_bytes != size_in_bytes) { + printf("The destination and source sizes don't match"); + } + memcpy(tensor->host_data, data_ptr, size_in_bytes); } - memcpy(tensor->host_data, data_ptr, size_in_bytes); -} -void *create4DTensorCPU(int 
data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size) { - struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, size_t dim4_size) __attribute__((always_inline)); + inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, size_t dim4_size) { - allocateMemCPU(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 4; + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - return tensor; -} + allocateMemCPU(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; -void *tensorAddCPU(void *x_ptr, void *bias_ptr) { + return tensor; + } - Tensor *x = (Tensor *)x_ptr; - Tensor *bias = (Tensor *)bias_ptr; + void *tensorAddCPU(void *x_ptr, void *bias_ptr) { - float *x_data = (float *)x->host_data; - float *bias_data = (float *)bias->host_data; + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; - int n = x->dims.dim_sizes[0]; - int c = x->dims.dim_sizes[1]; - int h = x->dims.dim_sizes[2]; - int w = x->dims.dim_sizes[3]; + float *x_data = (float *)x->host_data; + float *bias_data = (float *)bias->host_data; - size_t num_elems = x->num_elems; - size_t num_elems2 = bias->num_elems; + int n = x->dims.dim_sizes[0]; + int c = x->dims.dim_sizes[1]; + int h = x->dims.dim_sizes[2]; + int w = x->dims.dim_sizes[3]; - if (num_elems == num_elems2) { - for (size_t i = 0; i < num_elems; i++) { - x_data[i] += bias_data[i]; - } - } else { + size_t num_elems = x->num_elems; + size_t num_elems2 = bias->num_elems; - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; - } - } + if (num_elems == num_elems2) { + for (size_t i = 0; i < num_elems; i++) { + x_data[i] += bias_data[i]; + } + } else { + + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; + } + } + } } } - } - return x; -} + return x; + } -void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { + void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { - Tensor *lhs = (Tensor *)lhs_ptr; - Tensor *rhs = (Tensor *)rhs_ptr; + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; - // 'm' holds the batch dimension - assuming NCHW format Tensors - int m = lhs->dims.dim_sizes[0]; - // The rhs must be a 2D tensor - int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons - int k = 1; - // Flattening the dimensions after the batch dimension - // NOTE: Allowing any number of dimensions > 2 for lhs - for (int j = 1; j < lhs->dims.num_dims; j++) { - k = k * lhs->dims.dim_sizes[j]; // input 
neurons - } + // 'm' holds the batch dimension - assuming NCHW format Tensors + int m = lhs->dims.dim_sizes[0]; + // The rhs must be a 2D tensor + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons + int k = 1; + // Flattening the dimensions after the batch dimension + // NOTE: Allowing any number of dimensions > 2 for lhs + for (int j = 1; j < lhs->dims.num_dims; j++) { + k = k * lhs->dims.dim_sizes[j]; // input neurons + } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; - // NOTE: Creating a 4D tensor to be compatible with later called cuDNN - // routines - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); + // NOTE: Creating a 4D tensor to be compatible with later called cuDNN + // routines + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); - float *lhs_arr = (float *)lhs->host_data; - float *rhs_arr = (float *)rhs->host_data; - float *output_arr = (float *)output->host_data; + float *lhs_arr = (float *)lhs->host_data; + float *rhs_arr = (float *)rhs->host_data; + float *output_arr = (float *)output->host_data; - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - float sum = 0.0; - for (int l = 0; l < k; l++) { - float mul = lhs_arr[i * k + l] * rhs_arr[l * n + j]; - sum = sum + mul; + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0.0; + for (int l = 0; l < k; l++) { + float mul = lhs_arr[i * k + l] * rhs_arr[l * n + j]; + sum = sum + mul; + } + output_arr[i * n + j] = sum; } - output_arr[i * n + j] = sum; } - } - return output; -} - -float power(float num, int exp) __attribute__((always_inline)); -inline float power(float num, int exp){ - bool neg = false; - if (exp < 0) { - neg = true; - exp = -1 * exp; + return output; } - float pow = 1; - for (int i = 0; i < exp; i++) { - pow = pow * num; - } + float power(float num, int exp) __attribute__((always_inline)); + inline float power(float num, int exp){ + bool neg = false; + if (exp < 0) { + neg = true; + exp = -1 * exp; + } + + float pow = 1; + for (int i = 0; i < exp; i++) { + pow = pow * num; + } - if(neg) - return 1 / pow; - else - return pow; -} + if(neg) + return 1 / pow; + else + return pow; + } -float epow(float x) __attribute__((always_inline)); -inline float epow(float x){ + float epow(float x) __attribute__((always_inline)); + inline float epow(float x){ - bool neg = false; - if (x < 0) { - x = -1 * x; - neg = true; - } + bool neg = false; + if (x < 0) { + x = -1 * x; + neg = true; + } - float sum = 0.0; - float fac = 1; + float sum = 0.0; + float fac = 1; - //whole number part - float pow = 1; - for (int i = x; i > 0; i--,x--) { - pow = pow * 2.71828; - } - //x++; + //whole number part + float pow = 1; + for (int i = x; i > 0; i--,x--) { + pow = pow * 2.71828; + } + //x++; - // First 15 terms in Taylor Series - for (int i = 0; i < 15; i++) { - sum = sum + power(x, i) / fac; - fac = fac * (i + 1); - } + // First 15 terms in Taylor Series + for (int i = 0; i < 15; i++) { + sum = sum + power(x, i) / fac; + fac = fac * (i + 1); + } - if(neg) - return 1 / (sum * pow); - else - return sum * pow; -} + if(neg) + return 1 / (sum * pow); + else + return sum * pow; + } -float tanh(float num) __attribute__((always_inline)); -inline float tanh(float num){ - float value = epow(2 * num); - value = (value - 1) / (value + 1); - return value; -} + float custom_tanh(float num) __attribute__((always_inline)); + inline float custom_tanh(float num){ + float value = epow(2 * num); + value 
= (value - 1) / (value + 1); + return value; + } -float max(float v1, float v2) __attribute__((always_inline)); -inline float max(float v1, float v2){ - if (v1 < v2) - return v2; - else - return v1; -} + float max(float v1, float v2) __attribute__((always_inline)); + inline float max(float v1, float v2){ + if (v1 < v2) + return v2; + else + return v1; + } -void *tensorReluCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; + void *tensorReluCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - for (size_t i = 0; i < num_elems; i++) { - if (input_data[i] < 0) { - input_data[i] = 0; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + for (size_t i = 0; i < num_elems; i++) { + if (input_data[i] < 0) { + input_data[i] = 0; + } } + + return input; } - return input; -} + void *tensorTanhCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; -void *tensorTanhCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = custom_tanh(input_data[i]); + } - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = tanh(input_data[i]); + return input; } - return input; -} - -void *tensorRelu2CPU(void *input_ptr, float min, float max) { - Tensor *input = (Tensor *)input_ptr; + void *tensorRelu2CPU(void *input_ptr, float min, float max) { + Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - for (size_t i = 0; i < num_elems; i++) { - if (input_data[i] < min) { - input_data[i] = min; - } - if (input_data[i] > max) { - input_data[i] = max; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + for (size_t i = 0; i < num_elems; i++) { + if (input_data[i] < min) { + input_data[i] = min; + } + if (input_data[i] > max) { + input_data[i] = max; + } } - } - return input; -} + return input; + } -void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, - int window_width, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { + void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { - Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; - int output_height = + int output_height = 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) / - horizontal_stride); - - int center_x = (window_width - 1) / 2 - horizontal_pad; - int center_y = (window_height - 1) / 2 - vertical_pad; - int x_radius = (window_width - 1) / 2; - int y_radius = (window_height - 1) / 2; - - Tensor *output = 
(Tensor *)create4DTensorCPU(0, 0, batch_size, channels, - output_height, output_width); - float *output_data = (float *)output->host_data; - - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - float val; - if (poolFunction == 0) - val = -3.40282e+38; // assuming values never fall below min value of float - else - val = 0; - - for (int i = r - y_radius, ki = 0; ki < window_height; i++, ki++) { - for (int j = c - x_radius, kj = 0; kj < window_width; j++, kj++) { - if (i >= 0 && j >= 0 && i < image_height && j < image_width) { - if (poolFunction == 0) - val = max( - val, - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - i * image_width + j]); - else - val += + int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) / + horizontal_stride); + + int center_x = (window_width - 1) / 2 - horizontal_pad; + int center_y = (window_height - 1) / 2 - vertical_pad; + int x_radius = (window_width - 1) / 2; + int y_radius = (window_height - 1) / 2; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels, + output_height, output_width); + float *output_data = (float *)output->host_data; + + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + int ii = 0, jj = 0; + for (int r = center_y; r < image_height + vertical_pad - y_radius; + r += vertical_stride) { + for (int c = center_x; c < image_width + horizontal_pad - x_radius; + c += horizontal_stride) { + float val; + if (poolFunction == 0) + val = -3.40282e+38; // assuming values never fall below min value of float + else + val = 0; + + for (int i = r - y_radius, ki = 0; ki < window_height; i++, ki++) { + for (int j = c - x_radius, kj = 0; kj < window_width; j++, kj++) { + if (i >= 0 && j >= 0 && i < image_height && j < image_width) { + if (poolFunction == 0) + val = max( + val, + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + i * image_width + j]); + else + val += input_data[b * (channels * image_height * image_width) + ch * (image_height * image_width) + i * image_width + j]; - } - } - } - if (poolFunction == 1) - val /= window_height * window_width; - - output_data[b * (channels * output_height * output_width) + - ch * (output_height * output_width) + ii * output_width + - jj] = val; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } - } + } + } + } + if (poolFunction == 1) + val /= window_height * window_width; + + output_data[b * (channels * output_height * output_width) + + ch * (output_height * output_width) + ii * output_width + + jj] = val; + jj++; + if (jj == output_width) { + jj = 0; + ii++; + } + } + } } } + + return output; } - return output; -} + void *tensorSoftmaxCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; -void *tensorSoftmaxCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; + float *logits = (float *)input->host_data; - float *logits = (float *)input->host_data; + int n = input->dims.dim_sizes[0]; + int c = input->dims.dim_sizes[1]; - int n = input->dims.dim_sizes[0]; - int c = input->dims.dim_sizes[1]; + for (int i = 0; i < n; i++) { + float x = 0; + for (int j = 0; j < c; j++) + x += epow(logits[i * c + j]); - for (int i = 0; i < n; i++) { - float x = 0; - for (int j = 0; j < c; j++) - x += 
epow(logits[i * c + j]); + for (int j = 0; j < c; j++) + logits[i * c + j] = epow(logits[i * c + j]) / x; + } - for (int j = 0; j < c; j++) - logits[i * c + j] = epow(logits[i * c + j]) / x; + return input; } - return input; -} - -void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int compute_precision) { + void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *image = (float *)input->host_data; - float *kernels = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - - // kernel centers - int center_x = (kernel_width - 1) / 2 - horizontal_pad; - int center_y = (kernel_height - 1) / 2 - vertical_pad; - - int x_radius = (kernel_width - 1) / 2; - int y_radius = (kernel_height - 1) / 2; - int output_height = + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *image = (float *)input->host_data; + float *kernels = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + + // kernel centers + int center_x = (kernel_width - 1) / 2 - horizontal_pad; + int center_y = (kernel_height - 1) / 2 - vertical_pad; + + int x_radius = (kernel_width - 1) / 2; + int y_radius = (kernel_height - 1) / 2; + int output_height = 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / - horizontal_stride); - - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float *output_data = (float *)output->host_data; - - for (int b = 0; b < batch_size; b++) { - for (int f = 0; f < num_filters; f++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - - float sum = 0; - for (int ch = 0; ch < channels; ch++) { - for (int i = r - y_radius, ki = 0; ki < kernel_height; i++, ki++) { - for (int j = c - x_radius, kj = 0; kj < kernel_width; j++, kj++) { - if (i >= 0 && j >= 0 && i < image_height && j < image_width) { - sum += image[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - i * image_width + j] * - kernels[f * (channels * kernel_height * kernel_width) + - ch * (kernel_height * kernel_width) + - ki * kernel_width + kj]; - } - } - } - } - output_data[b * (num_filters * output_height * output_width) + - f * (output_height * output_width) + ii * output_width + - jj] = sum; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } - } + int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / + 
horizontal_stride); + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float *output_data = (float *)output->host_data; + + for (int b = 0; b < batch_size; b++) { + for (int f = 0; f < num_filters; f++) { + int ii = 0, jj = 0; + for (int r = center_y; r < image_height + vertical_pad - y_radius; + r += vertical_stride) { + for (int c = center_x; c < image_width + horizontal_pad - x_radius; + c += horizontal_stride) { + + float sum = 0; + for (int ch = 0; ch < channels; ch++) { + for (int i = r - y_radius, ki = 0; ki < kernel_height; i++, ki++) { + for (int j = c - x_radius, kj = 0; kj < kernel_width; j++, kj++) { + if (i >= 0 && j >= 0 && i < image_height && j < image_width) { + sum += image[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + i * image_width + j] * + kernels[f * (channels * kernel_height * kernel_width) + + ch * (kernel_height * kernel_width) + + ki * kernel_width + kj]; + } + } + } + } + output_data[b * (num_filters * output_height * output_width) + + f * (output_height * output_width) + ii * output_width + + jj] = sum; + jj++; + if (jj == output_width) { + jj = 0; + ii++; + } + } + } } } + + return output; } - return output; + } -- GitLab
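
For reviewers who want to sanity-check the patched API, a minimal smoke-test driver might look like the sketch below. It is not part of this commit: main(), the include paths, and the example sizes are assumptions; only the function signatures are taken from the diff above (data_type 0 maps to a 4-byte float in getTypeSize, and tensors use NCHW layout).

#include <cstdio>
#include "tensor_cpu.h"          // struct Tensor (include path is an assumption)
#include "tensor_cpu_runtime.h"  // CPU tensor API (include path is an assumption)

int main() {
  // One 1x1x4x4 input image and one 1x1x3x3 filter, both 32-bit float (data_type 0).
  void *input  = create4DTensorCPU(0, 0, 1, 1, 4, 4);
  void *filter = create4DTensorCPU(0, 0, 1, 1, 3, 3);

  float image[16], kernel[9];
  for (int i = 0; i < 16; i++) image[i]  = 1.0f;
  for (int i = 0; i < 9;  i++) kernel[i] = 0.5f;
  initTensorData(input,  image,  sizeof(image));   // sizes must match size_in_bytes
  initTensorData(filter, kernel, sizeof(kernel));

  // Pad 1, stride 1 with a 3x3 kernel keeps the 4x4 spatial size:
  // output_height = 1 + (4 - 3 + 2*1)/1 = 4.
  void *conv = tensorConvolutionCPU(input, filter, 1, 1, 1, 1, 0, 0);
  void *out  = tensorReluCPU(conv);

  printf("output elems: %zu\n", ((struct Tensor *)out)->num_elems); // expect 16
  return 0;
}

Note the pattern the patch applies throughout: each helper is declared once with __attribute__((always_inline)) and then defined inline, and the whole file now sits in an extern "C" block. The attribute asks the compiler to inline these helpers at every call site, while extern "C" keeps the symbol names unmangled, presumably so the HPVM compiler can locate these entry points when lowering tensor operations to this CPU runtime.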