From 51626467b6c3f4ad837aae971c6b0a7c80d93372 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Mon, 1 Jul 2019 19:06:16 -0500
Subject: [PATCH] Adding inline attributes for all non-tensor API functions

---
 .../tensor_runtime/src/tensor_cpu_runtime.cc  | 680 +++++++++---------
 1 file changed, 346 insertions(+), 334 deletions(-)

diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index f6e2010844..e217955b98 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -27,420 +27,432 @@
 #include "../include/tensor_cpu.h"
 #include "../include/tensor_cpu_runtime.h"
 
-void llvm_hpvm_initTensorRt(int gpuid) {
-  // NOTE: Do Nothing
-}
 
-void llvm_hpvm_cleanupTensorRt() {
-  // NOTE: Do Nothing
-}
+extern "C"{
 
-void hpvm_request_tensor(void *tensor, int destination) {
-  // NOTE: Do Nothing
-}
+  void llvm_hpvm_initTensorRt(int gpuid) {
+    // NOTE: Do Nothing
+  }
 
-// Returns the size of the target cudnn datatype
-int getTypeSize(int data_type) {
-  // Float/Int data type - Full Precision
-  if (data_type == 0)
-    return 4;
-  // Half data type
-  if (data_type == 1)
-    return 2;
+  void llvm_hpvm_cleanupTensorRt() {
+    // NOTE: Do Nothing
+  }
 
-  return 1;
-}
+  void hpvm_request_tensor(void *tensor, int destination) {
+    // NOTE: Do Nothing
+  }
 
-void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
-  int type_size = getTypeSize(data_type);
-  size_t size_in_bytes = type_size * num_elems;
-  tensor->size_in_bytes = size_in_bytes;
-}
+  // Returns the size of the target cudnn datatype
+  int getTypeSize(int data_type) __attribute__((always_inline));
+  inline int getTypeSize(int data_type) {
+    // Float/Int data type - Full Precision
+    if (data_type == 0)
+      return 4;
+    // Half data type
+    if (data_type == 1)
+      return 2;
+
+    return 1;
+  }
 
-void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) {
-  setSizeInBytes(tensor, data_type, num_elems);
-  tensor->data_type = data_type;
-  tensor->num_elems = num_elems;
-  tensor->host_data =
+  void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline));
+  inline void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
+    int type_size = getTypeSize(data_type);
+    size_t size_in_bytes = type_size * num_elems;
+    tensor->size_in_bytes = size_in_bytes;
+  }
+
+  void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); 
+  inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) {
+    setSizeInBytes(tensor, data_type, num_elems);
+    tensor->data_type = data_type;
+    tensor->num_elems = num_elems;
+    tensor->host_data =
       (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-}
+  }
 
-void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
+  void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
-  Tensor *tensor = (Tensor *)tensor_ptr;
-  if (tensor->size_in_bytes != size_in_bytes) {
-    printf("The destination and source sizes don't match");
+    Tensor *tensor = (Tensor *)tensor_ptr;
+    if (tensor->size_in_bytes != size_in_bytes) {
+      printf("The destination and source sizes don't match");
+    }
+    memcpy(tensor->host_data, data_ptr, size_in_bytes);
   }
-  memcpy(tensor->host_data, data_ptr, size_in_bytes);
-}
 
-void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                        size_t dim2_size, size_t dim3_size, size_t dim4_size) {
 
-  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
+			  size_t dim2_size, size_t dim3_size, size_t dim4_size) __attribute__((always_inline));
+  inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
+			  size_t dim2_size, size_t dim3_size, size_t dim4_size) {
 
-  allocateMemCPU(tensor, data_type, num_elems);
-  // Setting the tensor dimensions
-  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-  dim_sizes[0] = dim1_size;
-  dim_sizes[1] = dim2_size;
-  dim_sizes[2] = dim3_size;
-  dim_sizes[3] = dim4_size;
-  tensor->dims.dim_sizes = dim_sizes;
-  tensor->dims.num_dims = 4;
+    struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
+    size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
 
-  return tensor;
-}
+    allocateMemCPU(tensor, data_type, num_elems);
+    // Setting the tensor dimensions
+    size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
+    dim_sizes[0] = dim1_size;
+    dim_sizes[1] = dim2_size;
+    dim_sizes[2] = dim3_size;
+    dim_sizes[3] = dim4_size;
+    tensor->dims.dim_sizes = dim_sizes;
+    tensor->dims.num_dims = 4;
 
-void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
+    return tensor;
+  }
 
-  Tensor *x = (Tensor *)x_ptr;
-  Tensor *bias = (Tensor *)bias_ptr;
+  void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
 
-  float *x_data = (float *)x->host_data;
-  float *bias_data = (float *)bias->host_data;
+    Tensor *x = (Tensor *)x_ptr;
+    Tensor *bias = (Tensor *)bias_ptr;
 
-  int n = x->dims.dim_sizes[0];
-  int c = x->dims.dim_sizes[1];
-  int h = x->dims.dim_sizes[2];
-  int w = x->dims.dim_sizes[3];
+    float *x_data = (float *)x->host_data;
+    float *bias_data = (float *)bias->host_data;
 
-  size_t num_elems = x->num_elems;
-  size_t num_elems2 = bias->num_elems;
+    int n = x->dims.dim_sizes[0];
+    int c = x->dims.dim_sizes[1];
+    int h = x->dims.dim_sizes[2];
+    int w = x->dims.dim_sizes[3];
 
-  if (num_elems == num_elems2) {
-    for (size_t i = 0; i < num_elems; i++) {
-      x_data[i] += bias_data[i];
-    }
-  } else {
+    size_t num_elems = x->num_elems;
+    size_t num_elems2 = bias->num_elems;
 
-    for (int i = 0; i < n; i++) {
-      for (int j = 0; j < c; j++) {
-        for (int k = 0; k < h; k++) {
-          for (int l = 0; l < w; l++) {
-            x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
-          }
-        }
+    if (num_elems == num_elems2) {
+      for (size_t i = 0; i < num_elems; i++) {
+	x_data[i] += bias_data[i];
+      }
+    } else {
+
+      for (int i = 0; i < n; i++) {
+	for (int j = 0; j < c; j++) {
+	  for (int k = 0; k < h; k++) {
+	    for (int l = 0; l < w; l++) {
+	      x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
+	    }
+	  }
+	}
       }
     }
-  }
 
-  return x;
-}
+    return x;
+  }
 
-void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
+  void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
 
-  Tensor *lhs = (Tensor *)lhs_ptr;
-  Tensor *rhs = (Tensor *)rhs_ptr;
+    Tensor *lhs = (Tensor *)lhs_ptr;
+    Tensor *rhs = (Tensor *)rhs_ptr;
 
-  // 'm' holds the batch dimension - assuming NCHW format Tensors
-  int m = lhs->dims.dim_sizes[0];
-  // The rhs must be a 2D tensor
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
-  int k = 1;
-  // Flattening the dimensions after the batch dimension
-  // NOTE: Allowing any number of dimensions > 2 for lhs
-  for (int j = 1; j < lhs->dims.num_dims; j++) {
-    k = k * lhs->dims.dim_sizes[j]; // input neurons
-  }
+    // 'm' holds the batch dimension - assuming NCHW format Tensors
+    int m = lhs->dims.dim_sizes[0];
+    // The rhs must be a 2D tensor
+    int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
+    int k = 1;
+    // Flattening the dimensions after the batch dimension
+    // NOTE: Allowing any number of dimensions > 2 for lhs
+    for (int j = 1; j < lhs->dims.num_dims; j++) {
+      k = k * lhs->dims.dim_sizes[j]; // input neurons
+    }
 
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
+    int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
 
-  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN
-  // routines
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
+    // NOTE: Creating a 4D tensor to be compatible with later called cuDNN
+    // routines
+    Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
 
-  float *lhs_arr = (float *)lhs->host_data;
-  float *rhs_arr = (float *)rhs->host_data;
-  float *output_arr = (float *)output->host_data;
+    float *lhs_arr = (float *)lhs->host_data;
+    float *rhs_arr = (float *)rhs->host_data;
+    float *output_arr = (float *)output->host_data;
 
-  for (int i = 0; i < m; i++) {
-    for (int j = 0; j < n; j++) {
-      float sum = 0.0;
-      for (int l = 0; l < k; l++) {
-        float mul = lhs_arr[i * k + l] * rhs_arr[l * n + j];
-        sum = sum + mul;
+    for (int i = 0; i < m; i++) {
+      for (int j = 0; j < n; j++) {
+	float sum = 0.0;
+	for (int l = 0; l < k; l++) {
+	  float mul = lhs_arr[i * k + l] * rhs_arr[l * n + j];
+	  sum = sum + mul;
+	}
+	output_arr[i * n + j] = sum;
       }
-      output_arr[i * n + j] = sum;
     }
-  }
 
-  return output;
-}
-
-float power(float num, int exp) __attribute__((always_inline));
-inline float power(float num, int exp){
-  bool neg = false; 
-  if (exp < 0) {
-    neg = true;
-    exp = -1 * exp;
+    return output;
   }
 
-  float pow = 1;
-  for (int i = 0; i < exp; i++) {
-    pow = pow * num;
-  }
+  float power(float num, int exp) __attribute__((always_inline));
+  inline float power(float num, int exp){
+    bool neg = false; 
+    if (exp < 0) {
+      neg = true;
+      exp = -1 * exp;
+    }
+
+    float pow = 1;
+    for (int i = 0; i < exp; i++) {
+      pow = pow * num;
+    }
   
-  if(neg)
-    return 1 / pow;
-  else
-    return pow;
-}
+    if(neg)
+      return 1 / pow;
+    else
+      return pow;
+  }
 
-float epow(float x) __attribute__((always_inline));
-inline float epow(float x){
+  float epow(float x) __attribute__((always_inline));
+  inline float epow(float x){
 
-  bool neg = false;
-  if (x < 0) {
-    x = -1 * x;
-    neg = true;
-  }
+    bool neg = false;
+    if (x < 0) {
+      x = -1 * x;
+      neg = true;
+    }
 
-  float sum = 0.0;
-  float fac = 1;
+    float sum = 0.0;
+    float fac = 1;
 
-  //whole number part
-  float pow = 1;
-  for (int i = x; i > 0; i--,x--) {
-    pow = pow * 2.71828;
-  }
-  //x++;
+    //whole number part
+    float pow = 1;
+    for (int i = x; i > 0; i--,x--) {
+      pow = pow * 2.71828;
+    }
+    //x++;
   
-  // First 15 terms in Taylor Series
-  for (int i = 0; i < 15; i++) {
-    sum = sum + power(x, i) / fac;
-    fac = fac * (i + 1);
-  }
+    // First 15 terms in Taylor Series
+    for (int i = 0; i < 15; i++) {
+      sum = sum + power(x, i) / fac;
+      fac = fac * (i + 1);
+    }
 
-  if(neg)
-    return 1 / (sum * pow);
-  else
-    return sum * pow;
-}
+    if(neg)
+      return 1 / (sum * pow);
+    else
+      return sum * pow;
+  }
 
-float tanh(float num) __attribute__((always_inline));
-inline float tanh(float num){
-  float value = epow(2 * num);
-  value = (value - 1) / (value + 1);
-  return value;
-}
+  float custom_tanh(float num) __attribute__((always_inline));
+  inline float custom_tanh(float num){
+    float value = epow(2 * num);
+    value = (value - 1) / (value + 1);
+    return value;
+  }
 
-float max(float v1, float v2) __attribute__((always_inline));
-inline float max(float v1, float v2){
-  if (v1 < v2)
-    return v2;
-  else
-    return v1;
-}
+  float max(float v1, float v2) __attribute__((always_inline));
+  inline float max(float v1, float v2){
+    if (v1 < v2)
+      return v2;
+    else
+      return v1;
+  }
 
-void *tensorReluCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
+  void *tensorReluCPU(void *input_ptr) {
+    Tensor *input = (Tensor *)input_ptr;
 
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-  for (size_t i = 0; i < num_elems; i++) {
-    if (input_data[i] < 0) {
-      input_data[i] = 0;
+    float *input_data = (float *)input->host_data;
+    size_t num_elems = input->num_elems;
+    for (size_t i = 0; i < num_elems; i++) {
+      if (input_data[i] < 0) {
+	input_data[i] = 0;
+      }
     }
+
+    return input;
   }
 
-  return input;
-}
+  void *tensorTanhCPU(void *input_ptr) {
+    Tensor *input = (Tensor *)input_ptr;
 
-void *tensorTanhCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
+    float *input_data = (float *)input->host_data;
+    size_t num_elems = input->num_elems;
+    for (size_t i = 0; i < num_elems; i++) {
+      input_data[i] = custom_tanh(input_data[i]);
+    }
 
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-  for (size_t i = 0; i < num_elems; i++) {
-    input_data[i] = tanh(input_data[i]);
+    return input;
   }
 
-  return input;
-}
-
-void *tensorRelu2CPU(void *input_ptr, float min, float max) {
-  Tensor *input = (Tensor *)input_ptr;
+  void *tensorRelu2CPU(void *input_ptr, float min, float max) {
+    Tensor *input = (Tensor *)input_ptr;
 
-  float *input_data = (float *)input->host_data;
-  size_t num_elems = input->num_elems;
-  for (size_t i = 0; i < num_elems; i++) {
-    if (input_data[i] < min) {
-      input_data[i] = min;
-    }
-    if (input_data[i] > max) {
-      input_data[i] = max;
+    float *input_data = (float *)input->host_data;
+    size_t num_elems = input->num_elems;
+    for (size_t i = 0; i < num_elems; i++) {
+      if (input_data[i] < min) {
+	input_data[i] = min;
+      }
+      if (input_data[i] > max) {
+	input_data[i] = max;
+      }
     }
-  }
 
-  return input;
-}
+    return input;
+  }
 
-void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
-                       int window_width, int vertical_pad, int horizontal_pad,
-                       int vertical_stride, int horizontal_stride) {
+  void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
+			 int window_width, int vertical_pad, int horizontal_pad,
+			 int vertical_stride, int horizontal_stride) {
  
-  Tensor *input = (Tensor *)input_ptr;
-  float *input_data = (float *)input->host_data;
+    Tensor *input = (Tensor *)input_ptr;
+    float *input_data = (float *)input->host_data;
 
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
 
-  int output_height =
+    int output_height =
       1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
-  int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) /
-                          horizontal_stride);
-
-  int center_x = (window_width - 1) / 2 - horizontal_pad;
-  int center_y = (window_height - 1) / 2 - vertical_pad;
-  int x_radius = (window_width - 1) / 2;
-  int y_radius = (window_height - 1) / 2;
-
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
-                                               output_height, output_width);
-  float *output_data = (float *)output->host_data;
-
-  for (int b = 0; b < batch_size; b++) {
-    for (int ch = 0; ch < channels; ch++) {
-      int ii = 0, jj = 0;
-      for (int r = center_y; r < image_height + vertical_pad - y_radius;
-           r += vertical_stride) {
-        for (int c = center_x; c < image_width + horizontal_pad - x_radius;
-             c += horizontal_stride) {
-          float val;
-          if (poolFunction == 0)
-            val = -3.40282e+38; // assuming values never fall below min value of float
-          else
-            val = 0;
-
-          for (int i = r - y_radius, ki = 0; ki < window_height; i++, ki++) {
-            for (int j = c - x_radius, kj = 0; kj < window_width; j++, kj++) {
-              if (i >= 0 && j >= 0 && i < image_height && j < image_width) {
-                if (poolFunction == 0)
-                  val = max(
-                      val,
-                      input_data[b * (channels * image_height * image_width) +
-                                 ch * (image_height * image_width) +
-                                 i * image_width + j]);
-                else
-                  val +=
+    int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) /
+			    horizontal_stride);
+
+    int center_x = (window_width - 1) / 2 - horizontal_pad;
+    int center_y = (window_height - 1) / 2 - vertical_pad;
+    int x_radius = (window_width - 1) / 2;
+    int y_radius = (window_height - 1) / 2;
+
+    Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
+						 output_height, output_width);
+    float *output_data = (float *)output->host_data;
+
+    for (int b = 0; b < batch_size; b++) {
+      for (int ch = 0; ch < channels; ch++) {
+	int ii = 0, jj = 0;
+	for (int r = center_y; r < image_height + vertical_pad - y_radius;
+	     r += vertical_stride) {
+	  for (int c = center_x; c < image_width + horizontal_pad - x_radius;
+	       c += horizontal_stride) {
+	    float val;
+	    if (poolFunction == 0)
+	      val = -3.40282e+38; // assuming values never fall below min value of float
+	    else
+	      val = 0;
+
+	    for (int i = r - y_radius, ki = 0; ki < window_height; i++, ki++) {
+	      for (int j = c - x_radius, kj = 0; kj < window_width; j++, kj++) {
+		if (i >= 0 && j >= 0 && i < image_height && j < image_width) {
+		  if (poolFunction == 0)
+		    val = max(
+			      val,
+			      input_data[b * (channels * image_height * image_width) +
+					 ch * (image_height * image_width) +
+					 i * image_width + j]);
+		  else
+		    val +=
                       input_data[b * (channels * image_height * image_width) +
                                  ch * (image_height * image_width) +
                                  i * image_width + j];
-              }
-            }
-          }
-          if (poolFunction == 1)
-            val /= window_height * window_width;
-
-          output_data[b * (channels * output_height * output_width) +
-                      ch * (output_height * output_width) + ii * output_width +
-                      jj] = val;
-          jj++;
-          if (jj == output_width) {
-            jj = 0;
-            ii++;
-          }
-        }
+		}
+	      }
+	    }
+	    if (poolFunction == 1)
+	      val /= window_height * window_width;
+
+	    output_data[b * (channels * output_height * output_width) +
+			ch * (output_height * output_width) + ii * output_width +
+			jj] = val;
+	    jj++;
+	    if (jj == output_width) {
+	      jj = 0;
+	      ii++;
+	    }
+	  }
+	}
       }
     }
+
+    return output;
   }
 
-  return output;
-}
+  void *tensorSoftmaxCPU(void *input_ptr) {
+    Tensor *input = (Tensor *)input_ptr;
 
-void *tensorSoftmaxCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
+    float *logits = (float *)input->host_data;
 
-  float *logits = (float *)input->host_data;
+    int n = input->dims.dim_sizes[0];
+    int c = input->dims.dim_sizes[1];
 
-  int n = input->dims.dim_sizes[0];
-  int c = input->dims.dim_sizes[1];
+    for (int i = 0; i < n; i++) {
+      float x = 0;
+      for (int j = 0; j < c; j++)
+	x += epow(logits[i * c + j]);
 
-  for (int i = 0; i < n; i++) {
-    float x = 0;
-    for (int j = 0; j < c; j++)
-      x += epow(logits[i * c + j]);
+      for (int j = 0; j < c; j++)
+	logits[i * c + j] = epow(logits[i * c + j]) / x;
+    }
 
-    for (int j = 0; j < c; j++)
-      logits[i * c + j] = epow(logits[i * c + j]) / x;
+    return input;
   }
 
-  return input;
-}
-
-void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
-                           int horizontal_pad, int vertical_stride,
-                           int horizontal_stride, int conv_mode,
-                           int compute_precision) {
+  void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
+			     int horizontal_pad, int vertical_stride,
+			     int horizontal_stride, int conv_mode,
+			     int compute_precision) {
  
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
-
-  float *image = (float *)input->host_data;
-  float *kernels = (float *)filter->host_data;
-
-  int batch_size = input->dims.dim_sizes[0];
-  int channels = input->dims.dim_sizes[1];
-  int image_height = input->dims.dim_sizes[2];
-  int image_width = input->dims.dim_sizes[3];
-  int num_filters = filter->dims.dim_sizes[0];
-  int kernel_height = filter->dims.dim_sizes[2];
-  int kernel_width = filter->dims.dim_sizes[3];
-
-  // kernel centers
-  int center_x = (kernel_width - 1) / 2 - horizontal_pad;
-  int center_y = (kernel_height - 1) / 2 - vertical_pad;
-
-  int x_radius = (kernel_width - 1) / 2;
-  int y_radius = (kernel_height - 1) / 2;
-  int output_height =
+    Tensor *input = (Tensor *)input_ptr;
+    Tensor *filter = (Tensor *)filter_ptr;
+
+    float *image = (float *)input->host_data;
+    float *kernels = (float *)filter->host_data;
+
+    int batch_size = input->dims.dim_sizes[0];
+    int channels = input->dims.dim_sizes[1];
+    int image_height = input->dims.dim_sizes[2];
+    int image_width = input->dims.dim_sizes[3];
+    int num_filters = filter->dims.dim_sizes[0];
+    int kernel_height = filter->dims.dim_sizes[2];
+    int kernel_width = filter->dims.dim_sizes[3];
+
+    // kernel centers
+    int center_x = (kernel_width - 1) / 2 - horizontal_pad;
+    int center_y = (kernel_height - 1) / 2 - vertical_pad;
+
+    int x_radius = (kernel_width - 1) / 2;
+    int y_radius = (kernel_height - 1) / 2;
+    int output_height =
       1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
-  int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
-                          horizontal_stride);
-
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
-                                               output_height, output_width);
-  float *output_data = (float *)output->host_data;
-
-  for (int b = 0; b < batch_size; b++) {
-    for (int f = 0; f < num_filters; f++) {
-      int ii = 0, jj = 0;
-      for (int r = center_y; r < image_height + vertical_pad - y_radius;
-           r += vertical_stride) {
-        for (int c = center_x; c < image_width + horizontal_pad - x_radius;
-             c += horizontal_stride) {
-
-          float sum = 0;
-          for (int ch = 0; ch < channels; ch++) {
-            for (int i = r - y_radius, ki = 0; ki < kernel_height; i++, ki++) {
-              for (int j = c - x_radius, kj = 0; kj < kernel_width; j++, kj++) {
-                if (i >= 0 && j >= 0 && i < image_height && j < image_width) {
-                  sum += image[b * (channels * image_height * image_width) +
-                               ch * (image_height * image_width) +
-                               i * image_width + j] *
-                         kernels[f * (channels * kernel_height * kernel_width) +
-                                 ch * (kernel_height * kernel_width) +
-                                 ki * kernel_width + kj];
-                }
-              }
-            }
-          }
-          output_data[b * (num_filters * output_height * output_width) +
-                      f * (output_height * output_width) + ii * output_width +
-                      jj] = sum;
-          jj++;
-          if (jj == output_width) {
-            jj = 0;
-            ii++;
-          }
-        }
+    int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) /
+			    horizontal_stride);
+
+    Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+						 output_height, output_width);
+    float *output_data = (float *)output->host_data;
+
+    for (int b = 0; b < batch_size; b++) {
+      for (int f = 0; f < num_filters; f++) {
+	int ii = 0, jj = 0;
+	for (int r = center_y; r < image_height + vertical_pad - y_radius;
+	     r += vertical_stride) {
+	  for (int c = center_x; c < image_width + horizontal_pad - x_radius;
+	       c += horizontal_stride) {
+
+	    float sum = 0;
+	    for (int ch = 0; ch < channels; ch++) {
+	      for (int i = r - y_radius, ki = 0; ki < kernel_height; i++, ki++) {
+		for (int j = c - x_radius, kj = 0; kj < kernel_width; j++, kj++) {
+		  if (i >= 0 && j >= 0 && i < image_height && j < image_width) {
+		    sum += image[b * (channels * image_height * image_width) +
+				 ch * (image_height * image_width) +
+				 i * image_width + j] *
+		      kernels[f * (channels * kernel_height * kernel_width) +
+			      ch * (kernel_height * kernel_width) +
+			      ki * kernel_width + kj];
+		  }
+		}
+	      }
+	    }
+	    output_data[b * (num_filters * output_height * output_width) +
+			f * (output_height * output_width) + ii * output_width +
+			jj] = sum;
+	    jj++;
+	    if (jj == output_width) {
+	      jj = 0;
+	      ii++;
+	    }
+	  }
+	}
       }
     }
+
+    return output;
   }
 
-  return output;
+
 }
-- 
GitLab