diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 24f8c7d370a2922ba117eb1ec573dfc6af4742cc..95c17752fd3fd0e441f2c6335591eeb741105db9 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -38,6 +38,7 @@
 #include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
 #include "approx_api.h"
+#include "tensor_utils.h"
 
 
 void llvm_hpvm_initTensorRtCPU() {
@@ -56,6 +57,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                                   int vertical_pad, int horizontal_pad,
                                   int vertical_stride, int horizontal_stride,
                                   int conv_mode, int compute_precision) {
+
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
@@ -135,12 +137,14 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
   return output;
 }
 
-void *tensorRegularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
+void *tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
+						int vertical_pad, int horizontal_pad,
+						int vertical_stride, int horizontal_stride,
+						int conv_mode, int compute_precision,
+						int skip_every, int start) {
+
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *filter = (Tensor *) filter_ptr;
 
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_filter = (float *)filter->host_data;
@@ -259,10 +263,12 @@ void *tensorRegularFilterSamplingConvolutionCPU(
   return output;
 }
 
-void *tensorIrregularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
+void *tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
+						  int vertical_pad, int horizontal_pad,
+						  int vertical_stride, int horizontal_stride,
+						  int conv_mode, int compute_precision,
+						  int skip_every, int start) {
+
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
@@ -665,20 +671,24 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
                           int horizontal_stride, int conv_mode,
                           int compute_precision, int row, int col,
                           int skip_every, int start) {
+
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *filter = (Tensor *) filter_ptr;
+
+  deviceToHostCopy(input);
+  deviceToHostCopy(filter);
+
   if (row > 1) {
-    //printf("ROW PERFORATION\n");
     return tensorRowPerfConvolutionCPU(
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, row, start);
   }
   if (col > 1) {
-    //printf("COL PERFORATION\n");
     return tensorColPerfConvolutionCPU(
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, col, start);
   }
   if (skip_every > 1) {
-    //printf("INPUT FILTERING\n");
     Tensor *filter = (Tensor *)filter_ptr;
 
     const int kernel_height = filter->dims.dim_sizes[2];
@@ -707,6 +717,9 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
+  deviceToHostCopy(input);
+  deviceToHostCopy(filter);
+
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_filter = (float *)filter->host_data;
 
@@ -783,8 +796,12 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
 }
 
 void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
-  Tensor *x = (Tensor *)x_ptr;
-  Tensor *bias = (Tensor *)bias_ptr;
+
+  Tensor *x = (Tensor *) x_ptr;
+  Tensor *bias = (Tensor *) bias_ptr;
+
+  deviceToHostCopy(x);
+  deviceToHostCopy(bias);
 
   float *__restrict__ x_data = (float *)x->host_data;
   float *__restrict__ bias_data = (float *)bias->host_data;
@@ -793,6 +810,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
   int h = x->dims.dim_sizes[2];
   int w = x->dims.dim_sizes[3];
 
+
   if (x->num_elems == bias->num_elems) {
     int const1 = c * h * w;
     int const2 = h * w;
@@ -824,6 +842,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
     }
   }
 
+
   return x;
 }
 
@@ -835,6 +854,8 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
                        int vertical_stride, int horizontal_stride) {
 
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *__restrict__ input_data = (float *)input->host_data;
 
   int batch_size = input->dims.dim_sizes[0];
@@ -929,13 +950,15 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
 }
 
 void *tensorTanhCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
 
+  Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (size_t i = 0; i < num_elems; i++) {
     input_data[i] = tanhf(input_data[i]);
   }
@@ -944,9 +967,13 @@ void *tensorTanhCPU(void *input_ptr) {
 }
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
+
   Tensor *lhs = (Tensor *)lhs_ptr;
   Tensor *rhs = (Tensor *)rhs_ptr;
 
+  deviceToHostCopy(lhs);
+  deviceToHostCopy(rhs);
+
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
 
@@ -963,6 +990,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
   }
   float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
   omp_set_num_threads(4);
+
 #pragma omp parallel for simd
   for (int l = 0; l < k; l++) {
     for (int j = 0; j < n; j++) {
@@ -986,8 +1014,11 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
+
   Tensor *input = (Tensor *)input_ptr;
 
+  deviceToHostCopy(input);
+
   float *logits = (float *)input->host_data;
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
@@ -1016,12 +1047,19 @@ void *tensorSoftmaxCPU(void *input_ptr) {
 void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
                          void *mean_ptr, void *variance_ptr, double epsilon) {
 
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *gamma = (Tensor *)gamma_ptr;
-  Tensor *beta = (Tensor *)beta_ptr;
-  Tensor *mean = (Tensor *)mean_ptr;
-  Tensor *variance = (Tensor *)variance_ptr;
-
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *gamma = (Tensor *) gamma_ptr;
+  Tensor *beta = (Tensor *) beta_ptr;
+  Tensor *mean = (Tensor *) mean_ptr;
+  Tensor *variance = (Tensor *) variance_ptr;
+
+  deviceToHostCopy(input);
+  deviceToHostCopy(gamma);
+  deviceToHostCopy(beta);
+  deviceToHostCopy(mean);
+  deviceToHostCopy(variance);
+
+
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_beta = (float *)beta->host_data;
   float *__restrict__ host_gamma = (float *)gamma->host_data;
@@ -1055,7 +1093,10 @@ void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
 }
 
 void *tensorReluCPU(void *input_ptr) {
+
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
@@ -1068,7 +1109,10 @@ void *tensorReluCPU(void *input_ptr) {
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
+
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;