diff --git a/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
index c0dbd3899b0f6f31e0cb3d58a501aef7771b8281..1aec81d69ba89167489b3d7be161e8baa79b3bb0 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -1243,8 +1243,7 @@ private:
   // HPVM Runtime API and Tensor runtime API
   FunctionCallee llvm_hpvm_initApproxhpvmRt;
   FunctionCallee llvm_hpvm_cleanupApproxhpvmRt;
-  FunctionCallee hpvm_request_tensor;
-
+
   FunctionCallee llvm_hpvm_initializeRuntimeController;
   FunctionCallee llvm_hpvm_clearRuntimeController;
 
@@ -1286,11 +1285,9 @@ void CGT_WrapperAPI::initRuntimeAPI() {
   // Get or insert Global declarations for
   // - initialization
   // - cleanup
-  // - request a tensor
   DECLARE(llvm_hpvm_initApproxhpvmRt);
   DECLARE(llvm_hpvm_cleanupApproxhpvmRt);
-  DECLARE(hpvm_request_tensor);
-
+
   DECLARE(llvm_hpvm_initializeRuntimeController);
   DECLARE(llvm_hpvm_clearRuntimeController);
 
@@ -1403,15 +1400,6 @@ void CGT_WrapperAPI::codeGen(DFLeafNode *N) {
   ConstantInt *TargetDeviceID =
       ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
 
-  for (Function::arg_iterator ai = F_wrapper_api->arg_begin(),
-                              ae = F_wrapper_api->arg_end();
-       ai != ae; ++ai) {
-    Argument *Arg = &*ai;
-    if (Arg->getType()->isPointerTy()) {
-      Value *Args[] = {Arg, TargetDeviceID};
-      CallInst::Create(hpvm_request_tensor, ArrayRef<Value *>(Args, 2), "", FI);
-    }
-  }
 
   CodeGenStateMachine CGM(&M, runtimeModule.get());
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index c318a8fb6aba604282cf709d09b6a6ef1a771f0e..8dfa287afc0b33c654931d038d6743241f72a5ed 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -111,9 +111,10 @@ void *handleTensorConvApproximationTuples_CPU(
     }
     case CPUNodeConfiguration::APPROX::PERFORATION: {
       PerfParams params = perfParamSet->getPerfParams(param);
-      INFO("perforation param = %i\n", param);
-      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-           params.row, params.col, params.skip_offset);
+
+      DEBUG("perforation param = %i\n", param);
+      DEBUG("params.row = %i, params.col = %i, params.skip_offset = %i\n", params.row, params.col, params.skip_offset);
+
       void *t_out;
       RC->resume_profiler();
       t_out = tensorConvApproxCPU(
@@ -131,9 +132,10 @@ void *handleTensorConvApproximationTuples_CPU(
     }
     case CPUNodeConfiguration::APPROX::INPUT_SAMPLING: {
       SampParams params = sampParamSet->getSampParams(param);
-      INFO("sampling param = %i\n", param);
-      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
-           params.skip_offset);
+
+      DEBUG("sampling param = %i\n", param);
+      DEBUG("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, params.skip_offset);
+
       void *t_out;
       RC->resume_profiler();
       t_out = tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w,
@@ -536,9 +538,10 @@ void *handleTensorConvApproximationTuples(
     case GPUNodeConfiguration::APPROX::PERFORATION:
     case GPUNodeConfiguration::APPROX::PERFORATION_HP: {
       PerfParams params = perfParamSet->getPerfParams(param);
-      INFO("perforation param = %i\n", param);
-      INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-           params.row, params.col, params.skip_offset);
+
+      DEBUG("perforation param = %i\n", param);
+      DEBUG("params.row = %i, params.col = %i, params.skip_offset = %i\n", params.row, params.col, params.skip_offset);
+
       void *t_out;
       RC->resume_profiler();
       t_out = tensorConvApproxHalf2(
@@ -557,9 +560,11 @@ void *handleTensorConvApproximationTuples(
     case GPUNodeConfiguration::APPROX::INPUT_SAMPLING:
     case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: {
       SampParams params = sampParamSet->getSampParams(param);
-      INFO("sampling param = %i\n", param);
-      INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+
+      DEBUG("sampling param = %i\n", param);
+      DEBUG("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
            params.skip_offset);
+
       void *t_out;
       RC->resume_profiler();
       t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
index 6e81c7a3fbfbe4cae3cd1c40f43a4c7d5ea2d7c8..4d89d38ad193164027c2e7fde78764df3cdd7a92 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
@@ -33,14 +33,11 @@ struct Tensor {
   int data_type;
   int cur_type;
   int data_format;
-  data_location_t
-      data_placement; // Maintains the location of the tensor {host, device...}
-  cudnnTensorDescriptor_t tensor_desc;
-  cudnnFilterDescriptor_t
-      filter_desc; // FIXIT: Rethink if this should be in tensor struct
-  cudnnTensorDescriptor_t tensor_half_desc;
-  cudnnFilterDescriptor_t
-      filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
+  data_location_t data_placement; // Maintains the location of the tensor {host, device...}
+  cudnnTensorDescriptor_t tensor_desc;
+  cudnnFilterDescriptor_t filter_desc; // FIXIT: Rethink if this should be in tensor struct
+  cudnnTensorDescriptor_t tensor_half_desc;
+  cudnnFilterDescriptor_t filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
   void *host_data;
   void *gpu_data;       // Pointer to GPU FP32 data
   void *gpu_half_data;  // Pointer to GPU FP16 data
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
index d4961d19b9326daa4571d066dfe2b3177f6a78d4..698ab026dac0324cb456f69b4398111ac708412d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
@@ -43,13 +43,13 @@ void dumpWeightsToFile(char *file_name, void *weights_ptr) {
     abort();
   }
 
-  // printf("size_in_bytes = %lu \n", weights->size_in_bytes);
   size_t bytes_written =
       fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
-  // printf("bytes_written = %lu \n", bytes_written);
+
   fclose(fp);
 }
 
+
 void fillTensorWithOnes(void *tensor_ptr) {
 
   struct Tensor *tensor = (struct Tensor *)tensor_ptr;
@@ -201,8 +201,7 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   long int size_in_bytes =
       type_size * dim1_size * dim2_size * dim3_size * dim4_size;
   float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
-  printf("size_in_bytes  = %lu \n", size_in_bytes);
-
+
   int file_header_size = 0;
 
   FILE *file = fopen(file_name, "rb");
@@ -214,21 +213,19 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
-  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
-  // bytes_read);
-
   fclose(file);
 
   struct Tensor *weights = (struct Tensor *)create4DTensor(
       data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
-  // compareValues(weights, tensor_data, num_elems);
+
   free(tensor_data);
 
   return weights;
 }
 
+
 struct Tensor *readInputBatch(const char *file_name, long data_type,
 			      long start, long end,
 			      long dim2_size, long dim3_size, long dim4_size) {
@@ -254,10 +251,8 @@ struct Tensor *readInputBatch(const char *file_name, long data_type,
 
   fclose(file);
 
-  // printf ("FIXED input BATCH read \n");
 
-  struct Tensor *weights = (struct Tensor *)create4DTensor(
-      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+  struct Tensor *weights = (struct Tensor *) create4DTensor(data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
@@ -593,3 +588,4 @@ void dumpOutput(void *output_ptr, const char *file_name) {
 }
 
 #endif
+
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
index 6a3fcc12e014205aaf81e2cae0906ed6cfbff33e..e42b2cbc06d0c51050f8fa858aae349f35e7fa71 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
@@ -218,7 +218,7 @@ void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
                         int horizontal_pad, int vertical_stride,
                         int horizontal_stride, int conv_mode, int conv_groups) {
 
-  INFO("*** TensorConvolution \n");
+  //INFO("*** TensorGroupConvolution \n");
   profileEvent("Conv");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -290,7 +290,7 @@ void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
     hostToDeviceCopy(input);
     hostToDeviceCopy(filter);
 
-    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+    DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
          horizontal_stride);
 
     checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
@@ -386,7 +386,7 @@ void *tensorHalfConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
                             int horizontal_stride, int conv_mode,
                             int conv_groups) {
 
-  INFO("*** TensorHConvolution \n");
+  DEBUG("*** TensorHConvolution \n");
   profileEvent("#Conv");
 
   Tensor *input = (Tensor *)input_ptr;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 0332313c573bcd28215a4277cd788e63a7820b2a..2d0ba3288b6e440046655a0acef9454b335ce55c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -688,7 +688,6 @@ void RuntimeController::readConfigurationFile(const char *str) {
       firstTensorID += NodeConf->getApproxChoices().size();
 
     } else if (tokens[1] == "cpu") {
-      INFO("---------Found cpu configuration\n");
 
       // There must be at least one operation, with an approximation option
       CUSTOM_ASSERT((tokens.size() >= 5) &&
@@ -700,8 +699,7 @@ void RuntimeController::readConfigurationFile(const char *str) {
 
       InitialConfigurations.back().idConfigMap.insert(
           std::make_pair(firstTensorID, NodeConf));
-      INFO("*** firstTensorID = %d \n\n", firstTensorID);
-      INFO("FIXED CPU ID ISSUE\n");
+
       unsigned idx = 2;
       while (idx < tokens.size()) {
         if (tokens[idx] == "add") {
@@ -1193,7 +1191,7 @@ static int num_executations = 0;
 
 float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
-  struct Tensor *result = (struct Tensor *)result_ptr;
+  struct Tensor *result = (struct Tensor *) result_ptr;
 
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
@@ -1206,6 +1204,7 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
     int chosen = 0;
     for (int id = 1; id < num_classes; ++id) {
+      //printf(" check = %f \n ",  data[i * num_classes + id]);
       if (data[i * num_classes + chosen] < data[i * num_classes + id])
         chosen = id;
     }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 89148bd5e05e350c29ac49c2eed0a5b93696d38a..700793529f1a7e1e9d8c887c28e7aefbd9afba93 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -38,6 +38,7 @@
 #include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
 #include "approx_api.h"
+#include "tensor_utils.h"
 
 
 void llvm_hpvm_initTensorRtCPU() {
@@ -56,6 +57,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                                   int vertical_pad, int horizontal_pad,
                                   int vertical_stride, int horizontal_stride,
                                   int conv_mode, int compute_precision) {
+
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
@@ -75,17 +77,15 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                           horizontal_stride);
   int num_filter_elem = kernel_height * kernel_width * channels;
   int output_size = output_width * output_height;
-  printf("--CREATE 4D TENSOR\n");
+
   Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
                                                output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
-  printf("CREATED 4D TENSOR\n");
+
   long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
                             output_width * batch_size;
   float *host_data = (float *)malloc(conv_data_size);
-  printf("host data: %p\n", host_data);
-  printf("conv_data_size: %d\n", conv_data_size);
-  printf("number of batches: %d\n", batch_size);
+
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int b = 0; b < batch_size; b++) {
@@ -131,17 +131,20 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
       }
     }
   }
+
   free(host_data);
-  printf("END: %p\n", output);
+
   return output;
 }
 
-void *tensorRegularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
+void *tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
+						int vertical_pad, int horizontal_pad,
+						int vertical_stride, int horizontal_stride,
+						int conv_mode, int compute_precision,
+						int skip_every, int start) {
+
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *filter = (Tensor *) filter_ptr;
 
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_filter = (float *)filter->host_data;
@@ -260,10 +263,12 @@ void *tensorRegularFilterSamplingConvolutionCPU(
   return output;
 }
 
-void *tensorIrregularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
+void *tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
+						  int vertical_pad, int horizontal_pad,
+						  int vertical_stride, int horizontal_stride,
+						  int conv_mode, int compute_precision,
+						  int skip_every, int start) {
+
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
@@ -666,20 +671,24 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
                           int horizontal_stride, int conv_mode,
                           int compute_precision, int row, int col,
                           int skip_every, int start) {
+
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *filter = (Tensor *) filter_ptr;
+
+  deviceToHostCopy(input);
+  deviceToHostCopy(filter);
+
   if (row > 1) {
-    printf("ROW PERFORATION\n");
     return tensorRowPerfConvolutionCPU(
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, row, start);
   }
   if (col > 1) {
-    printf("COL PERFORATION\n");
     return tensorColPerfConvolutionCPU(
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, col, start);
   }
   if (skip_every > 1) {
-    printf("INPUT FILTERING\n");
     Tensor *filter = (Tensor *)filter_ptr;
 
     const int kernel_height = filter->dims.dim_sizes[2];
@@ -694,7 +703,7 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, skip_every, start);
   }
-  printf("---REGULAR CONV\n");
+
   return tensorRegularConvolutionCPU(
       input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
       horizontal_stride, conv_mode, compute_precision);
@@ -708,6 +717,9 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
+  deviceToHostCopy(input);
+  deviceToHostCopy(filter);
+
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_filter = (float *)filter->host_data;
 
@@ -784,8 +796,12 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
 }
 
 void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
-  Tensor *x = (Tensor *)x_ptr;
-  Tensor *bias = (Tensor *)bias_ptr;
+
+  Tensor *x = (Tensor *) x_ptr;
+  Tensor *bias = (Tensor *) bias_ptr;
+
+  deviceToHostCopy(x);
+  deviceToHostCopy(bias);
 
   float *__restrict__ x_data = (float *)x->host_data;
   float *__restrict__ bias_data = (float *)bias->host_data;
@@ -794,6 +810,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
   int h = x->dims.dim_sizes[2];
   int w = x->dims.dim_sizes[3];
 
+
   if (x->num_elems == bias->num_elems) {
     int const1 = c * h * w;
     int const2 = h * w;
@@ -825,6 +842,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
     }
   }
 
+
   return x;
 }
 
@@ -836,6 +854,8 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
                        int vertical_stride, int horizontal_stride) {
 
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *__restrict__ input_data = (float *)input->host_data;
 
   int batch_size = input->dims.dim_sizes[0];
@@ -853,8 +873,9 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
   int x_radius = (window_width - 1) / 2;
   int y_radius = (window_height - 1) / 2;
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
+  Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels,
                                                output_height, output_width);
+
   float *__restrict__ output_data = (float *)output->host_data;
 
   omp_set_num_threads(4);
@@ -871,10 +892,10 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
           int y_radius_var_max = y_radius_var + image_height;
           int x_radius_var = x_radius - c;
           int x_radius_var_max = x_radius_var + image_width;
-          int ki_min =
-              (y_radius_var > 0)
-                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
-                  : 0;
+          int ki_min = (y_radius_var > 0)
+	    ? ((y_radius_var < window_height) ? y_radius_var : -1)
+	    : 0;
+
           int ki_max = (y_radius_var_max < window_height)
                            ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
                            : window_height;
@@ -930,13 +951,15 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
 }
 
 void *tensorTanhCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
 
+  Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
   omp_set_num_threads(4);
-#pragma omp parallel for
+  #pragma omp parallel for
   for (size_t i = 0; i < num_elems; i++) {
     input_data[i] = tanhf(input_data[i]);
   }
@@ -945,17 +968,21 @@ void *tensorTanhCPU(void *input_ptr) {
 }
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
+
   Tensor *lhs = (Tensor *)lhs_ptr;
   Tensor *rhs = (Tensor *)rhs_ptr;
 
+  deviceToHostCopy(lhs);
+  deviceToHostCopy(rhs);
+
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
+  Tensor *output = (Tensor *) create4DTensor(0, 0, m, n, 1, 1);
 
-  float *__restrict__ lhs_arr = (float *)lhs->host_data;
-  float *__restrict__ rhs_arr = (float *)rhs->host_data;
-  float *__restrict__ output_arr = (float *)output->host_data;
+  float *__restrict__ lhs_arr = (float *) lhs->host_data;
+  float *__restrict__ rhs_arr = (float *) rhs->host_data;
+  float *__restrict__ output_arr = (float *) output->host_data;
 
   int k = 1;
 #pragma unroll 4 // Can we unroll more???
@@ -964,6 +991,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
   }
   float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
   omp_set_num_threads(4);
+
 #pragma omp parallel for simd
   for (int l = 0; l < k; l++) {
     for (int j = 0; j < n; j++) {
@@ -982,47 +1010,71 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
       output_arr[i * n + j] = sum;
     }
   }
+
   free(tran_rhs);
+
   return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
 
-  float *logits = (float *)input->host_data;
+  Tensor *input = (Tensor *) input_ptr;
+
+  deviceToHostCopy(input);
+
+  float *logits = (float *) input->host_data;
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
+
   
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int i = 0; i < n; i++) {
-    float x = 0;
-    for (int j = i * c; j < c + i * c; j++) {
-      logits[j] = expf(logits[j]);
+
+    float max = logits[i * c];
+    for (int k = i * c; k < c + i * c; k++) {
+      if (logits[k] > max) {
+        max = logits[k];
+      }
+    }
+
+    double x = 0;
+    for (int j = i * c; j < c + i * c; j++) {
+      logits[j] = expf(logits[j] - max);
     }
 
 #pragma omp simd reduction(+ : x)
     for (int j = i * c; j < i * c + c; j++) {
       x += logits[j];
     }
-
+
 #pragma omp simd
     for (int j = i * c; j < i * c + c; j++) {
       logits[j] /= x;
     }
+
+    //printf("logits[i * c] = %f \n ", logits[i * c]);
   }
+
   return input;
 }
 
 void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
                          void *mean_ptr, void *variance_ptr, double epsilon) {
 
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *gamma = (Tensor *)gamma_ptr;
-  Tensor *beta = (Tensor *)beta_ptr;
-  Tensor *mean = (Tensor *)mean_ptr;
-  Tensor *variance = (Tensor *)variance_ptr;
-
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *gamma = (Tensor *) gamma_ptr;
+  Tensor *beta = (Tensor *) beta_ptr;
+  Tensor *mean = (Tensor *) mean_ptr;
+  Tensor *variance = (Tensor *) variance_ptr;
+
+  deviceToHostCopy(input);
+  deviceToHostCopy(gamma);
+  deviceToHostCopy(beta);
+  deviceToHostCopy(mean);
+  deviceToHostCopy(variance);
+
+
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_beta = (float *)beta->host_data;
   float *__restrict__ host_gamma = (float *)gamma->host_data;
@@ -1056,7 +1108,10 @@ void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
 }
 
 void *tensorReluCPU(void *input_ptr) {
+
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
@@ -1069,7 +1124,10 @@ void *tensorReluCPU(void *input_ptr) {
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
+
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 253f7614337908e72c82ba986f860dd58c7c9b3f..9f69e54437766fcb8b549b7cbec1510bd6badbd7 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -52,7 +52,7 @@ void *tensorAdd(void *x_ptr, void *bias_ptr) {
   Tensor *x = (Tensor *)x_ptr;
   Tensor *bias = (Tensor *)bias_ptr;
 
-  INFO("*** TensorAdd \n");
+  //INFO("*** TensorAdd \n");
   profileEvent("Add");
 
   float alpha = 1.0f;
@@ -85,7 +85,8 @@ void *tensorConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
                         int horizontal_pad, int vertical_stride,
                         int horizontal_stride, int conv_mode, int conv_groups) {
 
-  INFO("*** TensorConvolution \n");
+
+  //INFO("*** TensorConvolution \n");
   profileEvent("Conv");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -213,7 +214,6 @@ void *tensorPooling(void *input_ptr, int poolFunction, int window_height,
                     int window_width, int vertical_pad, int horizontal_pad,
                     int vertical_stride, int horizontal_stride) {
 
-  INFO("*** TensorPooling \n");
   profileEvent("Pool");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -272,7 +272,7 @@ void *tensorPooling(void *input_ptr, int poolFunction, int window_height,
  * https://gist.github.com/peterwittek/6303527 */
 void *tensorGemmGPU(void *lhs_ptr, void *rhs_ptr) {
 
-  INFO("*** TensorGemmGPU \n");
+  //INFO("*** TensorGemmGPU \n");
   profileEvent("Mul");
 
   Tensor *lhs = (Tensor *)lhs_ptr;
@@ -364,7 +364,7 @@ void *tensorRelu(void *input_ptr) {
 // Think: Should Softmax be broken into multiple IR operations?
 void *tensorSoftmax(void *input_ptr) {
 
-  INFO("*** TensorSoftmax \n");
+  //INFO("*** TensorSoftmax \n");
   profileEvent("Softmax");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -386,7 +386,7 @@ void *tensorSoftmax(void *input_ptr) {
 
 void *tensorRelu2(void *input_ptr, float min, float max) {
 
-  INFO("*** TensorClippedRelu *** \n");
+  //INFO("*** TensorClippedRelu *** \n");
   profileEvent("Relu");
 
   cudnnActivationDescriptor_t reluDesc;
@@ -413,7 +413,7 @@ void *tensorRelu2(void *input_ptr, float min, float max) {
 
 void *tensorTanh(void *input_ptr) {
 
-  INFO("*** TensorTanh \n");
+  //INFO("*** TensorTanh \n");
   profileEvent("Tanh");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -441,7 +441,7 @@ void *tensorTanh(void *input_ptr) {
 void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
                       void *mean_ptr, void *variance_ptr, double epsilon) {
 
-  INFO("*** TensorBatchNorm \n");
+  // INFO("*** TensorBatchNorm \n");
   profileEvent("BatchNorm");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -477,7 +477,7 @@ void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
 // TODO: benchmark performance of tensorSplit
 void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) {
 
-  INFO("*** TensorSplit \n");
+  //INFO("*** TensorSplit \n");
   profileEvent("tensorSplit");
 
   Tensor *tensor = (Tensor *)tensor_ptr;
@@ -533,7 +533,7 @@ void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) {
 
 void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) {
 
-  INFO("*** TensorConcat \n");
+  //INFO("*** TensorConcat \n");
   profileEvent("tensorConcat");
 
   Tensor **tensors = (Tensor **)tensors_ptr;
@@ -595,7 +595,7 @@ void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) {
 void *tensorLRN(void *input_ptr, unsigned int LRN_window, double LRN_alpha,
                 double LRN_beta, double LRN_k) {
 
-  INFO("*** TensorLRN \n");
+  //INFO("*** TensorLRN \n");
   profileEvent("tensorLRN");
 
   Tensor *input = (Tensor *)input_ptr;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index f6bfe700b44c88fea06c6a76267b49af4a523716..4934f5834f07a37d91575a7b5821ec243762b52a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -289,6 +289,7 @@ void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
 
 void *create4DTensor(int data_type, int data_format, size_t dim1_size,
                      size_t dim2_size, size_t dim3_size, size_t dim4_size) {
+
   struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
   size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   allocateMem(tensor, data_type, num_elems);
@@ -308,15 +309,16 @@ void *create4DTensor(int data_type, int data_format, size_t dim1_size,
   set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
                         dim4_size);
 
+  changeTensorPlacement(tensor, HOST);
+
   return tensor;
 }
 
 void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
-  Tensor *tensor = (Tensor *)tensor_ptr;
-
+  Tensor *tensor = (Tensor *) tensor_ptr;
   size_t host_size_in_bytes = tensor->num_elems * 4;
-  // if(tensor->size_in_bytes != size_in_bytes){
+
   if (host_size_in_bytes != size_in_bytes) {
     ERROR("The destination and source sizes don't match");
   }
@@ -330,29 +332,33 @@ void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
 void hostToDeviceCopy(struct Tensor *tensor) {
 
+  DEBUG("** HostToDevice *** \n");
   if (tensor->data_placement != DEVICE) {
     cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
                cudaMemcpyHostToDevice);
     DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
     tensor->data_placement = DEVICE;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Device \n");
   }
 }
 
 void deviceToHostCopy(struct Tensor *tensor) {
 
+  DEBUG("*** DeviceToHost *** ");
   if (tensor->data_placement != HOST) {
     cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
                cudaMemcpyDeviceToHost);
     DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
     tensor->data_placement = HOST;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Host \n");
   }
 }
 
-// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
+
 
 void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
 
@@ -364,7 +370,8 @@ void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
            srcTensor->size_in_bytes);
     DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
     dstTensor->data_placement = HOST;
-  } else if (srcTensor->data_placement == DEVICE) {
+  }
+  else if (srcTensor->data_placement == DEVICE) {
     cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data,
                srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice);
     DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
@@ -382,7 +389,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyDeviceToHost);
       DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
       tensor->data_placement = HOST;
-    } else {
+    }
+    else {
       DEBUG("No data movement required - Data on Host \n");
     }
   }
@@ -394,7 +402,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyHostToDevice);
       DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
       tensor->data_placement = DEVICE;
-    } else {
+    }
+    else {
       DEBUG("No data movement required - Data on Device \n");
     }
   }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
index c2e116a56fbf038628396eeb611711295a4a9170..a972d097bf29963eae34f9e3521ee6b76d1c5a9e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
@@ -75,9 +75,7 @@ extern "C" {
 
 
 
-
-void *
-wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
+void *wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
                   void *bias, int conv_pad_h, int conv_pad_w, int conv_stride_h,
                   int conv_stride_w, int pool_id, int pool_size,
                   int activation_id,
@@ -123,7 +121,6 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
     void *activation_out;
     switch (activation_id) {
     case -1: { // No activation
-      // INFO("No activation Function\n");
       activation_out = add_out;
     } break;
     case 0: { // TanH activation
@@ -204,7 +201,7 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
 
       std::vector< std::pair< CPUNodeConfiguration::TENSOR_OP,
       			std::vector< std::pair<CPUNodeConfiguration::APPROX,
-      					       int> > > > &ApproxChoices = CPUConf->getApproxChoices();
+					       int> > > > &ApproxChoices = CPUConf->getApproxChoices();
       
       // Check for convolution as first operation
       CUSTOM_ASSERT((ApproxChoices.size() >= 1) &&
@@ -230,7 +227,6 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
     	switch (activation_id) {
     	case -1:
     	  { // No activation
-    	    INFO("No activation Function\n");
     	    activation_out = add_out;
     	  }
     	  break;
@@ -325,15 +321,14 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
   return NULL;
 }
 
-void *wrapper_ConvLayer2(
-    const char *hpvm_node_id, void *input, void *filter, void *bias,
-    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
-    int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
-    int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
-    // NOTE: out_min, out_max are only relevant for ClippedRelu
-    float out_min, float out_max) {
+void *wrapper_ConvLayer2(const char *hpvm_node_id, void *input, void *filter, void *bias,
+			 int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+			 int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
+			 int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
+			 // NOTE: out_min, out_max are only relevant for ClippedRelu
+			 float out_min, float out_max) {
 
-  //INFO("*** ------Conv Layer \n");
+  INFO("***  ConvLayer \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
   if (NodeConf->isGPUNodeConfiguration()) {
@@ -345,11 +340,6 @@ void *wrapper_ConvLayer2(
                   std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
         &ApproxChoices = GPUConf->getApproxChoices();
 
-    // printf("*** Convolution \n ApproxChoice = %d \n  BatchNorm = %d \n CONV =
-    // %d \n", ApproxChoices[0].first,
-    //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-    //       GPUNodeConfiguration::TENSOR_OP::CONV);
-
     // Check for convolution as first operation
     CUSTOM_ASSERT(
         (ApproxChoices.size() >= 1) &&
@@ -377,7 +367,6 @@ void *wrapper_ConvLayer2(
     void *activation_out;
     switch (activation_id) {
     case -1: { // No activation
-      // INFO("No activation Function\n");
       activation_out = add_out;
     } break;
     case 0: { // TanH activation
@@ -531,7 +520,6 @@ void *wrapper_ConvLayer2(
        }
 
        void* pool_out;
-
        if (pool_size_v > 0) {
          switch (pool_id) {
          case 0:
@@ -599,6 +587,8 @@ wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
                 // NOTE: out_min and out_max are only relevant for ClippedRelu
                 float out_min, float out_max) {
 
+  INFO("***  DenseLayer \n");
+    
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
   if (NodeConf->isGPUNodeConfiguration()) {
     DEBUG("GPU Configuration for FCLayer\n");
@@ -628,7 +618,6 @@ wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
       CUSTOM_ASSERT(
           (ApproxChoices.size() == 2) &&
           "Incorrect number of operations in provided FC layer configuration");
-      // INFO("No activation Function\n");
       activation_out = add_out;
     } break;
     case 0: { // TanH activation
@@ -739,7 +728,7 @@ wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
 
 void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr) {
 
-  INFO("*** Relu Operation \n");
+  INFO("***  TensorRelu \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
@@ -893,10 +882,8 @@ void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
                               void *gamma_ptr, void *beta_ptr, void *mean_ptr,
                               void *variance_ptr, double epsilon) {
 
-  INFO("*** BatchNorm Operation \n");
-
+  INFO("***  TensorBatchNorm  \n");
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-
   
   if (NodeConf->isGPUNodeConfiguration()) {
       // mapped to GPU - get a GPU configuration
@@ -907,14 +894,8 @@ void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
           std::pair<GPUNodeConfiguration::TENSOR_OP,
                     std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
           &ApproxChoices =
-
               GPUConf->getApproxChoices();
 
-      // printf("*** BatchNorm \n ApproxChoice = %d \n  BatchNorm = %d \n CONV = %d
-      // \n", ApproxChoices[0].first,
-      //	       GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-      //	       GPUNodeConfiguration::TENSOR_OP::CONV);
-
       // Approximation choices must be for a batchnorm operation
       CUSTOM_ASSERT(
           ApproxChoices.size() == 1 &&
@@ -955,6 +936,8 @@ void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
 void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
                         void *bias_ptr) {
 
+  INFO("***  TensorAdd \n");
+  
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
   if (NodeConf->isGPUNodeConfiguration()) {
@@ -981,7 +964,7 @@ void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
   } else if (NodeConf->isCPUNodeConfiguration()) {
       DEBUG("Add operation: CPU Configuration\n");
       // Mapped to CPU - get a CPU configuration
-      CPUNodeConfiguration *CPUConf = (CPUNodeConfiguration *)NodeConf;
+      CPUNodeConfiguration *CPUConf = (CPUNodeConfiguration *) NodeConf;
 
       std::vector< std::pair< CPUNodeConfiguration::TENSOR_OP,
       		    std::vector< std::pair<CPUNodeConfiguration::APPROX,
@@ -994,7 +977,7 @@ void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
       	  "Invalid configuration generated for tensor add wrapper operation");
 
       return handleTensorAddApproximationTuples_CPU(ApproxChoices[0].second,
-				            input_ptr, bias_ptr);
+						    input_ptr, bias_ptr);
   } else {
       ERROR("Unsupported Configuration");
       abort();
@@ -1008,7 +991,7 @@ void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
                             int horizontal_pad, int vertical_stride,
                             int horizontal_stride) {
 
-  INFO("*** TensorPooling Operation \n");
+  INFO("***  TensorPooling  \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
@@ -1084,6 +1067,9 @@ void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
                                      int horizontal_pad, int vertical_stride,
                                      int horizontal_stride, int conv_mode,
                                      int conv_groups) {
+
+  INFO("***  TensorGroupConv  \n");
+  
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
   if (NodeConf->isGPUNodeConfiguration()) {
@@ -1137,8 +1123,9 @@ void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
 }
 
 void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr) {
-  //  return tensorSoftmax(input_ptr);
 
+  INFO("***  TensorSoftmax \n ");
+  
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
   if (NodeConf->isGPUNodeConfiguration()) {