diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
index 6e81c7a3fbfbe4cae3cd1c40f43a4c7d5ea2d7c8..4d89d38ad193164027c2e7fde78764df3cdd7a92 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
@@ -33,14 +33,11 @@ struct Tensor {
   int data_type;
   int cur_type;
   int data_format;
-  data_location_t
-      data_placement; // Maintains the location of the tensor {host, device...}
-  cudnnTensorDescriptor_t tensor_desc;
-  cudnnFilterDescriptor_t
-      filter_desc; // FIXIT: Rethink if this should be in tensor struct
-  cudnnTensorDescriptor_t tensor_half_desc;
-  cudnnFilterDescriptor_t
-      filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
+  data_location_t  data_placement; // Maintains the location of the tensor {host, device...}
+  cudnnTensorDescriptor_t  tensor_desc;
+  cudnnFilterDescriptor_t  filter_desc; // FIXIT: Rethink if this should be in tensor struct
+  cudnnTensorDescriptor_t  tensor_half_desc;
+  cudnnFilterDescriptor_t  filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
   void *host_data;
   void *gpu_data;       // Pointer to GPU FP32 data
   void *gpu_half_data;  // Pointer to GPU FP16 data
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
index 22f202ef039ed45eb0d2690f2c4a35c9b73e4bc5..698ab026dac0324cb456f69b4398111ac708412d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
@@ -201,8 +201,7 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   long int size_in_bytes =
       type_size * dim1_size * dim2_size * dim3_size * dim4_size;
   float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
-  //printf("size_in_bytes  = %lu \n", size_in_bytes);
-
+ 
   int file_header_size = 0;
 
   FILE *file = fopen(file_name, "rb");
@@ -220,12 +219,13 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
       data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
-  // compareValues(weights, tensor_data, num_elems);
+
   free(tensor_data);
 
   return weights;
 }
 
+
 struct Tensor *readInputBatch(const char *file_name, long data_type,
 			      long start, long end,
 			      long dim2_size, long dim3_size, long dim4_size) {
@@ -252,8 +252,7 @@ struct Tensor *readInputBatch(const char *file_name, long data_type,
   fclose(file);
 
 
-  struct Tensor *weights = (struct Tensor *)create4DTensor(
-      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+  struct Tensor *weights = (struct Tensor *) create4DTensor(data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
@@ -589,3 +588,4 @@ void dumpOutput(void *output_ptr, const char *file_name) {
 }
 
 #endif
+
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 9f17a3dc5134e576ec0ba9819821247bc1e5fa6c..2d0ba3288b6e440046655a0acef9454b335ce55c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -1191,7 +1191,7 @@ static int num_executations = 0;
 
 float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
-  struct Tensor *result = (struct Tensor *)result_ptr;
+  struct Tensor *result = (struct Tensor *) result_ptr;
 
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
@@ -1204,6 +1204,7 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
     int chosen = 0;
     for (int id = 1; id < num_classes; ++id) {
+      //printf(" check = %f \n ",  data[i * num_classes + id]);
       if (data[i * num_classes + chosen] < data[i * num_classes + id])
         chosen = id;
     }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 95c17752fd3fd0e441f2c6335591eeb741105db9..dcc6e2e7ce832afc24b560006cb07272f08842cf 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -873,8 +873,9 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
   int x_radius = (window_width - 1) / 2;
   int y_radius = (window_height - 1) / 2;
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
+  Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels,
                                                output_height, output_width);
+  
   float *__restrict__ output_data = (float *)output->host_data;
 
   omp_set_num_threads(4);
@@ -891,10 +892,10 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
           int y_radius_var_max = y_radius_var + image_height;
           int x_radius_var = x_radius - c;
           int x_radius_var_max = x_radius_var + image_width;
-          int ki_min =
-              (y_radius_var > 0)
-                  ? ((y_radius_var < window_height) ? y_radius_var : -1)
-                  : 0;
+          int ki_min = (y_radius_var > 0)
+	    ? ((y_radius_var < window_height) ? y_radius_var : -1)
+	    : 0;
+	  
           int ki_max = (y_radius_var_max < window_height)
                            ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
                            : window_height;
@@ -977,11 +978,11 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
+  Tensor *output = (Tensor *) create4DTensor(0, 0, m, n, 1, 1);
 
-  float *__restrict__ lhs_arr = (float *)lhs->host_data;
-  float *__restrict__ rhs_arr = (float *)rhs->host_data;
-  float *__restrict__ output_arr = (float *)output->host_data;
+  float *__restrict__ lhs_arr = (float *) lhs->host_data;
+  float *__restrict__ rhs_arr = (float *) rhs->host_data;
+  float *__restrict__ output_arr = (float *) output->host_data;
 
   int k = 1;
 #pragma unroll 4 // Can we unroll more???
@@ -1009,26 +1010,35 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
       output_arr[i * n + j] = sum;
     }
   }
+  
   free(tran_rhs);
+  
   return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
 
-  Tensor *input = (Tensor *)input_ptr;
+  Tensor *input = (Tensor *) input_ptr;
 
   deviceToHostCopy(input);
 
-  float *logits = (float *)input->host_data;
+  float *logits = (float *) input->host_data;
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
+
+  // Find the max logit, used below to stabilize softmax numerically.
+  float max = logits[0];
+  for (int i = 1; i < n * c; i++) {
+    if (logits[i] > max)
+      max = logits[i];
+  }
   
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int i = 0; i < n; i++) {
-    float x = 0;
+    double x = 0;
     for (int j = i * c; j < c + i * c; j++) {
-      logits[j] = expf(logits[j]);
+      logits[j] = exp(logits[j] - max); // subtract (not divide by) max: softmax(x - C) == softmax(x)
     }
 
 #pragma omp simd reduction(+ : x)
@@ -1036,11 +1046,14 @@ void *tensorSoftmaxCPU(void *input_ptr) {
       x += logits[j];
     }
 
+    //printf("x = %f \n ", x);
+    
 #pragma omp simd
     for (int j = i * c; j < i * c + c; j++) {
       logits[j] /= x;
     }
   }
+  
   return input;
 }
 
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index f6bfe700b44c88fea06c6a76267b49af4a523716..4934f5834f07a37d91575a7b5821ec243762b52a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -289,6 +289,7 @@ void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
 
 void *create4DTensor(int data_type, int data_format, size_t dim1_size,
                      size_t dim2_size, size_t dim3_size, size_t dim4_size) {
+  
   struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
   size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   allocateMem(tensor, data_type, num_elems);
@@ -308,15 +309,16 @@ void *create4DTensor(int data_type, int data_format, size_t dim1_size,
   set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
                         dim4_size);
 
+  changeTensorPlacement(tensor, HOST);
+  
   return tensor;
 }
 
 void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
-  Tensor *tensor = (Tensor *)tensor_ptr;
-
+  Tensor *tensor = (Tensor *) tensor_ptr;
   size_t host_size_in_bytes = tensor->num_elems * 4;
-  // if(tensor->size_in_bytes != size_in_bytes){
+
   if (host_size_in_bytes != size_in_bytes) {
     ERROR("The destination and source sizes don't match");
   }
@@ -330,29 +332,33 @@ void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
 void hostToDeviceCopy(struct Tensor *tensor) {
 
+  DEBUG("** HostToDevice *** \n");
   if (tensor->data_placement != DEVICE) {
     cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
                cudaMemcpyHostToDevice);
     DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
     tensor->data_placement = DEVICE;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Device \n");
   }
 }
 
 void deviceToHostCopy(struct Tensor *tensor) {
 
+  DEBUG("*** DeviceToHost *** \n");
   if (tensor->data_placement != HOST) {
     cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
                cudaMemcpyDeviceToHost);
     DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
     tensor->data_placement = HOST;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Host \n");
   }
 }
 
-// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
+  
 
 void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
 
@@ -364,7 +370,8 @@ void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
            srcTensor->size_in_bytes);
     DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
     dstTensor->data_placement = HOST;
-  } else if (srcTensor->data_placement == DEVICE) {
+  }
+  else if (srcTensor->data_placement == DEVICE) {
     cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data,
                srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice);
     DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
@@ -382,7 +389,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyDeviceToHost);
       DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
       tensor->data_placement = HOST;
-    } else {
+    }
+    else {
       DEBUG("No data movement required - Data on Host \n");
     }
   }
@@ -394,7 +402,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyHostToDevice);
       DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
       tensor->data_placement = DEVICE;
-    } else {
+    }
+    else {
       DEBUG("No data movement required - Data on Device \n");
     }
   }