diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index dcc6e2e7ce832afc24b560006cb07272f08842cf..700793529f1a7e1e9d8c887c28e7aefbd9afba93 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -1026,32 +1026,34 @@ void *tensorSoftmaxCPU(void *input_ptr) {
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
 
-  float max = logits[0];
-  for (unsigned int i = 0; i < n * c; i++){
-    if (logits[i] > max){
-      max = logits[i];
-    }
-  }
   
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int i = 0; i < n; i++) {
+
+    float max = logits[i * c];
+    for (unsigned int k = i * c; k < c + i * c; k++){
+      if (logits[k] > max){
+        max = logits[k];
+      }
+    }
+  
     double x = 0;
-    for (int j = i * c; j < c + i * c; j++) {
-      logits[j] = exp(logits[j] / max );
+    for (int j = i * c; j < c + i * c; j++) {   
+      logits[j] = exp( logits[j] - max );
     }
 
 #pragma omp simd reduction(+ : x)
     for (int j = i * c; j < i * c + c; j++) {
       x += logits[j];
     }
-
-    //printf("x = %f \n ", x);
     
 #pragma omp simd
     for (int j = i * c; j < i * c + c; j++) {
       logits[j] /= x;
     }
+
+    //printf("logits[i * c] = %f \n ", logits[i * c]);
   }
   
   return input;