diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d2fc2c9493453f55cb83094373b19a24b59135d4
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/knobs.txt
@@ -0,0 +1,6 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..063ba473d6a7fa57d7572c86dde9beac0932163d
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/knobs.txt
@@ -0,0 +1,7 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..84103fc103c4e1e4e7cdeec40bc61e2ea4852052
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/global_knobs.txt
@@ -0,0 +1,14 @@
+fp32,11	-1	1.0	tensorConvolution	tensorConvolution
+fp16,12	-1	1.5     tensorConvolution	tensorHalfConvolution
+perf,21 1,2,0	2.25    tensorConvolution	tensorConvPerfCuda
+perf,22 1,2,1   2.25    tensorConvolution	tensorConvPerfCuda
+perf,23 1,3,0	1.88	tensorConvolution	tensorConvPerfCuda
+perf,24	1,3,1   1.88	tensorConvolution	tensorConvPerfCuda
+perf,25	2,1,0   2.25	tensorConvolution	tensorConvPerfCuda
+perf,26	2,1,1   2.25	tensorConvolution	tensorConvPerfCuda
+perf,27	3,1,0   1.88	tensorConvolution	tensorConvPerfCuda
+perf,28	3,1,1   1.88	tensorConvolution	tensorConvPerfCuda
+samp,31	2,0     2.25	tensorConvolution	tensorConvSampSim
+samp,32	2,1     2.25	tensorConvolution	tensorConvSampSim
+samp,33	4,0     1.8	tensorConvolution	tensorConvSampSim
+samp,34	4,1     1.8	tensorConvolution	tensorConvSampSim
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be1ce58c95981535ec94a7f8badffe967cfed586
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/knobs.txt
@@ -0,0 +1,4 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6719acb97a58bd7f3d9fbe428f755e13df98b3d0
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/knobs.txt
@@ -0,0 +1,15 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..719d96e48168a477d6edfee1a02b80b554612ec7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/knobs.txt
@@ -0,0 +1,8 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b7ff033cec2b85390ce6c7667fbbb04837a7eaf9
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/knobs.txt
@@ -0,0 +1,22 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/knobs.txt
@@ -0,0 +1,15 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fb54e7f077eaf27d7182e273fae31a867d8cbb9f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/knobs.txt
@@ -0,0 +1,15 @@
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12,21,22,23,24,25,26,27,28,31,32,33,34
+11,12
+11,12
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
index d5a1e903f644c4d27477bac4d8587fb177b58021..66070f3058d840e4dbe25919e33aa8abc060b330 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
@@ -16,8 +16,7 @@
 #include "op_overheads.h"
 #include "half_precision_api.h"
 #include "approx_techniques2.h"
-
-
+#include <map>  // std::map is used by getPerfParams/getSampParams below
 
 
 
@@ -240,8 +239,6 @@ void sampleFilterElems(int N,
 
     //int local_index = row * w + col;
     int local_index = (ch * (h * w)) + (row * w) + col;
-
-    //data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 1.0;
     
     if(local_index % skip_elem  == skip_offset)
        data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0;
@@ -306,6 +303,10 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
+  convertToFP32(input);
+  convertToFP32(filter);
+
+  
   // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
   sampleFilter(filter, skip_rate, skip_offset);
   
@@ -344,7 +345,7 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
   DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
   Tensor* output;
-  output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+  output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, 
 				      CUDNN_TENSOR_NCHW, n, c, h, w);
   
 
@@ -608,26 +609,58 @@ void* PROMISE_FC(void* input, float i_min, float i_max,
 // NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper
 //#define OLD_MODEL
 
-
 #ifndef OLD_MODEL
 
 
-/***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */
 
-void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
-			void* filter, float w_min, float w_max,
-			void* bias, float b_min, float b_max,
-			int conv_pad_h, int conv_pad_w,
-			int conv_stride_h, int conv_stride_w,
-			int pool_id, int pool_size,
-			int activation_id, // Relu, Tanh, ClipRelu
-			float out_min, float out_max, int swing){ 
+bool isPromise(int swing){
 
+  if(swing < 8)
+    return true;
+  else
+    return false;      
+}
+
+
+bool isFullPrecision(int swing){
+
+  if(swing == 11)
+    return true;
+  else
+    return false;      
+}
+
+
+
+bool isHalfPrecision(int swing){
+
+  if(swing == 12)
+    return true;
+  else
+    return false;      
+}
+
+
+bool isPerforation(int swing){
+
+  if(swing >= 21 && swing <= 29)  /* NOTE(review): only knobs 21-28 are mapped in getPerfParams -- confirm 29 is reserved */
+    return true;
+  else
+    return false;      
+}
+
+
+bool isSampling(int swing){
+
+  if(swing >= 31 && swing <= 39)  /* NOTE(review): only knobs 31-34 are mapped in getSampParams -- confirm 35-39 are reserved */
+    return true;
+  else
+    return false;      
+}
+
+
+int getSwing(int swing){
 
-  if(ONLINE_PROFILING){
-    ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n");
-  }
-  
   #ifdef PROMISE_TUNER_ENABLED
 
   // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
@@ -640,14 +673,142 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
     swing = op_accuracies[op_counter];
     op_counter++;
   }
-  
+
   #endif  
 
+   DEBUG("---- swing_value = %d \n", swing);  
 
+   return swing;
+}
+
+
+
+
+
+
+class PerfParams{
+
+ public:
+  int row;
+  int col;
+  int skip_offset;
+
+  PerfParams(){
+    row = 1;
+    col = 1;
+    skip_offset = 0;
+  }
   
-  DEBUG("---- swing_value = %d \n", swing);  
+  PerfParams(int row1, int col1, int skip_offset1){
+    row = row1;
+    col = col1;
+    skip_offset = skip_offset1;
+  }
+ 		
+};
 
-  if(swing < 8){
+
+
+PerfParams getPerfParams(int swing){
+
+  std::map<int, PerfParams> perf_knob_map;
+
+  PerfParams params21(1, 2, 0);
+  perf_knob_map[21] = params21;
+
+  PerfParams params22(1, 2, 1);
+  perf_knob_map[22] = params22;
+
+  PerfParams params23(1, 3, 0);
+  perf_knob_map[23] = params23;
+
+  PerfParams params24(1, 3, 1);
+  perf_knob_map[24] = params24;
+
+  PerfParams params25(2, 1, 0);
+  perf_knob_map[25] = params25;
+
+  PerfParams params26(2, 1, 1);
+  perf_knob_map[26] = params26;
+
+  PerfParams params27(3, 1, 0);
+  perf_knob_map[27] = params27;
+
+  PerfParams params28(3, 1, 1);
+  perf_knob_map[28] = params28;
+
+  
+  return perf_knob_map[swing];  // NOTE: operator[] default-constructs PerfParams(1,1,0) for unmapped swing values
+  
+}
+
+
+
+
+class SampParams{
+
+ public:  
+  int skip_rate;
+  int skip_offset;
+
+  SampParams(){
+    skip_rate = 1;
+    skip_offset = 0;
+  }
+  
+  SampParams(int skip_rate1, int skip_offset1){
+    skip_rate = skip_rate1;
+    skip_offset = skip_offset1;
+  }
+ 		
+};
+
+
+
+SampParams getSampParams(int swing){
+
+  std::map<int, SampParams> samp_knob_map;
+
+  SampParams params31(2, 0);
+  samp_knob_map[31] = params31;
+
+  SampParams params32(2, 1);
+  samp_knob_map[32] = params32;
+
+  SampParams params33(4, 0);
+  samp_knob_map[33] = params33;
+
+  SampParams params34(4, 1);
+  samp_knob_map[34] = params34;
+
+  return samp_knob_map[swing];  // NOTE: operator[] default-constructs SampParams(1,0) for unmapped swing values
+  
+}
+
+
+
+
+
+
+/***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */
+
+void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
+			void* filter, float w_min, float w_max,
+			void* bias, float b_min, float b_max,
+			int conv_pad_h, int conv_pad_w,
+			int conv_stride_h, int conv_stride_w,
+			int pool_id, int pool_size,
+			int activation_id, // Relu, Tanh, ClipRelu
+			float out_min, float out_max, int swing){ 
+
+  if(ONLINE_PROFILING){
+    ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n");
+  }
+  
+
+  swing = getSwing(swing);  
+ 
+  if(isPromise(swing)){
     
     return PROMISE_Conv(input, i_min, i_max,
 			filter, w_min, w_max,
@@ -662,47 +823,52 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
   
   
   void* conv_out;
-  if(swing >= 21 && swing <= 29){
+  if(isPerforation(swing)){  // NOTE(review): the disjoint ifs below leave conv_out uninitialized for unmapped swing -- confirm getSwing output range
  
-    int rows = 2;
-    int cols = 1;
+    PerfParams params = getPerfParams(swing);
+    DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n",
+	  params.row, params.col, params.skip_offset);
     
-    switch(swing){
-
-      // Check if col/row == 1 in baseline?  
-    case 21: rows = 2; cols = 1; break;
-    case 22: rows = 2; cols = 1; break;
-    case 23: rows = 1; cols = 2; break;
-    case 24: rows = 1; cols = 2; break;    
-      //case 25: rows = 2; cols = 2; break;
-      //case 26: rows = 2; cols = 2; break;    	   
-      //default: rows = 2; cols = 2; break;
-    }
+    conv_out = tensorConvPerfCuda(input, filter,
+				  conv_pad_h, conv_pad_w,
+				  conv_stride_h, conv_stride_w, 1, 1,
+				  params.row, params.col, params.skip_offset);
 
+  }
 
-    // FIXIT: See if start is from 0 or 1?
-    int start = (swing % 2) + 1;
-    conv_out = tensorConvPerfCuda(input, filter, conv_pad_h, conv_pad_w,
-				  conv_stride_h, conv_stride_w, 1, 1,
-				  rows, cols, start);
+  if(isSampling(swing)){
+ 
+    SampParams params = getSampParams(swing);
+    DEBUG("params.skip_rate = %d, params.skip_offset = %d \n",
+	  params.skip_rate, params.skip_offset);
+    
+    conv_out = tensorConvSampSim(input, filter,
+				 conv_pad_h, conv_pad_w,
+				 conv_stride_h, conv_stride_w, 1, 1,
+				 params.skip_rate, params.skip_offset);
 
   }
-  else if(swing == 12){  
+  
+
+  if (isHalfPrecision(swing)){
+
     conv_out = tensorHalfConvolution(input, filter,
 				     conv_pad_h, conv_pad_w,
 				     conv_stride_h, conv_stride_w,
 				     1, 0);
   }
-  else{
+
+  if (isFullPrecision(swing)){
     conv_out = tensorConvolution(input, filter,
 				 conv_pad_h, conv_pad_w,
 				 conv_stride_h, conv_stride_w,
 				 1, 0);
   }
+
   
   void* conv_add;
   if(bias != NULL){
-    if(swing >= 12){  
+    if( !isFullPrecision(swing) ){  
       conv_add = tensorHalfAdd(conv_out, bias);
     }
     else{
@@ -714,10 +880,9 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
   }
 
   void* pool_out;
-  // NOTE: Skip pooling on negative pool sizes
   if(pool_size > 0){
     //FIXME: Currently only using MaxPooling
-    pool_out = tensorPooling(conv_add, 0, pool_size, pool_size,
+    pool_out = tensorHalfPooling(conv_add, 0, pool_size, pool_size, // NOTE(review): half pooling applied even on fp32 path (swing==11) -- confirm intended
 				 0, 0, pool_size, pool_size);
   }
   else{
@@ -731,13 +896,13 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
     INFO("NO Activation Function \n");
     break;
   case 0:
-    activation_out = tensorTanh(pool_out);
+    activation_out = tensorHalfTanh(pool_out);
     break;
   case 1:
-    activation_out = tensorRelu(pool_out);
+    activation_out = tensorHalfRelu(pool_out);
     break;
   case 2:
-    activation_out = tensorRelu2(pool_out, out_min, out_max);
+    activation_out = tensorHalfRelu2(pool_out, out_min, out_max);
     break;
   default:
     ERROR("Activation id %d NOT supported \n", activation_out);
@@ -756,25 +921,9 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
 		      float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu'
 
 
+  swing = getSwing(swing);
   
-  #ifdef PROMISE_TUNER_ENABLED
-
-  // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
-  if(!approxhpvm_runtime_mode){
-
-    if(op_counter >= total_ops){
-      ERROR("No accuracy flag found \n");
-    }
-  
-    swing = op_accuracies[op_counter];
-    op_counter++;
-  }
-  
-  #endif
-
-
-  
-  if(swing < 8){
+  if(isPromise(swing)){
 
     return PROMISE_FC(input, i_min, i_max,
 		      weights, w_min, w_max,
@@ -786,7 +935,7 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
 
   
   void* gemm_out;
-  if(swing >= 12){
+  if(!isFullPrecision(swing)){
     gemm_out = tensorHalfGemm(input, weights);
   }
   else{
@@ -797,7 +946,7 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
   void* gemmbias_out;
   if(bias != NULL){
     // Swing 8 corresponds to FP32
-    if(swing >= 12){
+    if(!isFullPrecision(swing)){
       gemmbias_out = tensorHalfAdd(gemm_out, bias);
     }
     else{
@@ -816,13 +965,13 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
     INFO("No Activation Function \n");
     break;
   case 0:
-    activation_out = tensorTanh(gemmbias_out);
+    activation_out = tensorHalfTanh(gemmbias_out);
     break;
   case 1:
-    activation_out = tensorRelu(gemmbias_out);
+    activation_out = tensorHalfRelu(gemmbias_out);
     break;
   case 2:
-    activation_out = tensorRelu2(gemmbias_out, out_min, out_max);
+    activation_out = tensorHalfRelu2(gemmbias_out, out_min, out_max);
     break;
   default:
     ERROR("Activation id %d NOT supported \n", activation_out);
@@ -840,8 +989,20 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
 
 #ifdef OLD_MODEL
 
+#endif
+
+#endif 
+
+
+
 /************* NOTE: Outdated PROMISE routines - Used for Comparison ****/
 
+  
+
+
+/*
+
+
 
 void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
 			void* filter, float w_min, float w_max,
@@ -916,6 +1077,7 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
     DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
     */
 
+/*-------------
   }
   else if(swing == 9 || (swing >= 16 && swing <= 19) ){
     //conv_out = tensorConvPerf(input, filter, conv_pad_h, conv_pad_w,
@@ -948,6 +1110,8 @@ void* ConvLayer_PROMISE(void* input, float i_min, float i_max,
     DEBUG("\n-------- l2_norm = %f \n", norms->l2_norm); 
     */
 
+/*-----
+
   }
   else if(swing == 10){  
     conv_out = tensorHalfConvolution(input, filter,
@@ -1146,3 +1310,21 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max,
 
 
 
+
+  /*#ifdef PROMISE_TUNER_ENABLED
+
+  // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime
+  if(!approxhpvm_runtime_mode){
+
+    if(op_counter >= total_ops){
+      ERROR("No accuracy flag found \n");
+    }
+  
+    swing = op_accuracies[op_counter];
+    op_counter++;
+  }
+  
+  #endif
+
+  */
+
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
index af0ed1e202017dde2cb96e9f8798aff1219c0695..9689c6fce91d3a4093d91b5006ef1beee969f8eb 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques.h
@@ -819,7 +819,7 @@ __global__ void depthwise_conv4_half3(__half* const __restrict__ y,
 
 				__half t1;
 
-				int total = C_dim * H_dim * W_dim;
+				//int total = C_dim * H_dim * W_dim;
 				t1 = xdata[(m - bstartm) * H_dim * W_dim + (start_h + p - bstart_h) * W_dim +
 					start_w + q - bstart_w];
 
@@ -920,7 +920,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 			int vertical_stride, int horizontal_stride,
 			int conv_mode, int conv_groups){
 
-  llvm_hpvm_initTensorRt(0);
 
   INFO("*** TensorConvolution \n");
   profileEvent("Conv");
@@ -935,7 +934,13 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 
   Tensor* output;
 
-	
+  hostToDeviceCopy(input);
+  hostToDeviceCopy(filter);
+
+  convertToFP32(input);
+  convertToFP32(filter);
+
+  
   if (conv_groups > 32) {
     // TODO: Support other cases;  
     hostToDeviceCopy(input);
@@ -949,7 +954,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1;
     w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1;
 
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -957,33 +962,6 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     // NOTE: Necessary to insert the above call for every output tensor
 
 
-    /*
-      if (c > 255) {
-      dim3 grid((n / 16), c);
-      dim3 block(h * w);
-      depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-
-      }*/
-
-    /*
-      dim3 grid((n / 12), c);
-      dim3 block(h * w);
-      depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
-      if(n % 12 > 0){ 
-      dim3 grid2((n % 12), c);
-      dim3 block(h * w);
-      depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
-      (float*)input->gpu_data, (float*)filter->gpu_data,
-      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
-      }
-    */
 		
     int blockSize;
     blockSize = 64;
@@ -994,7 +972,8 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 					 (float*)input->gpu_data, (float*)filter->gpu_data,
 					 input->dims.dim_sizes[0], input->dims.dim_sizes[1],
 					 input->dims.dim_sizes[2], input->dims.dim_sizes[3],
-					 KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+					 KH, KW, h, w, vertical_pad, horizontal_pad,
+					 vertical_stride, horizontal_stride);
 
   }
   else {
@@ -1043,11 +1022,11 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
     DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
 
     if (input->data_format == CUDNN_TENSOR_NCHW)
-      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 				       CUDNN_TENSOR_NCHW, n, c, h, w);
     else if (input->data_format == CUDNN_TENSOR_NHWC) {
       DEBUG("* NHWC Format \n");
-      output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+      output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				       CUDNN_TENSOR_NHWC, n, h, w, c);
     }
     else
@@ -1137,6 +1116,7 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr,
 
 }
 
+// FIXME: Need to properly fix the new HALF type conversion
 void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
 			    int vertical_pad, int horizontal_pad,
 			    int vertical_stride, int horizontal_stride,
@@ -1165,6 +1145,9 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
 
+  convertToFP16(input);
+  convertToFP16(filter);
+  
 
   /***** CONVERSIONS from FP32 to FP16 - on the GPU */
   size_t* input_dims = input->dims.dim_sizes;
@@ -1209,7 +1192,7 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr,
     DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
     
 
-    output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+    output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type,
 				      CUDNN_TENSOR_NCHW, n, c, h, w);
     // FIXIT: more checks for data types needed
     output_half = (Tensor*) create4DTensor(CUDNN_DATA_HALF,
@@ -1797,7 +1780,7 @@ void* tensorConvPerf(void* input_ptr, void* filter_ptr,
 
   Tensor* new_output;
   if(input->data_format == CUDNN_TENSOR_NCHW)
-    new_output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 					  CUDNN_TENSOR_NCHW, n, c, h, w);
   else if(input->data_format == CUDNN_TENSOR_NHWC){
     DEBUG("* NHWC Format \n");
@@ -2078,3 +2061,32 @@ void* tensorConvolutionKernelSamp(void* input_ptr, void* filter_ptr,
       #endif
   return output;
 }
+
+
+    /*
+      if (c > 255) {
+      dim3 grid((n / 16), c);
+      dim3 block(h * w);
+      depthwise_conv << <grid, block >> > ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+
+      }*/
+
+    /*
+      dim3 grid((n / 12), c);
+      dim3 block(h * w);
+      depthwise_conv12 <<<grid, block >>> ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride);
+      if(n % 12 > 0){ 
+      dim3 grid2((n % 12), c);
+      dim3 block(h * w);
+      depthwise_conv <<<grid, block >>> ((float*)output->gpu_data,
+      (float*)input->gpu_data, (float*)filter->gpu_data,
+      input->dims.dim_sizes[0], input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3],
+      KH, KW, h, w, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, 12 * (n/12));
+      }
+    */
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
index f66b80d7287611c44151eb5fa02b4fed3e7d4e7b..a81ffe296233178126555bbb53babdcd4192a7bf 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h
@@ -1,4 +1,6 @@
 
+#include "tensor_utils.cu"
+
 
 
 //This skips every xth row
@@ -173,10 +175,17 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
   if (conv_groups == 0) {
     conv_groups = 1;
   }
+  
   Tensor* output;
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
+
+  
+  convertToFP32(input);
+  convertToFP32(filter);
+  
+  
   int n, c, h, w; // output dimensions
   n = input->dims.dim_sizes[0];
   c = filter->dims.dim_sizes[0]; //number of filters
@@ -196,7 +205,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 
   Tensor *new_output;
   if(row > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h_eff, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -231,7 +240,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 					      (float *)output->gpu_data, h_eff * w, c * h_eff * w,
 					      n));
 
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 					 CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
@@ -239,7 +248,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
     //interpolate
     int numBlocks = (n * c * h * w  + 127) / 128;
     approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w,
-					    (float *)output->gpu_data, (float *)new_output->gpu_data,
+					    (float *) output->gpu_data, (float *) new_output->gpu_data,
 					    row, start);
     cudaDeviceSynchronize();
 
@@ -247,7 +256,8 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
   }
   else if(col > 1){
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w_eff);
 
     // NOTE: Changing output tensor placement from host to device
@@ -282,7 +292,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 					      (float *)output->gpu_data, h * w_eff, c * h * w_eff,
 					      n));
 
-    new_output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 					 CUDNN_TENSOR_NCHW, n, c, h, w);
     // NOTE: Changing output tensor placement from host to device
     changeTensorPlacement(new_output, DEVICE);
@@ -298,7 +308,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
     cudaFree(convData);
   }
   else{
-    output = (Tensor*)create4DTensor((cudnnDataType_t)input->data_type,
+    output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type,
 				     CUDNN_TENSOR_NCHW, n, c, h, w);
 
     // NOTE: Changing output tensor placement from host to device
@@ -336,19 +346,7 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr,
 
 
   profileEvent("Conv_end", true);
-        #ifdef ERROR_INJECTION_ENABLED
-  if (op_counter >= total_ops) {
-    ERROR("No accuracy flag found \n");
-  }
-  int op_acc = op_accuracies[op_counter];
-  // Skip errorInjection if explicitly requested
-  if (skip_tensors.find(op_counter) != skip_tensors.end()) {
-    op_acc = 0;
-  }
-  void* error_norms = tensorAddError(output, op_acc);
-  add_norms(error_norms, "tensorConv", op_acc);
-  add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc);
-  op_counter++;
-        #endif
+ 
+  
   return new_output;
 }