diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc
index 219ea430b53af9e34629f731f0a1d6bac20a061b..7c9583f291ea908c4c89a8b56045e06585a4f83a 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/lenet_perf.cc
@@ -97,7 +97,7 @@ void testLenetTanh(){
     //				conv_mode, conv_precision, 2, 2, 1);
 
     void* conv1out = tensorConvSampSim(input, conv1_filter, 2, 2, 1, 1,
-    				       conv_mode, conv_precision, 2, 1);
+    				       conv_mode, conv_precision, 4, 0);
 
     // NOTE: For tensorAdd, the only dimension that MUST match is channels  
     tensorAdd(conv1out, conv1_bias); // NOTE: In place operation
@@ -107,8 +107,12 @@ void testLenetTanh(){
     void* conv1_tanh = tensorTanh(pool1out);
 
     // NOTE: input channels have to match between tensor op inputs and outputs 
-    void* conv2out = tensorConvPerfCuda(conv1_tanh, conv2_filter, 2, 2, 1, 1,
-					conv_mode, conv_precision, 1, 2, 1);
+    //void* conv2out = tensorConvPerfCuda(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+    //				conv_mode, conv_precision, 1, 2, 1);
+
+    void* conv2out = tensorConvSampSim(conv1_tanh, conv2_filter, 2, 2, 1, 1,
+				       conv_mode, conv_precision, 2, 0);
+    
     tensorAdd(conv2out, conv2_bias); // NOTE: In place operation
 
     void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2);
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
index 6cc48b0a570e8b2995b75fc5213d7d79431ef6ee..d5a1e903f644c4d27477bac4d8587fb177b58021 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_simulation.h
@@ -238,7 +238,8 @@ void sampleFilterElems(int N,
     int ch = (i % (c * h * w)) / (h * w);
     int n = i / (c * h * w);
 
-    int local_index = row * w + col;
+    //int local_index = row * w + col;
+    int local_index = (ch * (h * w)) + (row * w) + col;
 
     //data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 1.0;