diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu index b97e5beadb7822cce12bdf2ee4d16407cd0483c4..546eb8286390fd5ff8e9bb09bd628a03f14b9f38 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu @@ -2,10 +2,16 @@ // //===----------------------------------------------------------------------===// // -// This file consists of the custom implementation of software approximations -// for tensor convolutions. The approximations implemented are feature sampling -// and perforation for FP32 and FP16 compute precisions. +// This file contains our CUDA-based implementations of convolution approximations // +// *Supported Approximations: Perforated Convolutions, Filter Sampling +// +// FP32 Convolution Routine: `tensorConvApprox` +// FP16 Convolution Routine: `tensorConvApproxHalf2` +// +// NOTE: These approximations are tuned for the NVIDIA Jetson TX2 device +// +// Author: Akash Kothari //===----------------------------------------------------------------------===// #include "tensor_utils.h" @@ -209,6 +215,7 @@ __global__ void convToGemmHalfInputNewIrregular2( const int W_out, const int V_stride, const int H_stride, const int reduced_filter_elem, const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id const int n = tx / (C * H_out * W_out); // output image number const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan @@ -1172,10 +1179,9 @@ convToGemmApprox(float *const __restrict__ output, } /// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is -/// FP32. This routine is invoked by the tuner for tuning approximations for -/// convolutions. -/// +/// with perforation and filter sampling support. The compute precision is FP32. +/// NOTE: This routine is used only for correctness testing +/// NOTE: This is NOT the main approximation routine used by HPVM void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, @@ -1331,8 +1337,6 @@ void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, num_filter_elem, c * h * w); checkCudaErrors(cudaDeviceSynchronize()); - // Do the matrix multiplication - // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] float alpha = 1.0f, beta = 0.0f; checkCudaErrors(cublasSgemmStridedBatched( @@ -1345,7 +1349,6 @@ void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(convData); } - // Event("Conv_end"); //, true); return new_output; } @@ -1364,18 +1367,22 @@ __global__ void switchMatrixFull(int N, int n, int c, int h, int w, } } -/// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is -/// FP32. 
+ +/************* API for Approximation Convolution Implementations ************/ + +/// ** API for FP32 Convolution that supports Baseline (No Approx), Perforation, and Filter Sampling ** +/// - Arguments to control Approximation: +/// `row`: Controls the fraction of rows skipped (Perforation) - (1/row * 100)% rows skipped +/// `col`: Controls the fraction of columns skipped (Perforation) - (1/col * 100)% columns skipped +/// `skip_every`: Controls the fraction of filter elements skipped (Filter Sampling) - (1/skip_every * 100)% filter elems skipped +/// `offset`: Controls the tensor index at which sampling/perforation starts /// +/// For Baseline convolution, pass `row=1`, `col=1`, `skip_every=1` void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, int row, int col, int skip_every, int offset) { - //////INFO("*** TensorConvolution approximation \n"); - // Event("Conv"); - Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; // FIXME: Current hack to preserve backward compatibilty @@ -1386,36 +1393,22 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, hostToDeviceCopy(input); hostToDeviceCopy(filter); - ////Event("H2F_start"); convertToFP32(input); convertToFP32(filter); - ////Event("H2F_end"); const int n = input->dims.dim_sizes[0]; const int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const int h = - (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = - (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + - 1; + const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - ////INFO("batch: %d\n", n); - ////INFO("channels: %d\n", input->dims.dim_sizes[1]); - ////INFO("num_filters: %d\n", c); - ////INFO("kernel height: %d\n", KH); - ////INFO("kernel width: %d\n", KW); - ////INFO("num_filter_elem: %d\n", num_filter_elem); - ////INFO("vertical_stride: %d\n", vertical_stride); - ////INFO("horizontal_stride: %d\n", horizontal_stride); - ////INFO("output height: %d\n", h); - ////INFO("output width: %d\n", w); + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; @@ -1432,8 +1425,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * - /// input->dims.dim_sizes[1] * h_eff * w)); const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; convToGemmPerfRow<<<gridSize, blockSize>>>( @@ -1464,7 +1455,7 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, const int w_eff = w - ((w - offset) / col) - rem_col; Tensor *output = (Tensor *)create4DTensor( - (cudnnDataType_t)float_type, // input->data_type, + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement 
from host to device @@ -1475,8 +1466,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * - /// input->dims.dim_sizes[1] * h * w_eff)); const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; @@ -1494,7 +1483,7 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); - // interpolate + // Interpolate int blocksize = 128; int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; approxInterpolateCol<<<numBlocks, blocksize>>>( @@ -1518,16 +1507,13 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); const int filtBlockSize = 128; - ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); - const int filtGridSize = - (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; const float fac = ((float)skip_every) / ((float)skip_every - 1); - //////INFO("fac: %f\n", fac); const int blockSize = 128; - //////INFO("n * h * w : %d\n", (n * h * w )); const int gridSize = (n * h * w + blockSize - 1) / blockSize; + if (!(KH * KW % skip_every)) { - // ////INFO("REGULAR FILTERING\n"); + createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, @@ -1538,8 +1524,8 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, reduced_filter_elem, skip_every, offset); - } else { - // ////INFO("IRREGULAR FILTERING\n"); + } + else { createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, reduced_filter_elem, skip_every, offset, fac); @@ -1563,7 +1549,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(reducedFilter); } else { - // INFO("FP32 BASELINE\n"); Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(output, DEVICE); @@ -1575,25 +1560,17 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, const int blockSize = 128; const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * - /// input->dims.dim_sizes[1] * h * w)); + convToGemmFullInput<<<gridSize, blockSize>>>( convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, - skip_every, offset); // num_filter_elem); + skip_every, offset); + checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - /* - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h - * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float - *)new_output->gpu_data, h * w, c * h * w, n)); - */ 
+ checkCudaErrors(cublasGemmEx( cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data, @@ -1609,7 +1586,6 @@ void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(convData); } - // Event("Conv_end"); return new_output; } @@ -1628,21 +1604,27 @@ __global__ void switchMatrixHalf(int N, int n, int c, int h, int w, } } -/// This function serves as an API to custom implementation of the -/// half-precision convolution with the perforation and filter sampling -/// support. + + + +/// ** API for FP16 Convolution that supports Baseline (No Approx), Perforation, and Filter Sampling ** +/// - Arguments to control Approximation: +/// `row`: Controls the fraction of rows skipped (Perforation) - (1/row * 100)% rows skipped +/// `col`: Controls the fraction of columns skipped (Perforation) - (1/col * 100)% columns skipped +/// `skip_every`: Controls the fraction of filter elements skipped (Filter Sampling) - (1/skip_every * 100)% filter elems skipped +/// `offset`: Controls the tensor index at which sampling/perforation starts /// +/// For Baseline convolution, pass `row=1`, `col=1`, `skip_every=1` void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int conv_groups, int row, int col, int skip_every, int offset) { - // INFO("*** TensorConvolution half approximation \n"); - // profileEvent("#Conv"); - + Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; @@ -1670,18 +1652,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(new_output, DEVICE); - // INFO("batch: %d\n", n); - // INFO("channels: %d\n", input->dims.dim_sizes[1]); - // INFO("num_filters: %d\n", c); - // INFO("kernel height: %d\n", KH); - // INFO("kernel width: %d\n", KW); - // INFO("num_filter_elem: %d\n", num_filter_elem); - // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); - // INFO("vertical_stride: %d\n", vertical_stride); - // INFO("horizontal_stride: %d\n", horizontal_stride); - // INFO("output height: %d\n", h); - // INFO("output width: %d\n", w); - // INFO("skip_every: %d\n", skip_every); + const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; @@ -1707,7 +1678,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, const int numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; if (h * w <= 64) { - // INFO("H *W <= 64\n"); + convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1730,7 +1701,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, checkCudaErrors(cudaDeviceSynchronize()); } else { - // INFO("H *W > 64\n"); + convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1773,7 +1744,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, const int
numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; if (h * w <= 64) { - // INFO("H *W <= 64\n"); + convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1794,8 +1765,8 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, (__half *)output_half->gpu_half_data, (__half *)new_output->gpu_half_data, col, offset); checkCudaErrors(cudaDeviceSynchronize()); - } else { - // INFO("H *W > 64\n"); + } + else { convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, @@ -1836,18 +1807,15 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; const float fac = ((float)skip_every) / ((float)skip_every - 1); const int blockSize = 256; - // const int gridSize = (n * h * w + blockSize - 1) / blockSize; - // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); - // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); + const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if (c * num_filter_elem < - 500000) { // 250) {//c * reduced_filter_elem < 150000) { + if (c * num_filter_elem < 500000) { if (!(KH * KW % skip_every)) { - // INFO("---REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, fac); @@ -1862,16 +1830,16 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, w, vertical_stride, horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // INFO("---IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - // convToGemmHalfInputIrregular - convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( + + convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, @@ -1891,8 +1859,8 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, changeTensorPlacement(output_half, DEVICE); if (!(KH * KW % skip_every)) { - // INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, fac); @@ -1907,8 +1875,8 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, w, vertical_stride, horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // INFO("IRREGULAR 
FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); @@ -1943,7 +1911,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, cudaFree(convData); cudaFree(reducedFilter); } else { - // INFO("FP16 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); @@ -1955,7 +1923,7 @@ void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, const int blockSize = 256; const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - // convToGemmHalf + convToGemmHalfInputNew<<<gridSize, blockSize>>>( convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW,
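
Usage sketch: a minimal example (not part of the patch itself) of how a caller might invoke the two routines documented in this diff, showing how `row`, `col`, `skip_every`, and `offset` select an approximation. The padding, stride, `conv_mode`, and `conv_groups` values below are illustrative placeholders, and the `input`/`filter` tensors are assumed to have been created through the HPVM tensor runtime (e.g. with `create4DTensor`).

// Hypothetical driver for the approximate convolution APIs documented above.
// `input` and `filter` are assumed to be NCHW tensors owned by the tensor runtime.
void *runApproxConvExamples(void *input, void *filter) {
  // Baseline (exact) FP32 convolution: row = col = skip_every = 1 disables
  // both perforation and filter sampling.
  void *exact = tensorConvApprox(input, filter,
                                 /*vertical_pad=*/1, /*horizontal_pad=*/1,
                                 /*vertical_stride=*/1, /*horizontal_stride=*/1,
                                 /*conv_mode=*/1, /*conv_groups=*/1,
                                 /*row=*/1, /*col=*/1, /*skip_every=*/1, /*offset=*/0);

  // Row perforation: row = 2 skips (1/2 * 100)% = 50% of output rows starting
  // at row index 1; the skipped rows are reconstructed by interpolation.
  void *perforated = tensorConvApprox(input, filter, 1, 1, 1, 1, 1, 1,
                                      /*row=*/2, /*col=*/1, /*skip_every=*/1, /*offset=*/1);

  // FP16 filter sampling: skip_every = 3 drops (1/3 * 100)% of the filter
  // elements (and the matching input patch elements) starting at offset 0.
  void *sampled = tensorConvApproxHalf2(input, filter, 1, 1, 1, 1, 1, 1,
                                        /*row=*/1, /*col=*/1, /*skip_every=*/3, /*offset=*/0);

  (void)exact;
  (void)perforated;
  return sampled;
}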