diff --git a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
index 7ea0c1dce23cf94385df3089c499338bec281b64..c54dd9ef3bbf6fbd8e75cbb4bc488cb215e580ce 100644
--- a/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -857,7 +857,7 @@ errs() << "TensorII: " << *TensorII << "\n";
         Args.push_back(TensorII->getOperand(7));
     
         // Create wrapper API runtime function call
-        Constant* wrapper_tensorGroupConvolution;
+        Constant* wrapper_tensorGroupConvolution =
           M->getOrInsertFunction(StringRef("wrapper_tensorGroupConvolution"),
             RtM->getFunction(StringRef("wrapper_tensorGroupConvolution"))->getFunctionType());
         CallInst* CI = CallInst::Create(wrapper_tensorGroupConvolution,
@@ -895,9 +895,9 @@ errs() << "TensorII: " << *TensorII << "\n";
         Args.push_back(TensorII->getOperand(3));
         Args.push_back(TensorII->getOperand(4));
         Args.push_back(TensorII->getOperand(5));
-    
+
         // Create wrapper API runtime function call
-        Constant* wrapper_tensorBatchNorm;
+        Constant* wrapper_tensorBatchNorm =
           M->getOrInsertFunction(StringRef("wrapper_tensorBatchNorm"),
             RtM->getFunction(StringRef("wrapper_tensorBatchNorm"))->getFunctionType());
         CallInst* CI = CallInst::Create(wrapper_tensorBatchNorm,
diff --git a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
index cde03bd6d0ffa9969c785e17fe2f708c75396158..33a54cd0de626113e5cf11e2f6a6928d4fa384eb 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
+++ b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
@@ -3,11 +3,9 @@
 export HPVM_TENSOR_RT_HOME=/home/hsharif3/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/
 export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH
 
-clang++ -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
+clang++ -I/software/cuda-9.1/include -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
 llvm-dis --version
 llvm-dis ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
-cp ${HPVM_TENSOR_RT_HOME}/build/libtensor_runtime.a  ${HPVM_TENSOR_RT_HOME}/lib/libtensor_runtime.a
-cp ${HPVM_TENSOR_RT_HOME}/build_autotuner/libtensor_runtime.a  ${HPVM_TENSOR_RT_HOME}/lib/libtensor_autotuner.a
 
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py b/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py
deleted file mode 100644
index e3b94082f5be7b83a1598625afd5ef05a0472506..0000000000000000000000000000000000000000
--- a/llvm/projects/hpvm-tensor-rt/build_pldi/table_generator.py
+++ /dev/null
@@ -1,304 +0,0 @@
-import glob
-import os 
-import subprocess
-import shutil 
-import sys
-
-from collections import defaultdict
-
-'''
-FORMAT
-
-** LayerName NumOpsInLayer <cols>
-OpName Col1Val Col2Val ...
-
-** Conv1 1 h2f_time h2f_energy fp32_time fp32_energy f2h_time f2h_energy fp16_perf_time fp16_perf_energy fp16_time fp16_energy
-Conv1 51.8808 97.2844 319.582 601.966 12.81 18.758 388.092 650.649 340.037 590.664
-
-'''
-
-class TableGenerator: 
-
-    __ops_header_delimiter = "#"
-    __table_header_delimter = "**" 
-    __time_col_name = "time" 
-    __energy_col_name = "energy"
-
-    '''
-    Stores all precision conversions used. 
-    '''
-    precision_conversions = frozenset(["h2f", "f2h"]) 
-
-    def __init__(self, dir_path, iters, profiler_binary_name):
-        '''
-        Args:
-            dir_path:               Path of directory containing network binaries
-            iters:                  Number of iterations to run each binary for
-            profiler_binary_name:   Name of offline profiler binary to run 
-        '''
-        self.__dir_path = dir_path
-
-        # Name of the actual directory 
-        self.__network_name = os.path.split(dir_path)[-1]
-
-        self.__iters = iters 
-        self.__profiler_binary_name = profiler_binary_name
-
-        # Path to results directory 
-        self.__results_dir_path = "%s_results" % self.__dir_path
-
-        # Outputted table file
-        self.__table_filename = "%s_tensors.txt" % self.__network_name
-
-		# Nested default dictionary of default dicts
-        self.__table = self.__build_nested_default_dict()
-
-
-    def generate_table(self):
-        '''
-        Generates a table file called <network_name>_tensors.txt in the following 
-        steps:
-        1. Runs the offline profiler against the inputted binaries to generate
-        results files
-        2. Builds an internal table storing all data from the parsed results files
-        the offline profiler generated
-        3. Writes the internal table to <network_name>_tensors.txt file and uses the 
-        <network_name>_ops.txt file as a guideline in terms of row order 
-        '''
-        self.__run_inputted_binaries()
-        self.__build_internal_table()
-        self.__output_table_to_file()
-
-
-    def __run_inputted_binaries(self):
-        '''
-        Invokes the profiler to run all appropriate binaries (must start with the network 
-        name) in the inputted directory. Result files generated by the profiler are 
-        stored in the results file directory and are named <binary_name>.txt. These results
-        files are then parsed in a later step to generate the table
-        '''
-        if not os.path.isdir(self.__dir_path):
-            print("ERROR: Directory %s not found" % self.__dir_path)
-            exit(1)
-
-        try:
-            os.mkdir(self.__results_dir_path)
-        except OSError:
-            if os.path.isdir(self.__results_dir_path):
-                print("Directory already exists. Clearing directory.")
-                for old_file in glob.glob(os.path.join(self.__results_dir_path, "*")):
-                    os.remove(old_file)
-            else:
-                print("ERROR: Directory doesn't exist but failed to create dir")
-
-        for binary_name in os.listdir(self.__dir_path):
-            binary_path = os.path.join(self.__dir_path, binary_name)
-
-            if not self.__should_execute_file(binary_path):
-                continue
-
-            output_file = os.path.join(self.__results_dir_path, binary_name + ".txt")
-            # No stdout/stderr piping needed for now
-            subprocess.Popen([profiler_binary_name, binary_path, str(self.__iters), \
-                        output_file]).communicate()
-
-
-    def __build_internal_table(self):
-        '''
-        Iterates through each results file generated by the runs of the offline
-        profiler and stores the data in a dictionary in the following format:
-            [operation name][approximation type OR conversion type][time/energy]
-        '''
-        for results_file_name in os.listdir(self.__results_dir_path):
-            # Ignore if it's not a results file
-            if results_file_name == self.__table_filename or \
-                        not results_file_name.startswith(self.__network_name):
-                continue
-
-            approx_type = self.__get_approximation_type(results_file_name)
-            results_file = open(os.path.join(self.__results_dir_path, results_file_name), "r")
-
-            for line in results_file:
-                line = line.strip()
-                op_name, total_time, total_energy = self.__parse_tensor_operation_line(line)
-
-                # If the current operation is f2h or h2f  
-                if any(op_name.endswith(prec_conv) for prec_conv in TableGenerator.precision_conversions):
-                    # Get the original operation name (without the f2h/h2f) and the conversion type 
-                    orig_op_name, conversion_type = self.__get_original_operation_name(op_name)
-
-                    if orig_op_name not in self.__table:
-                        print("ERROR: Conversion found but original %s is not in the table" % orig_op_name)
-                        exit(1)
-
-                    # Store f2h and h2f as columns in the row belonging to the original operation
-                    self.__table[orig_op_name][conversion_type][TableGenerator.__time_col_name] = total_time
-                    self.__table[orig_op_name][conversion_type][TableGenerator.__energy_col_name] = total_energy
-
-                # Create a new row in the dictionary
-                else:
-                    self.__table[op_name][approx_type][TableGenerator.__time_col_name] = total_time
-                    self.__table[op_name][approx_type][TableGenerator.__energy_col_name] = total_energy
-            results_file.close()
-
-
-    def __output_table_to_file(self):
-        '''
-        Outputs the internally stored table to a file using the <network_name>_ops.txt file as
-        a guideline in the following steps:
-        1. Opens the ops file and the file to output the table to
-        2. Reads a line from the ops file (guaranteed to be the layers/NML header)
-        3. For each operation in the layer (or 1 operation if the "layer" is a NML), we store the
-        time and the energy
-        '''
-        table_file_path = os.path.join(self.__results_dir_path, self.__table_filename)
-        soc_operations_file_name = os.path.join("/", "home", "nvidia", "soc_simulator", \
-                        "%s_cifar10" % self.__network_name, "%s_ops.txt" % self.__network_name)
-
-        soc_operations_file = open(soc_operations_file_name, "r")
-        table_file = open(table_file_path, "w")
-
-        curr_line = soc_operations_file.readline().strip()
-
-        while curr_line:
-            # First line is always the layers line (#layer_name,num_ops)
-            layer_name, num_ops = self.__parse_layer_info_line(curr_line)
-
-            # List of strings, where each string is a row corresponding to an operation
-            # in the layer
-            ops_in_layer = []
-
-            # Stores a list of elements in the header, which will be joined into a string
-            # The header is only generated for the first operation in the layer
-            # CRITICAL ASSUMPTION: All operations within a layer have the same # columns
-            # or everything breaks bc the header is per layer, not per operation
-            header = [TableGenerator.__table_header_delimter, layer_name, str(num_ops)]
-
-            # Iterate through all operations within the layer 
-            for op_in_layer_count in range(num_ops):
-                # Contains the operation name 
-                curr_line = soc_operations_file.readline().strip()
-
-                # Stores a list of elements that will be joined to make up a row 
-                curr_op = [curr_line]
-                operation_data = self.__table[curr_line]
-
-                # Iterate through time/energy data for each approximation type corresponding
-                # to the current operation
-                for approx_type in operation_data:
-                    op_time = operation_data[approx_type][TableGenerator.__time_col_name]
-                    op_energy = operation_data[approx_type][TableGenerator.__energy_col_name]
-
-                    curr_op.append(op_time)
-                    curr_op.append(op_energy)
-
-                    if op_in_layer_count == 0:
-                        header.append("%s_time" % approx_type)
-                        header.append("%s_energy" % approx_type)
-
-                ops_in_layer.append(' '.join(curr_op))
-
-            # Getting all operation rows and then writing everything because
-            # calls to write() are slow (memory vs time tradeoff)
-            table_file.write("%s\n%s\n" % (' '.join(header), '\n'.join(ops_in_layer)))
-
-            curr_line = soc_operations_file.readline().strip()
-
-
-    def __should_execute_file(self, file_path):
-        '''
-        Checks if the file at the given file path is a binary that should be run
-        by the profiler. Must exist, be a binary, and must start with the network
-        name as per our naming standards.
-
-        Args:
-            file_path:          Path of the file to check 
-        '''
-        return os.path.isfile(file_path) and os.access(file_path, os.X_OK) and \
-                file_path.find(self.__network_name) != -1
-
-
-    def __get_approximation_type(self, results_filename):
-        '''
-        Parses a given results filename for the approximation type. 
-        Format assumption: <network_name>_<approx_type>.txt
-            
-        Args:
-            results_filename:      Name of results file
-
-        Returns:
-            the approximation technique (ex: fp16) 
-        '''
-        approx_type_start_ind = results_filename.find(self.__network_name) \
-                + len(self.__network_name) + 1 # + 1 to account for _ delimiter
-        approx_type_end_ind = results_filename.find(".txt")
-        return results_filename[approx_type_start_ind : approx_type_end_ind] 
-   
-
-    def __parse_tensor_operation_line(self, tensor_op_line):
-        '''
-        Parses a tensor operation line (within a output file from the offline
-        profiler for the operation name, the total time used, and the total
-        energy used
-
-        Args:
-            tensor_op_line:        Tensor operation line from output file
-
-        Returns:
-            operation name
-            total time used
-            total energy used
-        '''
-        line_as_list = tensor_op_line.split(",")
-        return line_as_list[0], line_as_list[1], line_as_list[2] 
-
-
-    def __build_nested_default_dict(self):
-        '''
-        Builds a nested default dictionary with an arbitrary number of levels
-        '''
-        return defaultdict(self.__build_nested_default_dict)
-
-    def __get_original_operation_name(self, op_name):
-        '''
-        Parses an operation name containing _<conversion type> for the original
-        operation name.
-        Format assumption: <original_op_name>_<conversion type>
-
-        Args:
-            op_name:        Name of the operation
-        
-        Returns:
-            the original operation name 
-        '''
-        underscore_ind = op_name.find("_")
-        return op_name[ : underscore_ind], op_name[underscore_ind + 1 : ]
-
-
-    def __parse_layer_info_line(self, layer_info_line): #layer_name,num_ops
-        '''
-        Parses a layer header (from the original ops.txt file) into the layer name
-        and the number of operations
-        Assumed format: #layer_name,num_ops
-
-        Args:
-            layer_info_line:    Line at the beginning of each layer in the ops file
-
-        Returns:
-            layer name
-            number of ops in the layer
-        '''
-        comma_ind = layer_info_line.find(",")
-        return layer_info_line[layer_info_line.find(TableGenerator.__ops_header_delimiter) + 1 : comma_ind], \
-                    int(layer_info_line[comma_ind + 1 : ])
-
-
-if __name__ == "__main__":
-    if len(sys.argv) != 4:
-        print("python table_generator.py <binary dir path> <num itrs> <profiler bin path>")
-        exit(1)
-    binary_dir_path = sys.argv[1]
-    num_iters = int(sys.argv[2]) 
-    profiler_binary_name = sys.argv[3]
-    table_gen = TableGenerator(binary_dir_path, num_iters, profiler_binary_name)
-    table_gen.generate_table()
diff --git a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py
index 5f5c28032d721dcf1e77ab52407a165c0251deb2..f1f00f4e285fbf487fee03bfee72dbe1a84ea55a 100644
--- a/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py
+++ b/llvm/projects/hpvm-tensor-rt/code_autogenerators/benchmark_testing_automator.py
@@ -27,17 +27,37 @@ def parse_binary_output(proc_output):
     return avg_time
 
 
-# Input: a list of tuples of benchmark names
-# Can change to input a file containing benchmarks to run 
-def run_benchmarks(builds_dir, output_filename, should_print_bin_output = True):
-    output_file = open(output_filename, "w")
+def get_sorted_binaries(builds_dir):
+    # dict of network names to lists of binaries
+    # list of binaries should be in sorted order (can do that when we run the benchmarks)
+    network_bins = defaultdict(list)
     for bin_name in os.listdir(builds_dir):
         if bin_name.find("profiling") == -1:
             continue
-        output_file.write("%s: %s\n" % (bin_name, \
+        network_name = bin_name[ : bin_name.rfind("_")]
+        network_bins[network_name].append(bin_name)
+    return network_bins
+
+# Input: dict mapping network name -> list of profiling binary names (sorted
+# per-network by knob id below), the builds dir, and the output file name
+def run_benchmarks(sorted_bins, builds_dir, output_filename, should_print_bin_output = False):
+    def get_knob_id(bin_name):
+        return int(bin_name[bin_name.rfind("_") + 1 : ])
+
+    output_file = open(output_filename, "w", buffering = 0)
+    for network_name in sorted_bins:
+        # Sort the binaries in order by knob id
+        sorted_bins[network_name].sort(key = get_knob_id)
+        print("--------------------------------------")
+        print(network_name)
+        # Go through all binaries
+        for bin_name in sorted_bins[network_name]:
+            print(bin_name)
+            output_file.write("%s results\n" % bin_name)
+            output_file.write("%s: %s\n" % (bin_name, \
                 parse_binary_output(run_benchmark(os.path.join(builds_dir, bin_name), \
                 should_print_bin_output))))
-        print(bin_name)
+        print("--------------------------------------\n")
     output_file.close()
 
 
@@ -48,4 +68,5 @@ if __name__ == "__main__":
         print("Usage: python online_benchmark_testing_automator.py <builds dir> <outputs_file_name>")
         exit(1)
     print("Output file name: %s" % sys.argv[2])
-    run_benchmarks(sys.argv[1], sys.argv[2])
+    sorted_bins = get_sorted_binaries(sys.argv[1])
+    run_benchmarks(sorted_bins, sys.argv[1], sys.argv[2])
diff --git a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
index 3e48a094b89ac506cf50f712a0d60b1bac95f75d..89c8da90f8ab740062bd84cdd365baa67311a7a4 100644
--- a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
+++ b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
@@ -8,8 +8,8 @@ define void @_Z13dummyFunctionv() #0 {
 entry:
   %initRT = alloca i8*, align 8
   %cleanRT = alloca i8*, align 8
-  %initApproxhpvmRT = alloca i8*, align 8
-  %cleaApproxhpvmRT = alloca i8*, align 8
+  %initApproxRT = alloca i8*, align 8
+  %cleanApproxRT = alloca i8*, align 8
   %initRTController = alloca i8*, align 8
   %cleanRTController = alloca i8*, align 8
   %request_tensorPtr = alloca i8*, align 8
@@ -44,17 +44,18 @@ entry:
   %ConvLayer = alloca i8*, align 8
   %FCLayer = alloca i8*, align 8
   %ConvLayer2 = alloca i8*, align 8
+  %ConvLayer3 = alloca i8*, align 8
   %FCLayer2 = alloca i8*, align 8
   %AddWrapper = alloca i8*, align 8
   %ReluWrapper = alloca i8*, align 8
   %TanhWrapper = alloca i8*, align 8
   %BatchNormWrapper = alloca i8*, align 8
   %PoolingWrapper = alloca i8*, align 8
-  %SoftmaxWrapper = alloca i8*, align 8
+  %softmaxWrapper = alloca i8*, align 8
   store i8* bitcast (void (i32)* @llvm_hpvm_initTensorRt to i8*), i8** %initRT, align 8
   store i8* bitcast (void ()* @llvm_hpvm_cleanupTensorRt to i8*), i8** %cleanRT, align 8
-  store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxhpvmRT, align 8
-  store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleaApproxhpvmRT, align 8
+  store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxRT, align 8
+  store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleanApproxRT, align 8
   store i8* bitcast (void (i8*, i8*)* @llvm_hpvm_initializeRuntimeController to i8*), i8** %initRTController, align 8
   store i8* bitcast (void ()* @llvm_hpvm_clearRuntimeController to i8*), i8** %cleanRTController, align 8
   store i8* bitcast (void (i8*, i32)* @hpvm_request_tensor to i8*), i8** %request_tensorPtr, align 8
@@ -89,13 +90,14 @@ entry:
   store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, i32, i32, i32, i32, i32, i32, float, float, i32)* @ConvLayer_PROMISE to i8*), i8** %ConvLayer, align 8
   store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, float, float, i32)* @FCLayer_PROMISE to i8*), i8** %FCLayer, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float)* @wrapper_ConvLayer to i8*), i8** %ConvLayer2, align 8
+  store i8* bitcast (i8* (i8*, i8*, i8*, i32, i32, i32, i32, i32, i32)* @wrapper_tensorGroupConvolution to i8*), i8** %ConvLayer3, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, float, float)* @wrapper_FCLayer to i8*), i8** %FCLayer2, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*)* @wrapper_tensorAdd to i8*), i8** %AddWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorRelu to i8*), i8** %ReluWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorTanh to i8*), i8** %TanhWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i8*, i8*, double)* @wrapper_tensorBatchNorm to i8*), i8** %BatchNormWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*, i32, i32, i32, i32, i32, i32, i32)* @wrapper_tensorPooling to i8*), i8** %PoolingWrapper, align 8
-  store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %SoftmaxWrapper, align 8
+  store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %softmaxWrapper, align 8
   ret void
 }
 
@@ -175,6 +177,8 @@ declare i8* @FCLayer_PROMISE(i8*, float, float, i8*, float, float, i8*, float, f
 
 declare i8* @wrapper_ConvLayer(i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float) #1
 
+declare i8* @wrapper_tensorGroupConvolution(i8*, i8*, i8*, i32, i32, i32, i32, i32, i32) #1
+
 declare i8* @wrapper_FCLayer(i8*, i8*, i8*, i8*, i32, float, float) #1
 
 declare i8* @wrapper_tensorAdd(i8*, i8*, i8*) #1
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..04336fca2708d5e5d78849e1c12014f5ddbd1ad7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet/op_cost.txt
@@ -0,0 +1,6 @@
+11894784.000000
+39321600.000000
+21233664.000000
+28311552.000000
+18874368.000000
+20480.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a5722f202dde469dca94c71dd9c5fc1cd7aa32b
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/alexnet2/op_cost.txt
@@ -0,0 +1,7 @@
+88473.601562
+943718.375000
+471859.187500
+943718.375000
+471859.187500
+943718.375000
+2048.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..74b1b668e2f27f3ddb77dcac7fff9890c70a6f02
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/lenet/op_cost.txt
@@ -0,0 +1,4 @@
+62720.000000
+1003520.000000
+321126.406250
+1024.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673e704b7e37e19c090e98799189a4411bad9f7c
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet/op_cost.txt
@@ -0,0 +1,28 @@
+88473.601562
+29491.199219
+209715.203125
+14745.599609
+209715.203125
+29491.199219
+419430.406250
+7372.799805
+209715.203125
+14745.599609
+419430.406250
+3686.399902
+209715.203125
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+7372.799805
+419430.406250
+1843.199951
+209715.203125
+3686.399902
+419430.406250
+1024.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7266441905a08c1ef1796dec8ee6c05660998378
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/mobilenet_shallow/op_cost.txt
@@ -0,0 +1,8 @@
+265420.812500
+629145.625000
+629145.625000
+1258291.250000
+629145.625000
+1258291.250000
+629145.625000
+6144.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fdba070cfc5eac559c8384306993fb52a1eb2e04
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/resnet/op_cost.txt
@@ -0,0 +1,22 @@
+44236.800781
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+117964.796875
+235929.593750
+13107.200195
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+117964.796875
+235929.593750
+13107.200195
+235929.593750
+235929.593750
+235929.593750
+235929.593750
+64.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f58ebcc043915d28cf874a1f67e5b2637db1dfc
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar10/op_cost.txt
@@ -0,0 +1,15 @@
+88473.601562
+1887436.750000
+943718.375000
+1887436.750000
+943718.375000
+1887436.750000
+1887436.750000
+943718.375000
+1887436.750000
+1887436.750000
+471859.187500
+471859.187500
+471859.187500
+13107.200195
+256.000000
diff --git a/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8c6daad2e2902e3ac821d99ebbe12e21b6428cc7
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/opentuner/data/vgg16_cifar100/op_cost.txt
@@ -0,0 +1,15 @@
+884736.000000
+18874368.000000
+9437184.000000
+18874368.000000
+9437184.000000
+18874368.000000
+18874368.000000
+9437184.000000
+18874368.000000
+18874368.000000
+4718592.000000
+4718592.000000
+4718592.000000
+131072.000000
+25600.000000
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
index b482cef5377e0f879b43f06a7ebbfbe01b39be09..14dc8f20f2111e85e82630cdbcc0c695a39c5ecd 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h
@@ -72,7 +72,9 @@ public:
     FP32,
     FP16,
     PERFORATION,
-//    INPUT_SAMPLING,
+    INPUT_SAMPLING,
+    REDUCTION_SAMPLING,
+//  ADDITIONAL_APPROXIMATION_METHOD
     APPROX_END
   };
 
@@ -91,6 +93,15 @@ public:
     POOL_MEAN,
     POOL_MIN,
     SOFTMAX,
+    FFT,
+    REDUCE,
+    PROJECTIVE_T,
+    MAP1,
+    MAP2,
+    MAP3,
+//    STENCIL,
+//    COSINE_T,
+//  ADDITIONAL_TENSOR_OPERATION
     TENSOR_OP_END
   };
 
@@ -269,6 +280,24 @@ void GPUNodeConfiguration::print() {
       case TENSOR_OP::SOFTMAX :
         DEBUG("softmax");
         break;
+      case TENSOR_OP::FFT :
+        DEBUG("fft");
+        break;
+      case TENSOR_OP::REDUCE :
+        DEBUG("reduce");
+        break;
+      case TENSOR_OP::PROJECTIVE_T :
+        DEBUG("projectiveT");
+        break;
+      case TENSOR_OP::MAP1 :
+        DEBUG("map1");
+        break;
+      case TENSOR_OP::MAP2 :
+        DEBUG("map2");
+        break;
+      case TENSOR_OP::MAP3 :
+        DEBUG("map3");
+        break;
       default :
         ERROR("Unknown tensor operation.");
         break;
@@ -288,6 +317,12 @@ void GPUNodeConfiguration::print() {
         case APPROX::PERFORATION :
           DEBUG("perf");
           break;
+        case APPROX::INPUT_SAMPLING :
+          DEBUG("input_samp");
+          break;
+        case APPROX::REDUCTION_SAMPLING :
+          DEBUG("red_samp");
+          break;
         default:
           ERROR("Unknown approximation option");
           break;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
index 911f42b955a72cb756aadc1fc78231187ef3394e..21c6df7f1749e891dba257bbb1933c3beefb8c4f 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/hpvm-rt-controller.h
@@ -735,6 +735,30 @@ void RuntimeController::readConfigurationFile(const char *str) {
 	  DEBUG ("Found softmax operation\n");
 	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::SOFTMAX);
 	  idx++;
+	} else if (tokens[idx] == "fft") {
+	  DEBUG ("Found fft operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::FFT);
+	  idx++;
+	} else if (tokens[idx] == "reduce") {
+	  DEBUG ("Found reduce operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::REDUCE);
+	  idx++;
+	} else if (tokens[idx] == "projectiveT") {
+	  DEBUG ("Found projectiveT operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T);
+	  idx++;
+	} else if (tokens[idx] == "map1") {
+	  DEBUG ("Found map1 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP1);
+	  idx++;
+	} else if (tokens[idx] == "map2") {
+	  DEBUG ("Found map2 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP2);
+	  idx++;
+	} else if (tokens[idx] == "map3") {
+	  DEBUG ("Found map3 operation\n");
+	  NodeConf->pushNewTensorOperation(GPUNodeConfiguration::TENSOR_OP::MAP3);
+	  idx++;
 	} else /*Not a new operation. This means an approximation option*/
 	  if (tokens[idx] == "fp32") {
 	    DEBUG("Found fp32 option\n");
@@ -756,6 +780,18 @@ void RuntimeController::readConfigurationFile(const char *str) {
 	    DEBUG("perf parameter: %d\n", perf);
         NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::PERFORATION, perf);
           idx += 2;
+        } else if (tokens[idx] == "input_samp") {
+	    DEBUG("Found input_samp option\n");
+        int input_samp = std::stoi(tokens[idx+1]);
+	    DEBUG("input_samp parameter: %d\n", input_samp);
+        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::INPUT_SAMPLING, input_samp);
+          idx += 2;
+        } else if (tokens[idx] == "red_samp") {
+	    DEBUG("Found red_samp option\n");
+        int red_samp = std::stoi(tokens[idx+1]);
+	    DEBUG("red_samp parameter: %d\n", red_samp);
+        NodeConf->pushNewApproximationChoiceForOperation(GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING, red_samp);
+          idx += 2;
         }
 	// TODO: other approximation options handled here
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu
index 13ef262bac301e35d7c3e6306a3706dfe79a68a2..282a0cbb68de4f033b46cdc5c4a8ad69aa1f20c0 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_utils.cu
@@ -329,12 +329,18 @@ extern "C"{
   void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){
 
     Tensor* tensor = (Tensor*) tensor_ptr;
-  
-    if(tensor->size_in_bytes != size_in_bytes){
+
+    size_t host_size_in_bytes = tensor->num_elems * 4;
+    //if(tensor->size_in_bytes != size_in_bytes){
+    if(host_size_in_bytes != size_in_bytes){
       ERROR("The destination and source sizes don't match");
     }
   
     std::memcpy(tensor->host_data, data_ptr, size_in_bytes);
+
+    changeTensorPlacement(tensor, HOST);
+
+    tensor->cur_type = float_type;
   }
 
 		      
@@ -428,7 +434,7 @@ extern "C"{
 
 
 
-bool ONLINE_PROFILING = false;
+bool ONLINE_PROFILING = false; // true;
 
 
 void convertToFP16(struct Tensor* tensor){
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile
index 4e762ea9894405bb375f518b65c209b4129d9f70..83b4dc9431ee84051def8a0f6850e7f2c194f033 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile
@@ -1,5 +1,6 @@
 DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks
 # NOTE: can configure build directory
+#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/
 HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT)
 
 CC = $(HPVM_BUILD_DIR)/bin/clang++
@@ -15,9 +16,10 @@ APP = alexnet
 TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
 TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
 TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a
+PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a
+SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a
 TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a
 
-
 CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH)  -fno-exceptions -ffast-math -std=c++11 -O3
 CCFLAGS += -DDEVICE=CUDNN_TARGET
 LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lOpenCL
@@ -58,15 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_promise.ll -S -o  $(BUILD_DIR)/$(APP)_promise.visc.ll
 	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce  $(BUILD_DIR)/$(APP)_loop.ll -S -o  $(BUILD_DIR)/$(APP)_loop.visc.ll
 	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP).visc.ll  -o  $(BUILD_DIR)/$(APP)_cudnn.bc
-	$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
-	$(OPT) $(VISC_OPTFLAGS)  $(BUILD_DIR)/$(APP)_loop.visc.ll  -o  $(BUILD_DIR)/$(APP)_loop.bc
+	#$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_promise.bc
+	$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll  -o  $(BUILD_DIR)/$(APP)_wrapperapi.bc
+	$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll  -o  $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc
 	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc
-	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
-	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_linked.bc
-	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
-	$(CC) $(BUILD_DIR)/$(APP)_loop_linked.bc $(TENSOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_linked $(LINKER_FLAGS)
-	#$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS)
+	#$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
+	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc
+	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc
+	$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS)
+	#$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS)
+	$(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS)
 
 $(BUILD_DIR):
 	mkdir -p $@
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
index ee07bdd8f9901f1582d5f7642a2a86c099397a14..d92bc0c45d1115620d529aea4636ece8d3d62127 100644
--- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp
@@ -9,8 +9,10 @@
 #include <tensorTypes.h> 
 #include <tensorUtils.h> 
 
+
+
 void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 5, 5, 1, 1); 
@@ -18,7 +20,7 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -26,7 +28,7 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_2_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -34,7 +36,7 @@ void var_2_node(void* t1, size_t bytes_t1) {
 }
 
 void var_3_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -42,7 +44,7 @@ void var_3_node(void* t1, size_t bytes_t1) {
 }
 
 void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); 
@@ -50,7 +52,7 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -58,7 +60,7 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_6_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -66,7 +68,7 @@ void var_6_node(void* t1, size_t bytes_t1) {
 }
 
 void var_7_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -74,7 +76,7 @@ void var_7_node(void* t1, size_t bytes_t1) {
 }
 
 void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
@@ -82,7 +84,7 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -90,7 +92,7 @@ void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_10_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -98,7 +100,7 @@ void var_10_node(void* t1, size_t bytes_t1) {
 }
 
 void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
@@ -106,7 +108,7 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -114,7 +116,7 @@ void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_13_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -122,7 +124,7 @@ void var_13_node(void* t1, size_t bytes_t1) {
 }
 
 void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); 
@@ -130,7 +132,7 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -138,7 +140,7 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_16_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_tanh(t1); 
@@ -146,7 +148,7 @@ void var_16_node(void* t1, size_t bytes_t1) {
 }
 
 void var_17_node(void* t1, size_t bytes_t1) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(1, t1, 0); 
 
   void* r = __visc__tensor_pool_max(t1, 2, 2, 0, 0, 2, 2); 
@@ -154,7 +156,7 @@ void var_17_node(void* t1, size_t bytes_t1) {
 }
 
 void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_mul(t1, t2); 
@@ -162,7 +164,7 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) {
 }
 
 void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { 
-  __visc__hint(visc::CUDNN_TARGET); 
+  __visc__hint(visc::PROMISE_TARGET); 
   __visc__attributes(2, t1, t2, 0); 
 
   void *r = __visc__tensor_add(t1, t2); 
@@ -177,6 +179,8 @@ void var_20_node(void* t1, size_t bytes_t1) {
   __visc__return(2, r, (size_t) 0); 
 }
 
+
+
 void root(void* input, size_t input_bytes, 
 	  void* conv2d_1_w, size_t conv2d_1_w_bytes, 
 	  void* conv2d_1_b, size_t conv2d_1_b_bytes, 
@@ -371,9 +375,10 @@ int main(){
 
   std::string dir_prefix = std::string("../../../../../../projects/hpvm-tensor-rt/model_params/alexnet_cifar10_test/");
 
-
+  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  //void* input = readTrainedWeights(input_path.c_str(), 0,5000,3,32,32); 
   std::string labels_path =  dir_prefix + std::string("labels32.bin"); 
-  //uint8_t* labels = readLabels(labels_path.c_str(),10000); 
+  uint8_t* labels = readLabels(labels_path.c_str(),5000); 
   std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
   void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); 
   std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
@@ -404,6 +409,8 @@ int main(){
   __visc__init(); 
   RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); 
 
+  //args->input = input; 
+  //args->input_bytes = 0; 
   args->conv2d_1_w = conv2d_1_w; 
   args->conv2d_1_w_bytes = 0; 
   args->conv2d_1_b = conv2d_1_b; 
@@ -429,48 +436,38 @@ int main(){
   args->dense_1_b = dense_1_b; 
   args->dense_1_b_bytes = 0; 
 
-  int batch_size = 500;
-  int test_input_size = 10000;  
-  int batch_count = test_input_size / batch_size;
-
-  std::string input_path =  dir_prefix + std::string("input.bin"); 
+  int batch_size = 500; 
+  int test_input_size = 10000;
+  int batch_count = test_input_size / batch_size; 
+  
   void* input = create4DTensor(0,nchw,batch_size,3,32,32);
 
-  
   startMemTracking();
-  for (int i = 0; i < batch_count; i++){
+  startProfiling();
 
-    int start = i * batch_size; 
-    int end = (i + 1) * batch_size; 
+  for (int i = 0; i < batch_count; i++){
+  
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
 
     copyInputBatch(input_path.c_str(),start,end,3,32,32, input);
-
-    args->input = input; 
+  
+    args->input = input;
     args->input_bytes = 0; 
-
-    //void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32);
-
-    void* dfg = __visc__launch(0, root, (void*) args); 
+  
+    void* dfg = __visc__launch(0, root, (void*) args);
 
     __visc__wait(dfg); 
+  
+    void *result = static_cast<RootIn*>(args)->input;
+    hpvm_request_tensor(result, 0);
+  
+    llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end);
 
-    void *result = static_cast<RootIn*>(args)->input; 
-    hpvm_request_tensor(result, 0); 
-
-
-    uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); 
-
-    computeAccuracy3(labels, result);
-
-    llvm_hpvm_invokeRtControl2(result, labels);
-      
     freeBatchMemory();
   }
-
-
-  __visc__cleanup();
-
-
+  stopProfiling();
+  __visc__cleanup();  
+  
   return 0; 
-
-} 
+}