merging

4be5f914 · Hashim Sharif · 2a3e6579 · 0ffea7f3 · 4be5f914 · 4be5f914
Commit 4be5f914 authored 5 years ago by Hashim Sharif
--- a/llvm/projects/pred_tuner/models/torch/shufflenet.py
+++ b/llvm/projects/pred_tuner/models/torch/shufflenet.py
+'''ShuffleNet in PyTorch.
+
+See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ShuffleBlock(nn.Module):
+    '''Parameter-free layer that interleaves the channels of `groups` channel groups.'''
+
+    def __init__(self, groups):
+        super(ShuffleBlock, self).__init__()
+        # Number of groups the channel axis is divided into before interleaving.
+        self.groups = groups
+
+    def forward(self, x):
+        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
+        batch, channels, height, width = x.size()
+        per_group = channels // self.groups
+        grouped = x.view(batch, self.groups, per_group, height, width)
+        # Swap the group axis with the within-group axis, then flatten them back.
+        swapped = grouped.permute(0, 2, 1, 3, 4)
+        return swapped.reshape(batch, channels, height, width)
+
+
+class Bottleneck(nn.Module):
+    '''ShuffleNet unit: grouped 1x1 conv, channel shuffle, depthwise 3x3 conv, grouped 1x1 conv.
+
+    Args:
+        in_planes: number of input channels.
+        out_planes: number of channels produced by the convolution branch.
+        stride: stride of the depthwise 3x3 convolution. With stride 2 the
+            input is average-pooled and concatenated onto the branch output,
+            so the block emits in_planes + out_planes channels; otherwise the
+            input is added to the branch output.
+        groups: group count of the 1x1 convolutions.
+    '''
+    def __init__(self, in_planes, out_planes, stride, groups):
+        super(Bottleneck, self).__init__()
+        self.stride = stride
+
+        # Floor division keeps the channel count an int. Under Python 3,
+        # `out_planes/4` is a float, which nn.Conv2d/nn.BatchNorm2d reject.
+        mid_planes = out_planes // 4
+        # The first 1x1 conv is left ungrouped when the input has 24 channels.
+        g = 1 if in_planes == 24 else groups
+        self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False)
+        self.bn1 = nn.BatchNorm2d(mid_planes)
+        self.shuffle1 = ShuffleBlock(groups=g)
+        # groups=mid_planes makes this 3x3 convolution depthwise.
+        self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False)
+        self.bn2 = nn.BatchNorm2d(mid_planes)
+        self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes)
+
+        # Identity shortcut by default; a stride-2 block pools the input so
+        # its spatial size matches the downsampled branch output.
+        self.shortcut = nn.Sequential()
+        if stride == 2:
+            self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1))
+
+    def forward(self, x):
+        '''Run the conv branch and merge it with the shortcut (concat for stride 2, sum otherwise).'''
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.shuffle1(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = self.bn3(self.conv3(out))
+        res = self.shortcut(x)
+        if self.stride == 2:
+            return F.relu(torch.cat([out, res], 1))
+        return F.relu(out + res)
+
+
+class ShuffleNet(nn.Module):
+    '''ShuffleNet classifier: 24-channel 1x1 stem, three Bottleneck stages, 10-way linear head.
+
+    `cfg` is a dict with keys 'out_planes' (channels after each of the three
+    stages), 'num_blocks' (Bottleneck count per stage) and 'groups'.
+    '''
+    def __init__(self, cfg):
+        super(ShuffleNet, self).__init__()
+        stage_planes = cfg['out_planes']
+        stage_blocks = cfg['num_blocks']
+        groups = cfg['groups']
+
+        # Stem
+        self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(24)
+        # Running channel count, advanced by _make_layer as stages are built.
+        self.in_planes = 24
+        self.layer1 = self._make_layer(stage_planes[0], stage_blocks[0], groups)
+        self.layer2 = self._make_layer(stage_planes[1], stage_blocks[1], groups)
+        self.layer3 = self._make_layer(stage_planes[2], stage_blocks[2], groups)
+        self.linear = nn.Linear(stage_planes[2], 10)
+
+    def _make_layer(self, out_planes, num_blocks, groups):
+        '''Build one stage of `num_blocks` Bottlenecks ending with `out_planes` channels.'''
+        blocks = []
+        for idx in range(num_blocks):
+            if idx == 0:
+                # The first block has stride 2 and concatenates its input, so
+                # its conv branch only produces the remaining channels.
+                block = Bottleneck(self.in_planes, out_planes - self.in_planes, stride=2, groups=groups)
+            else:
+                block = Bottleneck(self.in_planes, out_planes, stride=1, groups=groups)
+            blocks.append(block)
+            self.in_planes = out_planes
+        return nn.Sequential(*blocks)
+
+    def forward(self, x):
+        '''Return the 10 class scores for a batch of images.'''
+        out = self.conv1(x)
+        out = F.relu(self.bn1(out))
+        for stage in (self.layer1, self.layer2, self.layer3):
+            out = stage(out)
+        pooled = F.avg_pool2d(out, 4)
+        flat = pooled.view(pooled.size(0), -1)
+        return self.linear(flat)
+
+
+def ShuffleNetG2():
+    '''Build a ShuffleNet with 2 groups and stage widths 200/400/800.'''
+    return ShuffleNet({
+        'out_planes': [200, 400, 800],
+        'num_blocks': [4, 8, 4],
+        'groups': 2,
+    })
+
+def ShuffleNetG3():
+    '''Build a ShuffleNet with 3 groups and stage widths 240/480/960.'''
+    return ShuffleNet({
+        'out_planes': [240, 480, 960],
+        'num_blocks': [4, 8, 4],
+        'groups': 3,
+    })
+
+
+def test():
+    '''Smoke test: push one random 3x32x32 input through ShuffleNetG2 and print the output.'''
+    model = ShuffleNetG2()
+    sample = torch.randn(1, 3, 32, 32)
+    print(model(sample))
+
+# test()
--- a/llvm/projects/pred_tuner/models/torch/shufflenetv2.py
+++ b/llvm/projects/pred_tuner/models/torch/shufflenetv2.py
+'''ShuffleNetV2 in PyTorch.
+
+See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details.
+'''
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ShuffleBlock(nn.Module):
+    '''Parameter-free channel shuffle across `groups` channel groups (2 by default).'''
+
+    def __init__(self, groups=2):
+        super(ShuffleBlock, self).__init__()
+        self.groups = groups
+
+    def forward(self, x):
+        '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]'''
+        n, c, h, w = x.size()
+        # Expose the group axis, transpose it with the per-group axis, flatten back.
+        by_group = x.view(n, self.groups, c // self.groups, h, w)
+        interleaved = by_group.permute(0, 2, 1, 3, 4)
+        return interleaved.reshape(n, c, h, w)
+
+
+class SplitBlock(nn.Module):
+    '''Split a 4-D tensor in two along the channel axis at a fixed ratio.'''
+
+    def __init__(self, ratio):
+        super(SplitBlock, self).__init__()
+        # Fraction of the channels that goes into the first part.
+        self.ratio = ratio
+
+    def forward(self, x):
+        '''Return (first int(C*ratio) channels, remaining channels).'''
+        cut = int(x.size(1) * self.ratio)
+        everything = slice(None)
+        head = (everything, slice(None, cut), everything, everything)
+        tail = (everything, slice(cut, None), everything, everything)
+        return x[head], x[tail]
+
+
+class BasicBlock(nn.Module):
+    '''Stride-1 ShuffleNetV2 unit.
+
+    The input is split along the channel axis. The first part is passed
+    through untouched; the second part goes through 1x1 conv -> depthwise
+    3x3 conv -> 1x1 conv. Both parts are then concatenated and shuffled.
+
+    Args:
+        in_channels: number of channels of the block input (and output).
+        split_ratio: fraction of the channels routed to the untouched part.
+    '''
+    def __init__(self, in_channels, split_ratio=0.5):
+        super(BasicBlock, self).__init__()
+        self.split = SplitBlock(split_ratio)
+        # The conv branch processes the SECOND part of the split, i.e. what is
+        # left after SplitBlock takes the first int(in_channels * split_ratio)
+        # channels. Sizing the branch by the first part is only correct when
+        # both parts happen to be equal.
+        in_channels = in_channels - int(in_channels * split_ratio)
+        self.conv1 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(in_channels)
+        # groups=in_channels makes this 3x3 convolution depthwise.
+        self.conv2 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False)
+        self.bn2 = nn.BatchNorm2d(in_channels)
+        self.conv3 = nn.Conv2d(in_channels, in_channels,
+                               kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(in_channels)
+        self.shuffle = ShuffleBlock()
+
+    def forward(self, x):
+        '''Return a tensor with the same shape as `x`.'''
+        x1, x2 = self.split(x)
+        out = F.relu(self.bn1(self.conv1(x2)))
+        # No ReLU after the depthwise convolution.
+        out = self.bn2(self.conv2(out))
+        out = F.relu(self.bn3(self.conv3(out)))
+        out = torch.cat([x1, out], 1)
+        out = self.shuffle(out)
+        return out
+
+
+class DownBlock(nn.Module):
+    '''Stride-2 ShuffleNetV2 unit.
+
+    Both branches see the full input and each produces out_channels // 2
+    channels at half the spatial resolution; the results are concatenated
+    and channel-shuffled.
+    '''
+    def __init__(self, in_channels, out_channels):
+        super(DownBlock, self).__init__()
+        half = out_channels // 2
+        pointwise = dict(kernel_size=1, bias=False)
+        downsample = dict(kernel_size=3, stride=2, padding=1, bias=False)
+
+        # Left branch: depthwise stride-2 3x3 conv, then 1x1 conv.
+        self.conv1 = nn.Conv2d(in_channels, in_channels, groups=in_channels, **downsample)
+        self.bn1 = nn.BatchNorm2d(in_channels)
+        self.conv2 = nn.Conv2d(in_channels, half, **pointwise)
+        self.bn2 = nn.BatchNorm2d(half)
+
+        # Right branch: 1x1 conv, depthwise stride-2 3x3 conv, 1x1 conv.
+        self.conv3 = nn.Conv2d(in_channels, half, **pointwise)
+        self.bn3 = nn.BatchNorm2d(half)
+        self.conv4 = nn.Conv2d(half, half, groups=half, **downsample)
+        self.bn4 = nn.BatchNorm2d(half)
+        self.conv5 = nn.Conv2d(half, half, **pointwise)
+        self.bn5 = nn.BatchNorm2d(half)
+
+        self.shuffle = ShuffleBlock()
+
+    def forward(self, x):
+        '''Return the shuffled concatenation of the left and right branch outputs.'''
+        # Left branch (no ReLU after the depthwise convolution).
+        left = self.bn1(self.conv1(x))
+        left = F.relu(self.bn2(self.conv2(left)))
+        # Right branch (no ReLU after the depthwise convolution).
+        right = F.relu(self.bn3(self.conv3(x)))
+        right = self.bn4(self.conv4(right))
+        right = F.relu(self.bn5(self.conv5(right)))
+        # Merge and mix channels between the two branches.
+        merged = torch.cat([left, right], 1)
+        return self.shuffle(merged)
+
+
+class ShuffleNetV2(nn.Module):
+    '''ShuffleNetV2 classifier: 3x3 stem, three stages, 1x1 conv, linear head.
+
+    Args:
+        net_size: key into the module-level `configs` table selecting the
+            per-stage channel counts and block counts.
+        num_classes: number of outputs of the final linear layer (default 10,
+            the value that used to be hard-coded).
+    '''
+    def __init__(self, net_size, num_classes=10):
+        super(ShuffleNetV2, self).__init__()
+        out_channels = configs[net_size]['out_channels']
+        num_blocks = configs[net_size]['num_blocks']
+
+        self.conv1 = nn.Conv2d(3, 24, kernel_size=3,
+                               stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(24)
+        # Running channel count, advanced by _make_layer as stages are built.
+        self.in_channels = 24
+        self.layer1 = self._make_layer(out_channels[0], num_blocks[0])
+        self.layer2 = self._make_layer(out_channels[1], num_blocks[1])
+        self.layer3 = self._make_layer(out_channels[2], num_blocks[2])
+        self.conv2 = nn.Conv2d(out_channels[2], out_channels[3],
+                               kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn2 = nn.BatchNorm2d(out_channels[3])
+        self.linear = nn.Linear(out_channels[3], num_classes)
+
+    def _make_layer(self, out_channels, num_blocks):
+        '''Build one stage: a DownBlock followed by `num_blocks` BasicBlocks.'''
+        layers = [DownBlock(self.in_channels, out_channels)]
+        layers.extend(BasicBlock(out_channels) for _ in range(num_blocks))
+        # The DownBlock alone already changes the channel count, so record it
+        # unconditionally (previously this was skipped when num_blocks == 0).
+        self.in_channels = out_channels
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        '''Return the class scores for a batch of images.'''
+        out = F.relu(self.bn1(self.conv1(x)))
+        # out = F.max_pool2d(out, 3, stride=2, padding=1)
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = self.layer3(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+        out = F.avg_pool2d(out, 4)
+        out = out.view(out.size(0), -1)
+        out = self.linear(out)
+        return out
+
+
+# Per-size settings read by ShuffleNetV2: 'out_channels' holds the widths of
+# the three stages plus the final 1x1 conv, 'num_blocks' the BasicBlock count
+# of each stage.
+configs = {
+    net_size: {'out_channels': widths, 'num_blocks': (3, 7, 3)}
+    for net_size, widths in (
+        (0.5, (48, 96, 192, 1024)),
+        (1, (116, 232, 464, 1024)),
+        (1.5, (176, 352, 704, 1024)),
+        # NOTE(review): 224 is not half of the next stage's 488, unlike every
+        # other row (48/96, 116/232, 176/352) - possibly a typo for 244;
+        # confirm against the paper before changing, since it alters the model.
+        (2, (224, 488, 976, 2048)),
+    )
+}
+
+
+def test():
+    '''Smoke test: run the 0.5x ShuffleNetV2 on three random 3x32x32 inputs and print the output shape.'''
+    model = ShuffleNetV2(net_size=0.5)
+    batch = torch.randn(3, 3, 32, 32)
+    scores = model(batch)
+    print(scores.shape)
+
+
+# test()
--- a/llvm/projects/pred_tuner/models/torch/vgg.py
+++ b/llvm/projects/pred_tuner/models/torch/vgg.py
+"""VGG11/13/16/19 in Pytorch."""
+import torch.nn as nn
+from models.hpvm import HPVMConvBundle
+
+
+# Layer specification per VGG variant, consumed by VGG._make_layers: an int is
+# the output channel count of a conv layer, 'M' inserts a max-pooling layer.
+cfg = dict(
+    VGG11=[64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    VGG13=[64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    VGG16=[64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
+    VGG19=[64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
+)
+
+
+class VGG(nn.Module):
+    """VGG-style network: a convolutional feature stack followed by a 10-way linear classifier.
+
+    `vgg_name` is a key of the module-level `cfg` table.
+    """
+    def __init__(self, vgg_name):
+        super(VGG, self).__init__()
+        self.features = self._make_layers(cfg[vgg_name])
+        self.classifier = nn.Linear(512, 10)
+
+    def forward(self, x):
+        """Return the 10 class scores for a batch of images."""
+        feats = self.features(x)
+        flat = feats.view(feats.size(0), -1)
+        return self.classifier(flat)
+
+    @staticmethod
+    def _make_layers(config):
+        """Translate a layer specification into an nn.Sequential.
+
+        'M' becomes a 2x2 max-pool; any other entry is taken as the output
+        channel count of a 3x3 HPVMConvBundle followed by batch norm and ReLU.
+        A 1x1 average pool is always appended at the end.
+        """
+        stack = []
+        prev_channels = 3
+        for entry in config:
+            if entry != 'M':
+                stack.append(HPVMConvBundle(prev_channels, entry, kernel_size=3, padding=1))
+                stack.append(nn.BatchNorm2d(entry))
+                stack.append(nn.ReLU(inplace=True))
+                prev_channels = entry
+                continue
+            stack.append(nn.MaxPool2d(kernel_size=2, stride=2))
+        stack.append(nn.AvgPool2d(kernel_size=1, stride=1))
+        return nn.Sequential(*stack)
--- a/llvm/projects/pred_tuner/run_tuner.py
+++ b/llvm/projects/pred_tuner/run_tuner.py
+#!/usr/bin/env python
+#
+# Development-time Tuner with Algorithmic Approximations:
+# Approximations: Perforation, Sampling with varying knobs for rate, skip offset
+import copy
+import logging
+import os
+import shutil
+import time
+from pathlib import Path
+from typing import List, Tuple
+
+import numpy as np
+import opentuner
+from opentuner import ConfigurationManipulator, EnumParameter, MeasurementInterface
+from opentuner.measurement.inputmanager import FixedInputManager
+from opentuner.search.objective import ThresholdAccuracyMinimizeTime
+from opentuner.tuningrunmain import TuningRunMain
+from torch.nn import Module
+from tqdm import tqdm
+
+from exp import Benchmark, ConfigMeasurer, ExpState, TuningTime, batch_id, bench_tuner_data, is_dev_time
+from models import get_all_output, networks, QoS
+from toolkit import ConfigT
+from toolkit.estimators import WeightedLinearQoSEstimator
+from utils import Config, config, reapply_last_config
+
+msg_logger = logging.getLogger(__name__)
+use_proxy = False
+n_promise_valid_runs = 30
+confidence_level = 0.95
+
+
+def init_proxy(ni: ConfigMeasurer, pickle_path: Path):
+    """Create a WeightedLinearQoSEstimator that evaluates on the validation loader of `ni`.
+
+    `pickle_path` is handed to the estimator as its `storage` argument.
+    """
+    def run_model(net: Module):
+        # Outputs of `net` over the validation loader.
+        return get_all_output(net, ni.val_loader)
+
+    def acc_crit(inputs_):
+        # QoS of the given outputs, measured against the validation loader.
+        return ni.get_qos(inputs_, ni.val_loader)
+
+    def threshold_eval(inputs_):
+        # True while the mean QoS is less than 3.0 below the validation baseline.
+        mean_acc = np.array([acc_crit(x) for x in inputs_]).mean()
+        return ni.val_qos - mean_acc < 3.0
+
+    estimator = WeightedLinearQoSEstimator(
+        ni.nas, run_model, acc_crit, threshold_eval, confidence_level, storage=pickle_path
+    )
+    return estimator
+
+
+class Timer:
+    """Context manager that reports the wall-clock duration of its body.
+
+    On exit it calls `timer_state.add_timer(timer_name, seconds)`; this
+    happens whether or not the body raised, and exceptions are not suppressed.
+    """
+    def __init__(self, timer_state: TuningTime, timer_name: str):
+        self.timer_state, self.name = timer_state, timer_name
+        # Set when the context is entered.
+        self.start = None
+
+    def __enter__(self):
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        elapsed = time.time() - self.start
+        self.timer_state.add_timer(self.name, elapsed)
+
+
+class TunerDriver:
+    """Run the whole tuning pipeline for one benchmark.
+
+    Construction prepares the result folder, snapshots the code state and
+    derives the QoS thresholds; `tuner_exec` then runs one opentuner session
+    per tuner threshold and post-processes the collected configurations.
+    """
+    def __init__(self, bench: Benchmark):
+        self.bench = bench
+        msg_logger.info(f"Tuning for model {self.bench.model_name}")
+        # Initialize folder.
+        self._init_folder(bench)
+        # Take a snapshot of current code.
+        self.take_code_snapshot()
+        # Initialize network information and qos thresholds
+        self.net_info = ConfigMeasurer.init_from_bench(self.bench)
+        qoses = self.net_info.val_qos, self.net_info.test_qos
+        qos_type = self.net_info.val_qos.__class__
+        self.tuner_thres = qos_type.suggested_tuner_thresholds(self.net_info.val_qos)
+        self.val_thres = qos_type.suggested_val_threshold(self.net_info.val_qos)
+        self.test_thres = qos_type.suggested_test_threshold(self.net_info.test_qos)
+        # Tuner states.
+        self.states = ExpState(bench, qos_type, qoses)
+        # Current # of iteration. `ProxyTuner` will use this.
+        self.run_id, self.iter = 0, 0
+        # Initialize proxy.
+        if use_proxy:
+            self.proxy = init_proxy(self.net_info, self.bench.result_dir / 'proxy.pkl')
+        else:
+            self.proxy = None
+
+    @staticmethod
+    def _init_folder(bench: Benchmark):
+        """Empty the benchmark's result directory (keeping 'proxy.pkl') or create it."""
+        def remove_file_or_folder(path: Path):
+            # Act on the argument itself rather than on the enclosing loop
+            # variable, so the helper does what its signature says.
+            if path.is_dir():
+                shutil.rmtree(path)
+            elif path.is_file():
+                path.unlink()  # Removes file despite the surprising name
+
+        pickle_path = bench.result_dir / 'proxy.pkl'
+        # Remove everything in result folder except pickle file
+        if bench.result_dir.is_dir():
+            msg_logger.warning(f"!Cleaning existing result dir = {bench.result_dir}")
+            for child in bench.result_dir.glob('*'):
+                if child == pickle_path:
+                    continue
+                msg_logger.info(f"  !Removing {child}")
+                remove_file_or_folder(child)
+        # Create result folder if it doesn't exist
+        if not bench.result_dir.is_dir():
+            msg_logger.info(f"Creating output directory = {bench.result_dir}")
+            os.makedirs(bench.result_dir)
+
+    def get_default_args(self):
+        """Return opentuner's default arguments with database path and test limit set."""
+        args = opentuner.default_argparser().parse_args()
+        args.database = f"opentuner.db/{batch_id}.db"
+        args.test_limit = self.bench.autotuner_runs
+        # exist_ok=True already tolerates an existing directory.
+        os.makedirs(Path(args.database).parent, exist_ok=True)
+        return args
+
+    def tuner_exec(self):
+        """Run one tuning session per tuner threshold, then post-process the configs."""
+        # Get default opentuner args
+        args = self.get_default_args()
+        # Start tuning for each threshold
+        for i, thres in enumerate(self.tuner_thres):
+            with Timer(self.states.timers, f"tuning_{i}"):
+                msg_logger.info(
+                    f"Tuning goal: qos >= {thres}; keeping configs with qos >= {self.val_thres}"
+                )
+                tuner = ProxyTuner(args, self, thres, self.val_thres)
+                try:
+                    # TuningRunMain.__init__ initializes its own logger, so we'll reapply our settings.
+                    tuning_main = TuningRunMain(tuner, args)
+                    reapply_last_config()
+                    # Unleash the tuner!
+                    tuning_main.main()
+                finally:
+                    # Remove tuner progress bar, even if tuning failed
+                    tuner.pbar.close()
+                self.run_id += 1
+                self.iter = 0
+        # Postprocess configs
+        self.process_configs()
+
+    def calibrate_write_configs(self, configs: List[Config], is_test_set: bool):
+        """Re-measure each config on the validation or test set and store the updated copies.
+
+        The input configs are not modified; a deep copy of each is updated and
+        appended to `states.tested_configs` or `states.validated_configs`.
+        """
+        write_to = self.states.tested_configs if is_test_set else self.states.validated_configs
+        gold_acc = self.net_info.test_qos if is_test_set else self.net_info.val_qos
+        for cfg in tqdm(configs, leave=False):
+            cfg = copy.deepcopy(cfg)
+            cfg: Config
+            # Key each flag by its position in the config's flag list.
+            flags = dict(enumerate(cfg.flags))
+            measured_acc, confidence = self.net_info.actual_measure(
+                flags, cfg.total_runs, is_test_set, threshold=self.val_thres
+            )
+            prev_acc = cfg.avg_qos
+            cfg.update_acc(measured_acc, confidence, gold_acc)
+            new_acc = cfg.avg_qos
+            msg_logger.debug(f"{prev_acc} (mean) -> {new_acc} (mean)")
+            write_to.append(cfg)
+        write_to.finalize_dump()
+
+    @staticmethod
+    def filter_configs(
+            validation: List[Config], test: List[Config],
+            vali_threshold: QoS, test_threshold: QoS
+    ) -> Tuple[List[Config], List[Config]]:
+        """Keep the configs whose `avg_loss` is within the respective threshold.
+
+        Returns (filtered validation configs, filtered test configs). A test
+        config is only kept if a validation config with the same `fname` was
+        kept too. Test configs are unique per `fname` (the last one wins) and
+        are returned in the order of their first appearance in `test`.
+        """
+        # Filter validation and test set by their respective thresholds
+        filtered_validation = [
+            c for c in validation if c.avg_loss <= vali_threshold
+        ]
+        filtered_test = [
+            c for c in test if c.avg_loss <= test_threshold
+        ]
+        # Test configs also need to be a subset of validation configs.
+        name_to_filtered = {x.fname: x for x in filtered_test}
+        valid_names = {x.fname for x in filtered_validation}
+        # Walk the dict instead of a set intersection so the result order is
+        # deterministic rather than dependent on set iteration order.
+        filtered_test_ = [
+            c for fname, c in name_to_filtered.items() if fname in valid_names
+        ]
+        return filtered_validation, filtered_test_
+
+    def process_configs(self):
+        """Prefilter, validate, test and threshold-filter the collected configs, then plot."""
+        # Finalize all configs because tuning is done.
+        # (this may not do anything now but will in the future)
+        self.states.all_configs.finalize_dump()
+        all_configs = self.states.all_configs.configs
+        # Pre-filter configs by a wide pareto margin
+        filtered_configs = config.is_pareto_efficient(all_configs, ratio=0.05, n_min=50, n_max=50)
+        msg_logger.info(f"Prefilter yields {len(filtered_configs)} configs from {len(all_configs)}")
+        self.states.filtered_configs.finalize_dump(with_configs=filtered_configs)
+        # Calibrate prefiltered configs (validation step)
+        with Timer(self.states.timers, "validate"):
+            self.calibrate_write_configs(filtered_configs, is_test_set=False)
+            validated_configs = self.states.validated_configs.configs
+        # Calibrate prefiltered configs on test set (test step)
+        with Timer(self.states.timers, "test"):
+            self.calibrate_write_configs(filtered_configs, is_test_set=True)
+            tested_configs = self.states.tested_configs.configs
+        # Filter valid and test set configs by thresholds
+        valid_configs, test_configs = self.filter_configs(
+            validated_configs, tested_configs, self.val_thres, self.test_thres
+        )
+        self.states.valid_configs.finalize_dump(valid_configs)
+        self.states.test_configs.finalize_dump(test_configs)
+        # Finalize data input and plot everything.
+        self.states.finalize_plot()
+
+    def take_code_snapshot(self):
+        """Record the current git commit and the uncommitted diff under '<result_dir>/references'."""
+        # Imported here so the `git` package is only needed when a snapshot is taken.
+        import git
+        msg_logger.info("Taking git snapshot")
+        ref_dir = self.bench.result_dir / "references"
+        # Tolerate a pre-existing directory instead of failing on it.
+        os.makedirs(ref_dir, exist_ok=True)
+        # Write current git commit (SHA id)
+        repo = git.Repo(search_parent_directories=True)
+        sha = repo.head.object.hexsha
+        msg_logger.info(f"Current code is at commit {sha}")
+        with (ref_dir / 'git_commit.txt').open('w') as f:
+            f.write(sha)
+        # Also put all outstanding code change in a diff file.
+        # This way changes in all git-tracked files are captured.
+        t = repo.head.commit.tree
+        with (ref_dir / 'diff.txt').open('w') as f:
+            f.write(repo.git.diff(t))
+
+    def make_config_name(self) -> str:
+        """Return '<model name>_<run id>_<iteration>' for the config currently being evaluated."""
+        return f"{self.bench.model_name}_{self.run_id}_{self.iter}"
+
+    def get_accuracy(self, cfg: ConfigT) -> Tuple[QoS, QoS, int]:
+        """Return (mean QoS, confidence QoS, number of validation runs) for `cfg`.
+
+        Without the proxy the config is measured once on the validation set
+        and both QoS values are identical.
+        """
+        # Knob values 1..7 are treated as "promise" flags (going by the names
+        # used here); configs containing any of them get more validation runs.
+        has_promise_flags = set(cfg.values()).intersection(set(range(1, 7 + 1)))
+        config_validation_runs = n_promise_valid_runs if has_promise_flags else 1
+        if use_proxy:
+            mean_acc, confidence_acc = self.net_info.proxy_estimate(cfg, self.proxy)
+            assert has_promise_flags or (mean_acc == confidence_acc)
+        else:
+            mean_acc, _ = self.net_info.actual_measure(cfg, 1, is_test_set=False)
+            confidence_acc = mean_acc
+        return mean_acc, confidence_acc, config_validation_runs
+
+
+class ProxyTuner(MeasurementInterface):
+    """opentuner measurement interface that scores configurations through a TunerDriver.
+
+    `tuner_thres` parameterizes the search objective; configurations whose
+    confidence QoS exceeds `accept_thres` are recorded in the driver's
+    `all_configs`.
+    """
+    def __init__(self, args, driver: TunerDriver, tuner_thres: QoS, accept_thres: QoS):
+        self.tuner_driver = driver
+        self.model_info = driver.net_info
+        self.bench = driver.bench
+        self.tuner_thres = tuner_thres
+        self.all_configs = driver.states.all_configs
+        # One progress-bar tick per evaluated configuration (see `run`).
+        self.pbar = tqdm(total=args.test_limit, leave=False)
+        super(ProxyTuner, self).__init__(
+            args, program_name=self.bench.model_name,
+            objective=ThresholdAccuracyMinimizeTime(tuner_thres.to_scalar()),
+            input_manager=FixedInputManager(size=driver.bench.get_n_layers()),
+        )
+        self.accept_thres = accept_thres
+
+    def manipulator(self) -> ConfigurationManipulator:
+        """Define the search space by creating a ConfigurationManipulator."""
+        space = ConfigurationManipulator()
+        layer_knobs = self.model_info.get_knobs()
+        # One enum parameter per layer, ranging over that layer's knobs.
+        for ext_layer_id, knobs in layer_knobs.items():
+            space.add_parameter(EnumParameter(ext_layer_id, knobs))
+        return space
+
+    def seed_configurations(self):
+        """Provide baseline config as seed if model uses seed."""
+        if not self.bench.use_seed:
+            return []
+        return [self.bench.get_baseline_config(not is_dev_time)]
+
+    def run(self, desired_result, input_, limit):
+        """Run a given configuration then return performance and accuracy."""
+        cfg: ConfigT = desired_result.configuration.data
+        # Estimated mean QoS, confidence QoS, and number of validation runs.
+        mean_acc, confident_acc, n_runs = self.tuner_driver.get_accuracy(cfg)
+        # Cost of the selected configuration and its speedup.
+        total_comps, speedup = self.bench.compute_config_cost(cfg)
+        result = opentuner.resultsdb.models.Result()
+        result.time = total_comps
+        # Convert QoS to scalar, because opentuner does not support custom comparable datatype
+        result.accuracy = confident_acc.to_scalar(relative_to=self.tuner_thres)
+
+        accepted = confident_acc > self.accept_thres
+        if accepted:
+            # Record this config: knob values are ordered by sorted layer key.
+            config_name = self.tuner_driver.make_config_name()
+            ordered_values = [cfg[layer] for layer in sorted(cfg.keys())]
+            self.all_configs.append(Config(
+                mean_acc, self.model_info.val_qos, config_name, ordered_values,
+                n_runs, 95.0, total_comps, speedup
+            ))
+            msg_logger.debug(
+                f"Config chosen with accuracy (mean) = {mean_acc}, (95%) = {confident_acc} "
+                f"and speedup = {speedup}"
+            )
+        self.tuner_driver.iter += 1
+        self.pbar.update()
+        return result
+
+    def save_final_config(self, configuration):
+        """Print final configuration."""
+        msg_logger.info(f"Final configuration {configuration.data}")
+        msg_logger.info("Done with Autotuning run")
+
+
+if __name__ == '__main__':
+    # Every known network must have tuner data available.
+    assert set(networks.keys()) <= set(bench_tuner_data.keys())
+    selected_networks = ('alexnet2_hpvm',)
+    for network in selected_networks:
+        bench_: Benchmark = bench_tuner_data[network]
+        driver = TunerDriver(bench_)
+        driver.tuner_exec()
--- a/llvm/projects/pred_tuner/tests/data/1_1_output.json
+++ b/llvm/projects/pred_tuner/tests/data/1_1_output.json
--- a/llvm/projects/pred_tuner/tests/data/3_3_output.json
+++ b/llvm/projects/pred_tuner/tests/data/3_3_output.json
+{
+  "('0', '0', '1', '1', '2', '0')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "26.000000,26.000000,26.000000,26.000000,",
+    "ConvApprox": "26.000000,26.000000,26.000000,26.000000,",
+    "ConvApproxHalf2": "26.000000,26.000000,26.000000,26.000000,"
+  },
+  "('0', '0', '1', '1', '2', '1')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "56.000000,56.000000,56.000000,56.000000,",
+    "ConvApprox": "56.000000,56.000000,56.000000,56.000000,",
+    "ConvApproxHalf2": "56.000000,56.000000,56.000000,56.000000,"
+  },
+  "('0', '0', '1', '1', '3', '0')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "39.000000,39.000000,39.000000,39.000000,",
+    "ConvApprox": "39.000000,39.000000,39.000000,39.000000,",
+    "ConvApproxHalf2": "39.000000,39.000000,39.000000,39.000000,"
+  },
+  "('0', '0', '1', '1', '3', '1')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "42.000000,42.000000,42.000000,42.000000,",
+    "ConvApprox": "42.000000,42.000000,42.000000,42.000000,",
+    "ConvApproxHalf2": "42.000000,42.000000,42.000000,42.000000,"
+  },
+  "('0', '0', '1', '1', '4', '0')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "36.000000,36.000000,36.000000,36.000000,",
+    "ConvApprox": "36.000000,36.000000,36.000000,36.000000,",
+    "ConvApproxHalf2": "35.968750,35.968750,35.968750,35.968750,"
+  },
+  "('0', '0', '1', '1', '4', '1')": {
+    "tensorConvolution": "41.000000,41.000000,41.000000,41.000000,",
+    "Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "FP16_Baseline": "41.000000,41.000000,41.000000,41.000000,",
+    "ConvSampSim": "45.333336,45.333336,45.333336,45.333336,",
+    "ConvApprox": "45.333336,45.333336,45.333336,45.333336,",
+    "ConvApproxHalf2": "45.312500,45.312500,45.312500,45.312500,"
+  },
+  "('1', '1', '1', '1', '2', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,",
+    "ConvApprox": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,",
+    "ConvApproxHalf2": "12.000000,18.000000,18.000000,12.000000,18.000000,26.000000,26.000000,18.000000,18.000000,26.000000,26.000000,18.000000,12.000000,18.000000,18.000000,12.000000,"
+  },
+  "('1', '1', '1', '1', '2', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,",
+    "ConvApprox": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,",
+    "ConvApproxHalf2": "24.000000,36.000000,36.000000,24.000000,36.000000,56.000000,56.000000,36.000000,36.000000,56.000000,56.000000,36.000000,24.000000,36.000000,36.000000,24.000000,"
+  },
+  "('1', '1', '1', '1', '3', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApprox": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,25.500000,39.000000,39.000000,25.500000,25.500000,39.000000,39.000000,25.500000,18.000000,27.000000,27.000000,18.000000,"
+  },
+  "('1', '1', '1', '1', '3', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApprox": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,27.000000,18.000000,28.500000,42.000000,42.000000,27.000000,28.500000,42.000000,42.000000,27.000000,18.000000,27.000000,27.000000,18.000000,"
+  },
+  "('1', '1', '1', '1', '4', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,",
+    "ConvApprox": "16.000000,22.666666,22.666666,13.333333,25.333334,36.000000,36.000000,22.666668,25.333334,36.000000,36.000000,22.666668,18.666666,25.333334,25.333334,16.000000,",
+    "ConvApproxHalf2": "16.000000,22.671875,22.671875,13.328125,25.328125,35.968750,35.968750,22.656250,25.328125,35.968750,35.968750,22.656250,18.671875,25.328125,25.328125,16.000000,"
+  },
+  "('1', '1', '1', '1', '4', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,18.000000,27.000000,41.000000,41.000000,27.000000,27.000000,41.000000,41.000000,27.000000,18.000000,27.000000,27.000000,18.000000,",
+    "ConvSampSim": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,",
+    "ConvApprox": "18.666668,29.333332,29.333332,20.000000,29.333332,45.333336,45.333336,29.333332,29.333332,45.333336,45.333336,29.333332,20.000000,29.333332,29.333332,18.666668,",
+    "ConvApproxHalf2": "18.656250,29.343750,29.343750,20.000000,29.328125,45.312500,45.312500,29.343750,29.328125,45.312500,45.312500,29.343750,20.000000,29.328125,29.328125,18.656250,"
+  },
+  "('1', '1', '2', '2', '2', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "12.000000,18.000000,18.000000,26.000000,",
+    "ConvApprox": "12.000000,18.000000,18.000000,26.000000,",
+    "ConvApproxHalf2": "12.000000,18.000000,18.000000,26.000000,"
+  },
+  "('1', '1', '2', '2', '2', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "24.000000,36.000000,36.000000,56.000000,",
+    "ConvApprox": "24.000000,36.000000,36.000000,56.000000,",
+    "ConvApproxHalf2": "24.000000,36.000000,36.000000,56.000000,"
+  },
+  "('1', '1', '2', '2', '3', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "18.000000,27.000000,25.500000,39.000000,",
+    "ConvApprox": "18.000000,27.000000,25.500000,39.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,25.500000,39.000000,"
+  },
+  "('1', '1', '2', '2', '3', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "18.000000,27.000000,28.500000,42.000000,",
+    "ConvApprox": "18.000000,27.000000,28.500000,42.000000,",
+    "ConvApproxHalf2": "18.000000,27.000000,28.500000,42.000000,"
+  },
+  "('1', '1', '2', '2', '4', '0')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "16.000000,22.666666,25.333334,36.000000,",
+    "ConvApprox": "16.000000,22.666666,25.333334,36.000000,",
+    "ConvApproxHalf2": "16.000000,22.671875,25.328125,35.968750,"
+  },
+  "('1', '1', '2', '2', '4', '1')": {
+    "tensorConvolution": "18.000000,27.000000,27.000000,41.000000,",
+    "Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "FP16_Baseline": "18.000000,27.000000,27.000000,41.000000,",
+    "ConvSampSim": "18.666668,29.333332,29.333332,45.333336,",
+    "ConvApprox": "18.666668,29.333332,29.333332,45.333336,",
+    "ConvApproxHalf2": "18.656250,29.343750,29.328125,45.312500,"
+  }
+}
\ No newline at end of file
--- a/llvm/projects/pred_tuner/tests/data/promise.json
+++ b/llvm/projects/pred_tuner/tests/data/promise.json
--- a/llvm/projects/pred_tuner/tests/data/quantization.json
+++ b/llvm/projects/pred_tuner/tests/data/quantization.json
--- a/llvm/projects/pred_tuner/tests/promise.py
+++ b/llvm/projects/pred_tuner/tests/promise.py
--- a/llvm/projects/pred_tuner/tests/resnet50.py
+++ b/llvm/projects/pred_tuner/tests/resnet50.py
+from toolkit import ModuleIndexer, NetApproxSelector
+from utils import compute_accuracy, init_by_name, run_concat_output
+
+
+def float_eq(f1, f2):
+    """Return True when f1 and f2 differ by strictly less than 1e-5."""
+    tolerance = 1e-5
+    gap = abs(f1 - f2)
+    return gap < tolerance
+
+
+def main():
+    """Check the accuracy of the 'resnet50_imagenet_hpvm' benchmark.
+
+    Measures the unmodified network first, then three approximation
+    configs applied through NetApproxSelector, asserting that each
+    measured accuracy matches its recorded value (see float_eq).
+    Prints a confirmation line once every check has passed.
+    """
+    model, loader, _, _ = init_by_name('resnet50_imagenet_hpvm')
+    indexed = ModuleIndexer(model)
+    selector = NetApproxSelector(indexed)
+
+    def accuracy_of(module):
+        # Run the module over the loader and score its concatenated output.
+        return compute_accuracy(run_concat_output(module, loader), loader)
+
+    # Unmodified network: expected accuracy 77.3%.
+    assert float_eq(accuracy_of(indexed.module), 0.773)
+
+    # (config, expected accuracy) pairs, checked in this order.
+    # NOTE(review): the earlier comments labelled these entries with indices
+    # 13, 17 and 9 while the configs use 82, 108 and 55 -- presumably two
+    # different layer numberings; confirm against NetApproxSelector.
+    expectations = (
+        ({82: 242}, 0.755),
+        ({82: 242, 108: 247}, 0.746),
+        ({55: 237, 82: 242, 108: 247}, 0.741),
+    )
+    for config, expected in expectations:
+        approximated = selector.apply_approx_by_config(config)
+        assert float_eq(accuracy_of(approximated.module), expected)
+    print("Accuracy test passed.")
+
+
+# Run the accuracy checks only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/llvm/projects/pred_tuner/tests/sampling.py
+++ b/llvm/projects/pred_tuner/tests/sampling.py
--- a/llvm/projects/pred_tuner/toolkit/__init__.py
+++ b/llvm/projects/pred_tuner/toolkit/__init__.py
--- a/llvm/projects/pred_tuner/toolkit/approxdnn.py
+++ b/llvm/projects/pred_tuner/toolkit/approxdnn.py
--- a/llvm/projects/pred_tuner/toolkit/estimators.py
+++ b/llvm/projects/pred_tuner/toolkit/estimators.py
--- a/llvm/projects/pred_tuner/toolkit/indexing.py
+++ b/llvm/projects/pred_tuner/toolkit/indexing.py
--- a/llvm/projects/pred_tuner/toolkit/transform.py
+++ b/llvm/projects/pred_tuner/toolkit/transform.py
--- a/llvm/projects/pred_tuner/utils/__init__.py
+++ b/llvm/projects/pred_tuner/utils/__init__.py
+from .config import Config
+from .logging import config_pylogger, reapply_last_config
+from .utils import device, get_knob_config_file, get_tensorrt_dir, gpu_mem_mb
--- a/llvm/projects/pred_tuner/utils/benchmarks.json
+++ b/llvm/projects/pred_tuner/utils/benchmarks.json
--- a/llvm/projects/pred_tuner/utils/config.py
+++ b/llvm/projects/pred_tuner/utils/config.py
--- a/llvm/projects/pred_tuner/utils/logging.py
+++ b/llvm/projects/pred_tuner/utils/logging.py