diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..2da9be9ac4ecfec68edac79f118cfe0be5e99862
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "examples/stand_count/pytorch-ssd"]
+	path = examples/stand_count/pytorch-ssd
+	url = git@github.com:qfgaohao/pytorch-ssd.git
diff --git a/README.md b/README.md
index 41c48098915389597067dccba0c3ddb02c608155..2ec57bbf2e55fe7e61edf4da212cbb944cb62c42 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,42 @@
 # Autotuning and Predictive Autotuning
 
-Performs autotuning on program approximation knobs using an error-predictive proxy in place of the
-original program, to greatly speedup autotuning while getting results comparable in quality.
+`predtuner` performs autotuning on program approximation knobs using an error-predictive proxy
+in place of the original program, to greatly speed up autotuning while obtaining results
+of comparable quality.
 
 Work in progress.
 
 ## Requirements
 
-Prerequisite packages are listed in `./env.yaml`. Conda is the validated and recommended way to set
-up a working environment. If you're using conda, do
+Installing this package requires `pip`. At the root directory of this repo, run:
 
 ```bash
-conda env create -n predtuner -f env.yaml
-conda activate predtuner
+pip install -e .
 ```
+
+The `-e` flag installs the package in editable mode; omit it if you don't intend to modify the code in this package.
+
+## Getting Started
+
+The documentation contains a full tutorial.
+Build it by running:
+
+```bash
+pip install sphinx sphinx_rtd_theme sphinx_autodoc_typehints
+cd doc
+make html
+```
+
+The documentation will be created at `doc/build/html/index.html`.
+Open this in a browser and navigate to the "Getting Started" section.
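+
+For example, on Linux (macOS users can use `open` instead of `xdg-open`):
+
+```bash
+xdg-open doc/build/html/index.html
+```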
+
+### Model Data for Example / Testing
+
+`predtuner` contains 10 demo models which are also used in tests.
+
+- Download and extract [this](https://drive.google.com/file/d/1V_yd9sKcZQ7zhnO5YhRpOsaBPLEEvM9u/view?usp=sharing) file containing all 10 models for testing purposes.
+- The "Getting Started" example on the documentation page only uses VGG16-CIFAR10.
+  If you don't need the other models, get the data for VGG16-CIFAR10
+  [here](https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing).
+
+In either case, there should be a `model_params/` folder at the root of the repo after extraction.
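+
+For example, assuming the downloaded archive is a tarball named `model_params.tar.gz` (the actual filename may differ):
+
+```bash
+tar xf model_params.tar.gz   # extract at the repo root
+ls model_params/             # verify the folder exists
+```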
diff --git a/doc/getting_started.rst b/doc/getting_started.rst
index 75381aa720fdee717d20db115ecbc74e73deff4c..5d8472a5a224b6677df4f080bf6309bff6bae016 100644
--- a/doc/getting_started.rst
+++ b/doc/getting_started.rst
@@ -1,2 +1,164 @@
 Getting Started
 ===================
+
+This guide can help you start working with PredTuner.
+
+Installation
+------------
+
+Install PredTuner from source using `pip`:
+
+.. code-block:: shell
+
+   pip install -e .
+
+PredTuner will also be available on PyPI in the future after we publish the first release.
+
+Tuning a PyTorch DNN
+--------------------
+
+PredTuner can tune any user-defined application,
+but it is optimized for tuning DNN applications defined in PyTorch.
+
+We will use models predefined in PredTuner for demonstration purposes.
+Download the pretrained VGG16 model parameters and the CIFAR10 dataset from `here
+<https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing>`_.
+After extraction, there should be a `model_params/` folder in the current directory.
+
+Load the tuning and test subsets of the CIFAR10 dataset, and create a pretrained VGG16 model:
+
+.. code-block:: python
+
+  from pathlib import Path
+
+  import torch
+  from torch.utils.data import DataLoader
+
+  import predtuner as pt
+  from predtuner.model_zoo import CIFAR, VGG16Cifar10
+
+  prefix = Path("model_params/vgg16_cifar10")
+  tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
+  tune_loader = DataLoader(tune_set, batch_size=500)
+  test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
+  test_loader = DataLoader(test_set, batch_size=500)
+
+  module = VGG16Cifar10()
+  module.load_state_dict(torch.load("model_params/vgg16_cifar10.pth.tar"))
+
+PredTuner provides a logging mechanism.
+While not required, it's recommended that you direct the logger output to a file:
+
+.. code-block:: python
+
+  msg_logger = pt.config_pylogger(output_dir="vgg16_cifar10/", verbose=True)
+
+For each tuning task, both a tuning dataset and a test dataset are required.
+The tuning dataset is used to evaluate the accuracy of the application during the autotuning stage,
+while the test dataset is used to evaluate the configurations found by autotuning.
+This is similar to the split between training and validation sets in machine learning tasks.
+In this case, both the tuning and test datasets contain 5000 images.
+
+Create an instance of `TorchApp` for tuning a PyTorch DNN:
+
+.. code-block:: python
+
+  app = pt.TorchApp(
+    "TestTorchApp",  # Application name -- can be anything
+    module,
+    tune_loader,
+    test_loader,
+    knobs=pt.get_knobs_from_file(),
+    tensor_to_qos=pt.accuracy,
+    model_storage_folder="vgg16_cifar10/",
+  )
+
+PredTuner provides `TorchApp`, which is specialized for tuning PyTorch DNNs.
+In addition, this example uses two more functions from PredTuner:
+
+`pt.accuracy` is the *classification accuracy* metric,
+which receives the probability distribution output from the VGG16 model,
+compares it to the groundtruth in the dataset,
+and returns a scalar between 0 and 100 for the classification accuracy.
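+
+As a rough illustration of what this metric computes (a minimal sketch, not PredTuner's
+actual implementation), assuming `output` is a batch of class probabilities and `target`
+is a batch of integer labels:
+
+.. code-block:: python
+
+  import torch
+
+  def accuracy_sketch(output: torch.Tensor, target: torch.Tensor) -> float:
+      # Pick the most likely class per sample and compare to the groundtruth labels
+      correct = output.argmax(dim=1) == target
+      return correct.float().mean().item() * 100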
+
+`pt.get_knobs_from_file()` returns a set of approximations preloaded in PredTuner,
+which are applied to `torch.nn.Conv2d` layers.
+See ??? for a list of these approximations and how to define custom approximations.
+
+Now we can obtain a tuner object from the application and start tuning.
+We will keep configurations that lose at most 3% accuracy,
+while encouraging the tuner to find configurations with accuracy loss below 2.1%.
+
+.. code-block:: python
+
+  tuner = app.get_tuner()
+  tuner.tune(
+    max_iter=500,
+    qos_tuner_threshold=2.1,  # QoS threshold that the tuner aims for
+    qos_keep_threshold=3.0,  # QoS threshold above which configurations are kept
+    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+    take_best_n=50,
+    cost_model="cost_linear",  # Use linear cost predictor
+  )
+
+**QoS** (quality of service) is a general term for the quality of an application after approximations are applied;
+here, it refers to the accuracy of the DNN on the given datasets.
+We will use the term QoS throughout the tutorials.
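+
+Because the thresholds above are relative, they are interpreted against the baseline QoS.
+The baseline can be measured directly by passing an empty configuration (no approximations),
+as the bundled example script does:
+
+.. code-block:: python
+
+  baseline, _ = app.measure_qos_cost({}, False)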
+
+`max_iter` defines the number of iterations to use in autotuning.
+Within 500 iterations, PredTuner should find about 200 valid configurations.
+PredTuner will also automatically mark out `Pareto-optimal
+<https://en.wikipedia.org/wiki/Pareto_efficiency>`_
+configurations.
+These are called "best" configurations (`tuner.best_configs`),
+in contrast to "valid" configurations (`tuner.kept_configs`),
+which are the configurations that satisfy our accuracy requirement.
+`take_best_n` allows taking some extra near-optimal configurations in addition to the Pareto-optimal ones.
+
+500 iterations is for demonstration purposes; in practice,
+at least 10000 iterations are necessary for VGG16-sized models to converge to a set of good configurations.
+Depending on hardware performance, this tuning should take several minutes to a few tens of minutes.
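+
+Once tuning finishes, the resulting configurations can also be inspected programmatically
+(a minimal sketch using the attributes introduced above):
+
+.. code-block:: python
+
+  print(f"{len(tuner.kept_configs)} valid, {len(tuner.best_configs)} best configurations")
+  for cfg in tuner.best_configs:
+      # Each config carries its measured QoS and estimated speedup
+      print(f"QoS = {cfg.qos:.2f}, speedup = {cfg.speedup:.2f}x")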
+
+Saving Tuning Results
+---------------------
+
+Now that the `tuner` object holds the tuning results,
+we can export them into a JSON file,
+and visualize all configurations in a figure:
+
+.. code-block:: python
+
+  tuner.dump_configs("vgg16_cifar10/configs.json", best_only=False)
+  fig = tuner.plot_configs(show_qos_loss=True)
+  fig.savefig("vgg16_cifar10/configs.png")
+
+The generated figure should look like this:
+
+.. image:: tuning_result.png
+
+where the blue points show the QoS and speedup of all valid configurations,
+and the "best" configurations are marked in orange.
+
+Autotuning with a QoS Model
+-------------------------------------
+
+The tuning session shown above is already slow,
+and it will be much slower with larger models, more iterations, and multiple tuning thresholds.
+Instead, we can use a *QoS prediction model*, which predicts the QoS
+with some inaccuracy, but runs much faster than the application itself.
+To do that, simply use the argument `qos_model` when calling `tuner.tune()`:
+
+.. code-block:: python
+
+  tuner = app.get_tuner()
+  tuner.tune(
+    max_iter=500,
+    qos_tuner_threshold=2.1,  # QoS threshold that the tuner aims for
+    qos_keep_threshold=3.0,  # QoS threshold above which configurations are kept
+    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+    take_best_n=50,
+    cost_model="cost_linear",  # Use linear cost predictor
+    qos_model="qos_p1"
+  )
+
+The QoS model first undergoes an initialization stage (which takes some time),
+during which it learns the behavior of each knob on each operator (DNN layer).
+Because configurations end up with predicted QoS values after tuning,
+a *validation* stage is added at the end of tuning, where the QoS of the best configurations
+is empirically measured and those below the keep threshold are removed.
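+
+After validation, each remaining best configuration carries both its predicted QoS (`qos`)
+and its empirically measured QoS (stored as `validated_qos`),
+so the two can be compared directly (a minimal sketch):
+
+.. code-block:: python
+
+  for cfg in tuner.best_configs:
+      # validated_qos is filled in during the validation stage
+      print(f"predicted = {cfg.qos:.2f}, validated = {cfg.validated_qos:.2f}")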
diff --git a/doc/index.rst b/doc/index.rst
index 6866a98b3d779063329219cd2c49f8b2c6feb41d..b79d28a42bd60549b7e2921b29854948e0aece48 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -9,27 +9,21 @@ PredTuner performs autotuning on approximation choices for a program
 using an error-predictive proxy instead of executing the program,
 to greatly speedup autotuning while getting results of comparable quality.
 
-PredTuner is a contribution of [ApproxTuner]
-(https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations).
-
-Short-term Goals
-- Measure accuracy impact of approximations
-- Obtain a tuned, approximated CNN in <5 lines of code
-- Easy to manage multiple approximation configs
-- Easy to load and manage prior tuning results
-- Flexible retraining support
-Possible Long-term Goals
-- High-performance implementations of approximate layers
-- Allow users to register their own approximations
-- Support for other frameworks: TF, ONNX, JAX
+PredTuner is a core component of `ApproxTuner
+<https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations>`_.
 
-Documentation
--------------
 
-.. only:: html
+Solution for Efficient Approximation Autotuning
+-----------------------------------------------
+
+- Start a tuning session in 10 lines of code
+- Deep integration with PyTorch for DNN support
+- Multiple levels of APIs for generality and ease-of-use
+- Effective accuracy prediction models
+- Easily store and visualize tuning results in many formats
 
-    :Release: |version|
-    :Date: |today|
+Documentation
+-------------
 
 .. toctree::
    :maxdepth: 1
@@ -41,6 +35,3 @@ Indices and tables
 ------------------
 
 * :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-* :ref:`glossary`
diff --git a/doc/tuning_result.png b/doc/tuning_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..6210102982aefa78a796a7a8593fe766e802dbf9
Binary files /dev/null and b/doc/tuning_result.png differ
diff --git a/examples/stand_count/metric.py b/examples/stand_count/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..101b0792dc29de78b66afe877b56ed91212b9362
--- /dev/null
+++ b/examples/stand_count/metric.py
@@ -0,0 +1,38 @@
+import numpy as np
+import torch
+from mean_average_precision import MeanAveragePrecision2d
+
+
+def calculate_mAP(gt_boxes, pred_boxes, pred_scores, style="pascal"):
+    if style == "pascal":
+        settings = {
+            "iou_thresholds": 0.5,
+            "recall_thresholds": np.arange(0.0, 1.1, 0.1),
+        }
+    elif style == "coco":
+        settings = {
+            "iou_thresholds": np.arange(0.5, 1.0, 0.05),
+            "recall_thresholds": np.arange(0.0, 1.01, 0.01),
+            "mpolicy": "soft",
+        }
+    else:
+        raise ValueError(f"Unrecognized style {style}")
+
+    map_calc = MeanAveragePrecision2d(num_classes=1)
+    torch2np = lambda x: x.detach().cpu().numpy()
+    for gt_boxes_, pred_boxes_, pred_scores_ in zip(gt_boxes, pred_boxes, pred_scores):
+        # Both groundtruth and predicted boxes need to be ndarrays.
+        # Format for groundtruth: [xmin, ymin, xmax, ymax, class_id, difficult, crowd]
+        # Format for predicted: [xmin, ymin, xmax, ymax, class_id, confidence]
+        n_gt, n_pred = len(gt_boxes_), len(pred_boxes_)
+        gt_np = np.zeros((n_gt, 7))
+        gt_np[:, :4] = torch2np(gt_boxes_)
+        gt_np[:, 4] = 0  # Class ID
+        pred_np = np.zeros((n_pred, 6))
+        pred_np[:, :4] = torch2np(pred_boxes_)
+        pred_np[:, 4] = 0  # Class ID
+        pred_np[:, 5] = torch2np(pred_scores_)
+        map_calc.add(pred_np, gt_np)
+    # Calculate mAP
+    mAP = map_calc.value(**settings)["mAP"]
+    return torch.tensor(mAP)
diff --git a/examples/stand_count/pytorch-ssd b/examples/stand_count/pytorch-ssd
new file mode 160000
index 0000000000000000000000000000000000000000..f61ab424d09bf3d4bb3925693579ac0a92541b0d
--- /dev/null
+++ b/examples/stand_count/pytorch-ssd
@@ -0,0 +1 @@
+Subproject commit f61ab424d09bf3d4bb3925693579ac0a92541b0d
diff --git a/examples/stand_count/ssd.py b/examples/stand_count/ssd.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfca965ca1846fd7843ca83ad3fa749a75d0f1ab
--- /dev/null
+++ b/examples/stand_count/ssd.py
@@ -0,0 +1,211 @@
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch.nn.modules.module import Module
+from torch.utils.data.dataloader import default_collate
+
+from .vision.nn.multibox_loss import MultiboxLoss
+from .vision.ssd.ssd import SSD, MatchPrior
+from .vision.utils import box_utils
+
+# common across ssd configs:
+center_variance = 0.1
+size_variance = 0.2
+
+
+class SSDWrapper(Module):
+    def __init__(
+        self,
+        net: SSD,
+        size: int,
+        image_mean: np.ndarray,
+        image_std: float,
+        priors: torch.Tensor,
+    ) -> None:
+        Module.__init__(self)
+        self.net = net.eval()
+        self.input_shape = 1, 3, size, size
+        mean = torch.as_tensor(image_mean).unsqueeze(-1).unsqueeze(-1)
+        self.register_buffer("mean", mean)
+        self.transform = lambda img: (F.interpolate(img, size) - self.mean) / image_std
+        self.register_buffer("priors", priors)
+        self.pprocessor = BoxPostprocessor()
+        self._scripting = False
+
+    def step0(self, image: torch.Tensor):
+        scores, locations = self.net.forward(self.transform(image))
+        return scores, locations
+
+    def step1(self, scores: torch.Tensor, locations: torch.Tensor):
+        confidences = F.softmax(scores, dim=2)
+        locations = box_utils.convert_locations_to_boxes(
+            locations,
+            self.priors,
+            center_variance,
+            size_variance,
+        )
+        boxes = box_utils.center_form_to_corner_form(locations)
+        return confidences, boxes
+
+    def step2(self, confidences: torch.Tensor, boxes: torch.Tensor):
+        postprocessed = [
+            self.pprocessor(image_scores, image_boxes)
+            for image_scores, image_boxes in zip(confidences, boxes)
+        ]
+        boxes, labels, confidences = zip(*postprocessed)
+        return boxes, labels, confidences
+
+    def forward(self, image: torch.Tensor):
+        scores, locations = self.step0(image)
+        if self.training:
+            return scores, locations
+        confidences, boxes = self.step1(scores, locations)
+        if self._scripting:
+            return confidences, boxes
+        boxes, labels, confidences = self.step2(confidences, boxes)
+        return boxes, labels, confidences, (scores, locations)
+
+
+def get_mobilenetv2_lite(prefix: str) -> SSDWrapper:
+    from .vision.ssd.config import mobilenetv1_ssd_config as config
+    from .vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite
+
+    ssd_model = create_mobilenetv2_ssd_lite(
+        2, onnx_compatible=True, is_test=False
+    ).eval()
+    ssd_model.init_from_pretrained_ssd(Path(prefix) / "mb2-ssd-lite-mp-0_686.pth")
+    wrapper = SSDWrapper(
+        ssd_model, config.image_size, config.image_mean, config.image_std, config.priors
+    )
+    return wrapper
+
+
+def get_vgg16(prefix: str) -> SSDWrapper:
+    from .vision.ssd.config import vgg_ssd_config as config
+    from .vision.ssd.vgg_ssd import create_vgg_ssd
+
+    ssd_model = create_vgg_ssd(2, is_test=False).eval()
+    ssd_model.init_from_pretrained_ssd(Path(prefix) / "vgg16-ssd-mp-0_7726.pth")
+    wrapper = SSDWrapper(
+        ssd_model, config.image_size, config.image_mean, config.image_std, config.priors
+    )
+    return wrapper
+
+
+class LossAndMAP:
+    def __init__(self, priors: torch.Tensor, device: Union[str, torch.device]) -> None:
+        priors = priors.cpu()
+        self.loss_fn = MultiboxLoss(
+            priors,
+            iou_threshold=0.5,
+            neg_pos_ratio=3,
+            center_variance=0.1,
+            size_variance=0.2,
+            device=device,
+        )
+        self.dataset_target_tr = MatchPrior(priors, center_variance, size_variance, 0.5)
+        self.device = device
+
+    def __call__(self, model_output, targets):
+        if len(model_output) == 2:
+            # Training outputs.
+            scores, locations = model_output
+            boxes, confidences = [], []
+        else:
+            # Evaluation outputs.
+            boxes, _, confidences, (scores, locations) = model_output
+        matched_labels, matched_boxes, gt_boxes = self._process_targets(targets)
+        reg_loss, cls_loss = self.loss_fn(
+            scores, locations, matched_labels, matched_boxes
+        )
+        return reg_loss, cls_loss, gt_boxes, boxes, confidences
+
+    def avg_reg_loss(self, loss_and_map):
+        reg_losses = list(zip(*loss_and_map))[0]
+        return torch.stack(reg_losses, 0).mean()
+
+    def avg_cls_loss(self, loss_and_map):
+        cls_losses = list(zip(*loss_and_map))[1]
+        return torch.stack(cls_losses, 0).mean()
+
+    def all_losses(self, loss_and_map):
+        return self.avg_reg_loss(loss_and_map) + self.avg_cls_loss(loss_and_map)
+
+    def combine_map(self, loss_and_map):
+        from .metric import calculate_mAP
+
+        flatten = lambda xss: [x for xs in xss for x in xs]
+        _, _, gt_boxes, boxes, confidences = zip(*loss_and_map)
+        gt_boxes = flatten(gt_boxes)
+        boxes, confidences = flatten(boxes), flatten(confidences)
+        return -calculate_mAP(gt_boxes, boxes, confidences)
+
+    def _process_targets(self, targets):
+        gt_boxes, matched_boxes, matched_labels = [], [], []
+        for t in targets:
+            this_boxes, this_labels = t["boxes"], t["labels"]
+            gt_boxes.append(this_boxes)
+            this_boxes_, this_labels_ = self.dataset_target_tr(
+                this_boxes.cpu(), this_labels.cpu()
+            )
+            matched_boxes.append(this_boxes_)
+            matched_labels.append(this_labels_)
+        matched_boxes = default_collate(matched_boxes).to(self.device)
+        matched_labels = default_collate(matched_labels).to(self.device)
+        return matched_labels, matched_boxes, gt_boxes
+
+
+class BoxPostprocessor(Module):
+    def __init__(
+        self,
+        nms_method=None,
+        iou_threshold=0.45,
+        filter_threshold=0.50,
+        candidate_size=200,
+        sigma=0.5,
+    ):
+        super().__init__()
+        self.iou_threshold = iou_threshold
+        self.filter_threshold = filter_threshold
+        self.candidate_size = candidate_size
+        self.nms_method = nms_method
+        self.sigma = sigma
+
+    def forward(self, scores, boxes, top_k=-1, prob_threshold=None):
+        cpu_device = torch.device("cpu")
+        prob_threshold = prob_threshold or self.filter_threshold
+        # this version of nms is slower on GPU, so we move data to CPU.
+        boxes = boxes.to(cpu_device)
+        scores = scores.to(cpu_device)
+        picked_box_probs = []
+        picked_labels = []
+        for class_index in range(1, scores.size(1)):
+            probs = scores[:, class_index]
+            mask = probs > prob_threshold
+            probs = probs[mask]
+            if probs.size(0) == 0:
+                continue
+            subset_boxes = boxes[mask, :]
+            box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1)
+            box_probs = box_utils.nms(
+                box_probs,
+                self.nms_method,
+                score_threshold=prob_threshold,
+                iou_threshold=self.iou_threshold,
+                sigma=self.sigma,
+                top_k=top_k,
+                candidate_size=self.candidate_size,
+            )
+            picked_box_probs.append(box_probs)
+            picked_labels.extend([class_index] * box_probs.size(0))
+        if not picked_box_probs:
+            return torch.zeros(0, 4), torch.tensor([]), torch.tensor([])
+        picked_box_probs = torch.cat(picked_box_probs)
+        return (
+            picked_box_probs[:, :4],
+            torch.tensor(picked_labels),
+            picked_box_probs[:, 4],
+        )  # boxes, labels, scores
diff --git a/examples/stand_count/vision b/examples/stand_count/vision
new file mode 120000
index 0000000000000000000000000000000000000000..c69be8baf28e36e160cc6935b5cb768b7754b6a7
--- /dev/null
+++ b/examples/stand_count/vision
@@ -0,0 +1 @@
+pytorch-ssd/vision
\ No newline at end of file
diff --git a/examples/tune_vgg16_cifar10.py b/examples/tune_vgg16_cifar10.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8e5a8a019f988fe578c7983d0cd0c60e6ee108
--- /dev/null
+++ b/examples/tune_vgg16_cifar10.py
@@ -0,0 +1,55 @@
+import site
+from pathlib import Path
+
+import torch
+from torch.utils.data.dataloader import DataLoader
+from torch.utils.data.dataset import Subset
+
+site.addsitedir(Path(__file__).parent.parent.absolute().as_posix())
+from predtuner import TorchApp, accuracy, config_pylogger, get_knobs_from_file
+from predtuner.model_zoo import CIFAR, VGG16Cifar10
+
+# Set up logger to put log file in /tmp
+msg_logger = config_pylogger(output_dir="/tmp", verbose=True)
+
+# Load "tuning" dataset and "test" dataset,
+# and use only the first 500 images from each dataset as an example
+# TODO: you should use all (5000) images for actual tuning.
+prefix = Path("model_params/vgg16_cifar10")
+tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
+tune_loader = DataLoader(Subset(tune_set, range(500)), batch_size=500)
+test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
+test_loader = DataLoader(Subset(test_set, range(500)), batch_size=500)
+
+# Load checkpoint for VGG16 (CIFAR10)
+module = VGG16Cifar10()
+module.load_state_dict(torch.load("model_params/vgg16_cifar10.pth.tar"))
+app = TorchApp(
+    "TestTorchApp",  # name -- can be anything
+    module,
+    tune_loader,
+    test_loader,
+    get_knobs_from_file(),  # default knobs -- see "predtuner/approxes/default_approx_params.json"
+    accuracy,  # the QoS metric to use -- classification accuracy
+    # Where to serialize prediction models if they are used
+    # For example, if you use p1 (see below), this will leave you a
+    # tuner_results/vgg16_cifar10/p1.pkl
+    # which can be quickly reloaded the next time you tune this model.
+    model_storage_folder="tuner_results/vgg16_cifar10",
+)
+# This is how to measure baseline accuracy -- {} means no approximation
+baseline, _ = app.measure_qos_cost({}, False)
+# Get a tuner object and start tuning!
+tuner = app.get_tuner()
+tuner.tune(
+    max_iter=500,  # TODO: In practice, use at least 5000, or 10000
+    qos_tuner_threshold=2.1,  # QoS threshold that the tuner aims for
+    qos_keep_threshold=3.0,  # QoS threshold above which configurations are kept
+    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+    cost_model="cost_linear",  # Use linear performance predictor
+    qos_model="qos_p1",  # Use P1 QoS predictor
+)
+# Save configs here when you're done
+tuner.dump_configs("tuner_results/vgg16_cifar10_configs.json")
+fig = tuner.plot_configs(show_qos_loss=True)
+fig.savefig("tuner_results/vgg16_cifar10_configs.png")
\ No newline at end of file
diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py
index b420d8deaf35ad3ac25f8d3a6ef507997c9d3116..06717bd15011564f33a555931e30e3bcb2b34f7a 100644
--- a/predtuner/approxapp.py
+++ b/predtuner/approxapp.py
@@ -116,10 +116,6 @@ class Config:
     @property
     def speedup(self):
         return 1 / self.cost
-        
-    @property
-    def qos_speedup(self):
-        return self.qos, self.speedup
 
 
 T = TypeVar("T", bound=Config)
@@ -151,7 +147,7 @@ class ApproxTuner(Generic[T]):
         is_threshold_relative: bool = False,
         take_best_n: Optional[int] = None,
         test_configs: bool = True,
-        **kwargs
+        app_kwargs: Optional[dict] = None
         # TODO: more parameters + opentuner param forwarding
     ) -> List[T]:
         from opentuner.tuningrunmain import TuningRunMain
@@ -173,7 +169,7 @@ class ApproxTuner(Generic[T]):
             qos_tuner_threshold,
             qos_keep_threshold,
             is_threshold_relative,
-            self._get_app_kwargs(**kwargs),
+            app_kwargs or {},
         )
         assert self.keep_threshold is not None
         trm = TuningRunMain(tuner, opentuner_args)
@@ -206,27 +202,41 @@ class ApproxTuner(Generic[T]):
             len(self.best_configs),
         )
         if test_configs:
-            msg_logger.info("Checking configurations on test inputs")
-            self.test_configs_(self.best_configs)
+            msg_logger.info("Calibrating configurations on test inputs")
+            self.best_configs = self.test_configs(self.best_configs)
         return self.best_configs
 
-    def test_configs_(self, configs: List[T]):
+    def test_configs(self, configs: List[Config]):
+        from copy import deepcopy
+
         from tqdm import tqdm
 
+        assert self.keep_threshold is not None
+        if not configs:
+            return []
+        ret_configs = []
+        total_error = 0
         for cfg in tqdm(configs, leave=False):
-            cfg: T
-            if cfg.test_qos is not None:
-                continue
+            cfg = deepcopy(cfg)
+            assert cfg.test_qos is None
             cfg.test_qos, _ = self.app.measure_qos_cost(cfg.knobs, True)
             msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {cfg.test_qos} (mean)")
+            total_error += abs(cfg.qos - cfg.test_qos)
+            if cfg.test_qos > self.keep_threshold:
+                ret_configs.append(cfg)
+            else:
+                msg_logger.debug("Config removed")
+        mean_err = total_error / len(configs)
+        msg_logger.info("QoS mean abs difference of calibration: %f", mean_err)
+        return ret_configs
 
     @staticmethod
     def take_best_configs(configs: List[T], n: Optional[int] = None) -> List[T]:
-        points = np.array([c.qos_speedup for c in configs])
+        points = np.array([(c.qos, c.speedup) for c in configs])
         taken_idx = is_pareto_efficient(points, take_n=n)
         return [configs[i] for i in taken_idx]
 
-    def dump_configs(self, filepath: PathLike):
+    def dump_configs(self, filepath: PathLike, best_only: bool = True):
         import os
 
         from jsonpickle import encode
@@ -237,20 +247,27 @@ class ApproxTuner(Generic[T]):
             )
         filepath = Path(filepath)
         os.makedirs(filepath.parent, exist_ok=True)
+        confs = self.best_configs if best_only else self.kept_configs
         with filepath.open("w") as f:
-            f.write(encode(self.best_configs, indent=2))
+            f.write(encode(confs, indent=2))
 
     def plot_configs(
-        self, show_qos_loss: bool = False, connect_best_points: bool = False
+        self,
+        show_qos_loss: bool = False,
+        connect_best_points: bool = False,
+        use_test_qos: bool = False,
     ) -> plt.Figure:
         if not self.tuned:
             raise RuntimeError(
                 f"No tuning session has been run; call self.tune() first."
             )
 
+        def qos_speedup(conf):
+            return conf.test_qos if use_test_qos else conf.qos, conf.speedup
+
         def get_points(confs):
             sorted_points = np.array(
-                sorted([c.qos_speedup for c in confs], key=lambda p: p[0])
+                sorted([qos_speedup(c) for c in confs], key=lambda p: p[0])
             ).T
             if show_qos_loss:
                 sorted_points[0] = self.baseline_qos - sorted_points[0]
@@ -297,9 +314,6 @@ class ApproxTuner(Generic[T]):
             **app_kwargs,
         )
 
-    def _get_app_kwargs(self, **kwargs):
-        return {}
-
     @classmethod
     def _get_config_class(cls) -> Type[Config]:
         return Config
diff --git a/predtuner/modeledapp.py b/predtuner/modeledapp.py
index adeb53383a81d0d03db9642e0dfde17896462ee9..a5d904a6a4234be6a609afad16def14795f260b8 100644
--- a/predtuner/modeledapp.py
+++ b/predtuner/modeledapp.py
@@ -5,6 +5,7 @@ import pickle
 from pathlib import Path
 from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, Union
 
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import torch
@@ -192,6 +193,8 @@ class QoSModelP1(IQoSModel):
         qos_metric: Callable[[torch.Tensor], float],
         storage: PathLike = None,
     ) -> None:
+        from torch.nn.functional import softmax
+
         super().__init__()
         self.app = app
         self.output_f = tensor_output_getter
@@ -387,32 +390,89 @@ class ApproxModeledTuner(ApproxTuner):
             qos_keep_threshold=qos_keep_threshold,
             is_threshold_relative=is_threshold_relative,
             take_best_n=take_best_n,
-            test_configs=test_configs,
-            cost_model=cost_model,
-            qos_model=qos_model,
+            test_configs=False,  # Test configs below by ourselves
+            app_kwargs={"cost_model": cost_model, "qos_model": qos_model},
         )
         if validate_configs is None and qos_model != "none":
             msg_logger.info(
                 'Validating configurations due to using qos model "%s"', qos_model
             )
-            self.validate_configs_(self.best_configs)
+            self.best_configs = self._update_configs(self.best_configs, False)
         elif validate_configs:
             msg_logger.info("Validating configurations as user requested")
-            self.validate_configs_(self.best_configs)
+            self.best_configs = self._update_configs(self.best_configs, False)
+        if test_configs:
+            msg_logger.info("Calibrating configurations on test inputs")
+            self.best_configs = self._update_configs(self.best_configs, True)
         return ret
 
-    def validate_configs_(self, configs: List[ValConfig]):
+    def _update_configs(self, configs: List[ValConfig], test_mode: bool):
+        from copy import deepcopy
+
         from tqdm import tqdm
 
+        assert self.keep_threshold is not None
+        if not configs:
+            msg_logger.info("No configurations found.")
+            return []
+        ret_configs = []
+        total_error = 0
         for cfg in tqdm(configs, leave=False):
-            cfg: ValConfig
-            if cfg.validated_qos is not None:
-                continue
-            cfg.validated_qos, _ = self.app.measure_qos_cost(cfg.knobs, False)
-            msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {cfg.test_qos} (mean)")
+            cfg = deepcopy(cfg)
+            qos, _ = self.app.measure_qos_cost(cfg.knobs, test_mode)
+            if test_mode:
+                assert cfg.test_qos is None
+                cfg.test_qos = qos
+                msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {qos} (mean)")
+            else:
+                assert cfg.validated_qos is None
+                cfg.validated_qos = qos
+                msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {qos} (mean)")
+            total_error += abs(cfg.qos - qos)
+            if qos > self.keep_threshold:
+                ret_configs.append(cfg)
+            else:
+                msg_logger.debug("Config removed")
+        mean_err = total_error / len(configs)
+        if test_mode:
+            msg_logger.info("QoS mean abs difference of calibration: %f", mean_err)
+        else:
+            msg_logger.info("QoS mean abs difference of validation: %f", mean_err)
+        msg_logger.info("%d of %d configs remain", len(ret_configs), len(configs))
+        return ret_configs
+
+    def plot_configs(
+        self, show_qos_loss: bool = False, connect_best_points: bool = False
+    ) -> plt.Figure:
+        if not self.tuned:
+            raise RuntimeError(
+                f"No tuning session has been run; call self.tune() first."
+            )
 
-    def _get_app_kwargs(self, cost_model: str, qos_model: str):
-        return {"cost_model": cost_model, "qos_model": qos_model}
+        def get_points(confs, validated):
+            def qos_speedup(conf):
+                return conf.validated_qos if validated else conf.qos, conf.speedup
+
+            sorted_points = np.array(
+                sorted([qos_speedup(c) for c in confs], key=lambda p: p[0])
+            ).T
+            if show_qos_loss:
+                sorted_points[0] = self.baseline_qos - sorted_points[0]
+            return sorted_points
+
+        fig, ax = plt.subplots()
+        kept_confs = get_points(self.kept_configs, False)
+        best_confs = get_points(self.best_configs, False)
+        best_confs_val = get_points(self.best_configs, True)
+        ax.plot(kept_confs[0], kept_confs[1], "o", label="valid")
+        mode = "-o" if connect_best_points else "o"
+        ax.plot(best_confs[0], best_confs[1], mode, label="best")
+        mode = "-o" if connect_best_points else "o"
+        ax.plot(best_confs_val[0], best_confs_val[1], mode, label="best_validated")
+        ax.set_xlabel("QoS Loss" if show_qos_loss else "QoS")
+        ax.set_ylabel("Speedup (x)")
+        ax.legend()
+        return fig
 
     @classmethod
     def _get_config_class(cls) -> Type[Config]:
diff --git a/predtuner/torchapp.py b/predtuner/torchapp.py
index 33ceaad991b005d632b48497230ee28744c9b2df..dac293f4201fa5bc5930a32f846937f7643ddc4e 100644
--- a/predtuner/torchapp.py
+++ b/predtuner/torchapp.py
@@ -153,6 +153,7 @@ class TorchApp(ModeledApp, abc.ABC):
                 target = move_to_device_recursively(target, self.device)
                 qos = self.tensor_to_qos(tensor_output[begin:end], target)
                 qoses.append(qos)
+                begin = end
             return self.combine_qos(np.array(qoses))
 
         p1_storage = self.model_storage / "p1.pkl" if self.model_storage else None