diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py
index e2800fe97d764b9a9f39a10a31151bb7ac5d2ae2..1ca593f1320117a90f13ed1fa2c6b5bd86591123 100644
--- a/predtuner/approxapp.py
+++ b/predtuner/approxapp.py
@@ -1,11 +1,12 @@
 import abc
 import logging
 from pathlib import Path
-from typing import Dict, List, NamedTuple, Optional, Tuple, Union
+from typing import Dict, Generic, List, NamedTuple, Optional, Tuple, TypeVar, Union
 
 import numpy as np
 import matplotlib.pyplot as plt
 from opentuner.measurement.interface import MeasurementInterface
+from opentuner.resultsdb.models import Configuration, Result
 from opentuner.search.manipulator import ConfigurationManipulator, EnumParameter
 
 from ._logging import override_opentuner_config
@@ -41,7 +42,7 @@ class ApproxApp(abc.ABC):
 
     @abc.abstractmethod
     def measure_qos_perf(
-        self, with_approxes: KnobsT, is_testset: bool
+        self, with_approxes: KnobsT, is_calibration: bool
     ) -> Tuple[float, float]:
         pass
 
@@ -58,17 +59,50 @@
         return ""
 
 
-class Config(NamedTuple):
-    qos: float
-    perf: float
-    knobs: KnobsT
+class Config:
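+    """An approximation configuration found by the tuner.
+
+    `qos` is the QoS measured during tuning, `calib_qos` is the QoS
+    re-measured on the calibration set (`None` until calibration runs),
+    `perf` is the measured performance, and `knobs` is the knob
+    selection this config applies.
+    """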
+    def __init__(
+        self, qos: float, calib_qos: Optional[float], perf: float, knobs: KnobsT
+    ) -> None:
+        self.qos = qos
+        self.calib_qos = calib_qos
+        self.perf = perf
+        self.knobs = knobs
+
+
+T = TypeVar("T", bound=Config)
 
 
-class ApproxTuner:
+# ApproxTuner is generic over the config type T so that users can plug in
+# a custom subclass of Config (in which case they must also override
+# `get_all_configs_from_db` to construct that subclass).
+class ApproxTuner(Generic[T]):
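+    """Tunes an `ApproxApp` and keeps the best configurations found.
+
+    A rough sketch of the subclassing described above
+    (`MyConfig` and `MyTuner` are hypothetical names):
+
+        class MyConfig(Config): ...
+
+        class MyTuner(ApproxTuner[MyConfig]):
+            @classmethod
+            def get_all_configs_from_db(cls, results_configs):
+                return tuple(
+                    MyConfig(r.accuracy, None, r.time, c.data)
+                    for r, c in results_configs
+                )
+    """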
     def __init__(self, app: ApproxApp) -> None:
         self.app = app
         self.all_configs = []
         self.kept_configs = []
+        self.best_configs = []
         self.keep_threshold = None
         self._db = None
 
@@ -81,7 +93,9 @@ class ApproxTuner:
         max_iter: int,
         qos_tuner_threshold: float,
         qos_keep_threshold: Optional[float] = None,
-        accuracy_convention: str = "absolute"  # TODO: this
+        accuracy_convention: str = "absolute",  # TODO: this
+        take_best_n: Optional[int] = None,
+        calibrate: bool = True
         # TODO: more parameters + opentuner param forwarding
     ) -> List[Config]:
         """Generate an optimal set of approximation configurations for the model."""
@@ -104,18 +118,48 @@
 
         # Parse and store results
         self._db = opentuner_args.database
-        self.all_configs = [
-            Config(result.accuracy, result.time, configuration.data)
-            for result, configuration in read_opentuner_db(self._db)
-        ]
+        self.keep_threshold = qos_keep_threshold
+        self.all_configs = list(
+            self.get_all_configs_from_db(read_opentuner_db(self._db))
+        )
         self.kept_configs = [
             cfg for cfg in self.all_configs if cfg.qos > qos_keep_threshold
         ]
-        self.keep_threshold = qos_keep_threshold
-        return self.kept_configs
+        self.best_configs = self.take_best_configs(self.kept_configs, take_best_n)
+        if calibrate:
+            self.calibrate_configs_(self.best_configs)
+        return self.best_configs
+
+    @classmethod
+    def get_all_configs_from_db(
+        cls, results_configs: List[Tuple[Result, Configuration]]
+    ) -> Tuple[T, ...]:
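+        """Converts raw (result, configuration) rows from the OpenTuner
+        database into `Config` objects; subclasses using a custom config
+        type should override this."""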
+        return tuple(
+            Config(result.accuracy, None, result.time, configuration.data)
+            for result, configuration in results_configs
+        )
+
+    def calibrate_configs_(self, configs: List[T]):
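+        """Re-measures the QoS of each config on the calibration set (in
+        place), skipping configs that already have a `calib_qos`."""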
+        from tqdm import tqdm
+
+        for cfg in tqdm(configs, leave=False):
+            cfg: T
+            if cfg.calib_qos is not None:
+                continue
+            cfg.calib_qos, _ = self.app.measure_qos_perf(cfg.knobs, True)
+            msg_logger.debug(f"Calibration: {cfg.qos} (tune) -> {cfg.calib_qos} (calib)")
 
-    def take_best_configs(self, n: Optional[int] = None) -> List[Config]:
-        configs = self.kept_configs
+    @staticmethod
+    def take_best_configs(
+        configs: List[Config], n: Optional[int] = None
+    ) -> List[Config]:
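+        """Returns the Pareto-optimal subset of `configs` in the
+        (perf, qos) space, taking at most `n` of them when `n` is given."""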
         points = np.array([[c.perf, c.qos] for c in configs])
         taken_idx = is_pareto_efficient(points, take_n=n)
         return [configs[i] for i in taken_idx]
diff --git a/predtuner/modeledapp.py b/predtuner/modeledapp.py
index 4c01fa533c8df0f42232dba3dc6e4deb7800390c..6745999558d97ff31730014b834f41c02ad070b8 100644
--- a/predtuner/modeledapp.py
+++ b/predtuner/modeledapp.py
@@ -30,7 +30,7 @@ class ModeledApp(ApproxApp, abc.ABC):
         pass
 
     def empirical_measure_qos_perf(
-        self, with_approxes: KnobsT, is_testset: bool
+        self, with_approxes: KnobsT, is_calibration: bool
     ) -> Tuple[float, float]:
         """Measures QoS and performance by running the program with approximation.
         
@@ -41,7 +41,7 @@ class ModeledApp(ApproxApp, abc.ABC):
     def measure_qos_perf(
         self,
         with_approxes: KnobsT,
-        is_testset: bool,
+        is_calibration: bool,
         qos_model: str = "none",
         perf_model: str = "none",
     ) -> Tuple[float, float]:
@@ -51,12 +51,12 @@ class ModeledApp(ApproxApp, abc.ABC):
         is "none", otherwise only use model indicated by model name.
         """
-        # Testset measurement is always empirical
-        if is_testset:
-            return self.empirical_measure_qos_perf(with_approxes, is_testset)
+        # Calibration measurement is always empirical
+        if is_calibration:
+            return self.empirical_measure_qos_perf(with_approxes, is_calibration)
         # Run empirical measurement once if either perf or qos needs it
         qos, perf = None, None
         if qos_model == "none" or perf_model == "none":
-            qos, perf = self.empirical_measure_qos_perf(with_approxes, is_testset)
+            qos, perf = self.empirical_measure_qos_perf(with_approxes, is_calibration)
         # If we're asked to use some qos_model, overwrite `qos` value
         # even if we already get it from empirical measure (i.e., even if perf_model == "none")
         if qos_model != "none":
@@ -65,7 +65,7 @@ class ModeledApp(ApproxApp, abc.ABC):
                     f'"{qos_model}" is an invalid value for qos_model '
                     f"(choose from {list(self._qos_models.keys())})"
                 )
-            qos = self._qos_models[qos_model].measure_qos(with_approxes, is_testset)
+            qos = self._qos_models[qos_model].measure_qos(with_approxes, is_calibration)
         # Same goes for perf
         if perf_model != "none":
             if perf_model not in self._perf_models:
@@ -73,7 +73,7 @@ class ModeledApp(ApproxApp, abc.ABC):
                     f'"{perf_model}" is an invalid value for perf_model '
                     f"(choose from {list(self._perf_models.keys())})"
                 )
-            perf = self._perf_models[perf_model].measure_perf(with_approxes, is_testset)
+            perf = self._perf_models[perf_model].measure_perf(with_approxes, is_calibration)
         assert qos is not None and perf is not None
         return qos, perf
 
diff --git a/predtuner/torchapp.py b/predtuner/torchapp.py
index cb4a0a78f6422ac2e0de5cd8c74fcfcf89aaa90b..172abb04e00f8c95c487c0ce4eaab7220c182643 100644
--- a/predtuner/torchapp.py
+++ b/predtuner/torchapp.py
@@ -121,11 +121,13 @@ class TorchApp(ModeledApp, abc.ABC):
 
     @torch.no_grad()
     def empirical_measure_qos_perf(
-        self, with_approxes: KnobsT, is_testset: bool
+        self, with_approxes: KnobsT, is_calibration: bool
     ) -> Tuple[float, float]:
         from time import time_ns
 
-        dataloader = self.test_loader if is_testset else self.val_loader
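+        # Calibration measures on the held-out test set; non-calibration
+        # (tuning-time) measurements use the validation set.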
+        dataloader = self.test_loader if is_calibration else self.val_loader
         approxed = self._apply_knobs(with_approxes)
         qoses = []
 
diff --git a/test/test_torchapp.py b/test/test_torchapp.py
index 75fb06b573adb40191cc58cb9b1022dd4041ef9b..117a6f2e05621aee903af96bc5eb80759037c6b6 100644
--- a/test/test_torchapp.py
+++ b/test/test_torchapp.py
@@ -57,8 +57,13 @@ class TestTorchAppTuner(TestTorchAppInit):
             self.assertTrue(conf.qos > self.baseline - 3.0)
 
     def test_pareto(self):
-        configs = self.tuner.take_best_configs()
+        configs = self.tuner.best_configs
         for c1 in configs:
             self.assertFalse(
                 any(c2.qos > c1.qos and c2.perf > c1.perf for c2 in configs)
             )
+
+    def test_dummy_calib(self):
+        configs = self.tuner.best_configs
+        for c in configs:
+            self.assertAlmostEqual(c.calib_qos, c.qos)