diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py
index 9f0178ff5599baf13e8496b9200d4d5a73a5b57c..8e871df9a660fd2dfbfe83b4839e8a44169254a9 100644
--- a/predtuner/approxapp.py
+++ b/predtuner/approxapp.py
@@ -40,7 +40,7 @@ class ApproxApp(abc.ABC):
 
     @abc.abstractmethod
     def measure_qos_perf(
-        self, with_approxes: KnobsT, is_calibration: bool
+        self, with_approxes: KnobsT, is_test: bool
     ) -> Tuple[float, float]:
         pass
 
@@ -68,12 +68,12 @@ class ApproxApp(abc.ABC):
 
 class Config:
     def __init__(
-        self, qos: float, perf: float, knobs: KnobsT, calib_qos: Optional[float] = None
+        self, qos: float, perf: float, knobs: KnobsT, test_qos: Optional[float] = None
     ) -> None:
         self.qos = qos
         self.perf = perf
         self.knobs = knobs
-        self.calib_qos: Optional[float] = calib_qos
+        self.test_qos: Optional[float] = test_qos
 
 
 T = TypeVar("T", bound=Config)
@@ -103,7 +103,7 @@ class ApproxTuner(Generic[T]):
         qos_keep_threshold: Optional[float] = None,
         is_threshold_relative: bool = False,
         take_best_n: Optional[int] = None,
-        calibrate: bool = True,
+        test_configs: bool = True,
         **kwargs
         # TODO: more parameters + opentuner param forwarding
     ) -> List[T]:
@@ -158,20 +158,20 @@ class ApproxTuner(Generic[T]):
             len(self.kept_configs),
             len(self.best_configs),
         )
-        if calibrate:
-            msg_logger.info("Calibrating configurations on calibration inputs")
-            self.calibrate_configs_(self.best_configs)
+        if test_configs:
+            msg_logger.info("Checking configurations on test inputs")
+            self.test_configs_(self.best_configs)
         return self.best_configs
 
-    def calibrate_configs_(self, configs: List[T]):
+    def test_configs_(self, configs: List[T]):
         from tqdm import tqdm
 
         for cfg in tqdm(configs, leave=False):
             cfg: T
-            if cfg.calib_qos is not None:
+            if cfg.test_qos is not None:
                 continue
-            cfg.calib_qos, _ = self.app.measure_qos_perf(cfg.knobs, True)
-            msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {cfg.calib_qos} (mean)")
+            cfg.test_qos, _ = self.app.measure_qos_perf(cfg.knobs, True)
+            msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {cfg.test_qos} (mean)")
 
     @staticmethod
     def take_best_configs(configs: List[T], n: Optional[int] = None) -> List[T]:
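
Note on the approxapp.py rename above: the tuner entry point now takes `test_configs=` where it took `calibrate=`, and the held-out QoS of each kept configuration lands in `Config.test_qos` (formerly `calib_qos`). A minimal usage sketch under the new names; the tuner construction and the concrete values are assumptions, not part of this patch:

    configs = tuner.tune(
        # ...search arguments preceding these are outside this hunk...
        qos_keep_threshold=3.0,    # illustrative value
        is_threshold_relative=True,
        take_best_n=20,
        test_configs=True,         # was `calibrate=True` before this patch
    )
    for cfg in configs:
        # `cfg.qos` was measured on the tuning inputs during search;
        # `cfg.test_qos` is re-measured on the held-out test inputs.
        print(cfg.knobs, cfg.qos, cfg.test_qos)
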
diff --git a/predtuner/modeledapp.py b/predtuner/modeledapp.py
index 2f02418da78a3505d1963e8f5cba68d4327e0278..6160c989daed68f545097bee42c76373a60e0cd3 100644
--- a/predtuner/modeledapp.py
+++ b/predtuner/modeledapp.py
@@ -42,10 +42,10 @@ class ModeledApp(ApproxApp, abc.ABC):
         pass
 
     def empirical_measure_qos_perf(
-        self, with_approxes: KnobsT, is_calibration: bool
+        self, with_approxes: KnobsT, is_test: bool
     ) -> Tuple[float, float]:
         """Measures QoS and performance by running the program with approximation.
-        
+
         An implementation is not necessary if empirical measurement is never intended.
         """
         raise NotImplementedError()
@@ -53,7 +53,7 @@ class ModeledApp(ApproxApp, abc.ABC):
     def measure_qos_perf(
         self,
         with_approxes: KnobsT,
-        is_calibration: bool,
+        is_test: bool,
         qos_model: str = "none",
         perf_model: str = "none",
     ) -> Tuple[float, float]:
@@ -63,12 +63,12 @@ class ModeledApp(ApproxApp, abc.ABC):
         is "none", otherwise only use model indicated by model name.
         """
         # Testset measurement is always empirical
-        if is_calibration:
-            return self.empirical_measure_qos_perf(with_approxes, is_calibration)
+        if is_test:
+            return self.empirical_measure_qos_perf(with_approxes, is_test)
         # Run empirical measurement once if either perf or qos needs it
         qos, perf = None, None
         if qos_model == "none" or perf_model == "none":
-            qos, perf = self.empirical_measure_qos_perf(with_approxes, is_calibration)
+            qos, perf = self.empirical_measure_qos_perf(with_approxes, is_test)
         # If we're asked to use some qos_model, overwrite `qos` value
         # even if we already get it from empirical measure (i.e., even if perf_model == "none")
         if qos_model != "none":
@@ -170,13 +170,13 @@ class LinearPerfModel(IPerfModel):
 
 class QoSModelP1(IQoSModel):
     """QoS model `P1` in ApproxTuner.
-    
+
     tensor_output_getter: Run the tensor-based application with config `with_approxes` applied,
         and return a single tensor result.
 
         Note that while we require the return value to be a PyTorch tensor,
         user is free to implement this on non-PyTorch applications.
-    
+
     qos_metric: Compute a Quality of Service level from the tensor output of application
     """
 
@@ -338,10 +338,10 @@ class ValConfig(Config):
         qos: float,
         perf: float,
         knobs: KnobsT,
-        calib_qos: Optional[float] = None,
+        test_qos: Optional[float] = None,
         validated_qos: Optional[float] = None,
     ) -> None:
-        super().__init__(qos, perf, knobs, calib_qos)
+        super().__init__(qos, perf, knobs, test_qos)
         self.validated_qos = validated_qos
 
 
@@ -355,8 +355,8 @@ class ApproxModeledTuner(ApproxTuner):
         qos_keep_threshold: Optional[float] = None,
         is_threshold_relative: bool = False,
         take_best_n: Optional[int] = None,
-        calibrate: bool = True,
-        validate: Optional[bool] = None,
+        test_configs: bool = True,
+        validate_configs: Optional[bool] = None,
         perf_model: str = "none",
         qos_model: str = "none",
     ) -> List[ValConfig]:
@@ -381,16 +381,16 @@ class ApproxModeledTuner(ApproxTuner):
             qos_keep_threshold=qos_keep_threshold,
             is_threshold_relative=is_threshold_relative,
             take_best_n=take_best_n,
-            calibrate=calibrate,
+            test_configs=test_configs,
             perf_model=perf_model,
             qos_model=qos_model,
         )
-        if validate is None and qos_model != "none":
+        if validate_configs is None and qos_model != "none":
             msg_logger.info(
                 'Validating configurations due to using qos model "%s"', qos_model
             )
             self.validate_configs_(self.best_configs)
-        elif validate:
+        elif validate_configs:
             msg_logger.info("Validating configurations as user requested")
             self.validate_configs_(self.best_configs)
         return ret
@@ -403,7 +403,7 @@ class ApproxModeledTuner(ApproxTuner):
             if cfg.validated_qos is not None:
                 continue
             cfg.validated_qos, _ = self.app.measure_qos_perf(cfg.knobs, False)
-            msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {cfg.calib_qos} (mean)")
+            msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {cfg.test_qos} (mean)")
 
     def _get_app_kwargs(self, perf_model: str, qos_model: str):
         return {"perf_model": perf_model, "qos_model": qos_model}
diff --git a/predtuner/torchapp.py b/predtuner/torchapp.py
index 6cad524cc8cb0293867d70820e71f0c8acccb861..0160c0cc14950abe0df068d84934c76224ca0939 100644
--- a/predtuner/torchapp.py
+++ b/predtuner/torchapp.py
@@ -54,8 +54,8 @@ class TorchApp(ModeledApp, abc.ABC):
         self,
         app_name: str,
         module: Module,
-        val_loader: DataLoader,
-        test_loader: DataLoader,
+        tune_dataloader: DataLoader,
+        test_dataloader: DataLoader,
         knobs: Set[TorchApproxKnob],
         tensor_to_qos: Callable[[torch.Tensor, Any], float],
         combine_qos: Callable[[np.ndarray], float] = np.mean,
@@ -64,8 +64,8 @@ class TorchApp(ModeledApp, abc.ABC):
     ) -> None:
         self.app_name = app_name
         self.module = module
-        self.val_loader = val_loader
-        self.test_loader = test_loader
+        self.tune_loader = tune_dataloader
+        self.test_loader = test_dataloader
         self.name_to_knob = {k.name: k for k in self._check_baseline_knob(knobs)}
         self.tensor_to_qos = tensor_to_qos
         self.combine_qos = combine_qos
@@ -100,11 +100,11 @@ class TorchApp(ModeledApp, abc.ABC):
 
     def get_models(self) -> List[Union[IPerfModel, IQoSModel]]:
         def batched_valset_qos(tensor_output: torch.Tensor):
-            dataset_len = len(self.val_loader.dataset)
+            dataset_len = len(self.tune_loader.dataset)
             assert len(tensor_output) == dataset_len
             begin = 0
             qoses = []
-            for _, target in self.val_loader:
+            for _, target in self.tune_loader:
                 end = begin + len(target)
                 target = move_to_device_recursively(target, self.device)
                 qos = self.tensor_to_qos(tensor_output[begin:end], target)
@@ -121,11 +121,11 @@ class TorchApp(ModeledApp, abc.ABC):
 
     @torch.no_grad()
     def empirical_measure_qos_perf(
-        self, with_approxes: KnobsT, is_calibration: bool
+        self, with_approxes: KnobsT, is_test: bool
     ) -> Tuple[float, float]:
         from time import time_ns
 
-        dataloader = self.test_loader if is_calibration else self.val_loader
+        dataloader = self.test_loader if is_test else self.tune_loader
         approxed = self._apply_knobs(with_approxes)
         qoses = []
 
@@ -151,7 +151,7 @@ class TorchApp(ModeledApp, abc.ABC):
     def _get_raw_output_valset(self, with_approxes: KnobsT):
         approxed = self._apply_knobs(with_approxes)
         all_outputs = []
-        for inputs, _ in self.val_loader:
+        for inputs, _ in self.tune_loader:
             inputs = move_to_device_recursively(inputs, self.device)
             outputs = approxed(inputs)
             all_outputs.append(outputs)
@@ -183,7 +183,7 @@ class TorchApp(ModeledApp, abc.ABC):
         return module_indexer.module
 
     def _sample_input(self):
-        inputs, _ = next(iter(self.val_loader))
+        inputs, _ = next(iter(self.tune_loader))
         return inputs.to(self.device)
 
 
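
In torchapp.py the two datasets are now named by role: `tune_dataloader` feeds tuning, QoS-model fitting, and `_sample_input`, while `test_dataloader` is read only when `is_test=True`. The `tensor_to_qos` callback is untouched by this patch; for orientation, a top-1 classification QoS could look like the sketch below (an illustration, not necessarily the `accuracy` helper the tests import):

    import torch

    def accuracy(output: torch.Tensor, target: torch.Tensor) -> float:
        # Percentage of correct top-1 predictions for this batch of outputs.
        _, pred = output.max(dim=1)
        return (pred == target).float().mean().item() * 100
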
diff --git a/test/integrated_tuning.py b/test/integrated_tuning.py
index 93c370449bac50ed1a438e801851dab1b890c6d4..20682142e4dd69d977ff45cf959665fe983fe5f4 100644
--- a/test/integrated_tuning.py
+++ b/test/integrated_tuning.py
@@ -15,11 +15,11 @@ dataset = CIFAR.from_file(
     "model_data/cifar10/input.bin", "model_data/cifar10/labels.bin"
 )
 tune_loader = DataLoader(Subset(dataset, range(500)), batch_size=500)
-calib_loader = DataLoader(Subset(dataset, range(5000, 5500)), batch_size=500)
+test_loader = DataLoader(Subset(dataset, range(5000, 5500)), batch_size=500)
 module = VGG16Cifar10()
 module.load_state_dict(torch.load("model_data/vgg16_cifar10.pth.tar"))
 app = TorchApp(
-    "TestTorchApp", module, tune_loader, calib_loader, get_knobs_from_file(), accuracy,
+    "TestTorchApp", module, tune_loader, test_loader, get_knobs_from_file(), accuracy,
     model_storage_folder="tuner_results/vgg16_cifar10"
 )
 baseline, _ = app.measure_qos_perf({}, False)
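
The last line measures the baseline QoS on the tuning inputs (`is_test=False` under the new name), which is what a relative threshold is anchored to. A plausible continuation of the script; the accessor and the numbers are assumptions rather than part of this diff:

    tuner = app.get_tuner()        # assumed accessor; not shown in this patch
    configs = tuner.tune(
        qos_keep_threshold=3.0,    # with is_threshold_relative, roughly `baseline - 3.0`
        is_threshold_relative=True,
        test_configs=True,         # re-measure kept configs on test_loader
    )
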
diff --git a/test/test_torchapp.py b/test/test_torchapp.py
index 2b4650bef46c08fdf2b816b5272a913e4281d476..0f58e07a542f5fc0a48aa1a2f22b5786d64b96c5 100644
--- a/test/test_torchapp.py
+++ b/test/test_torchapp.py
@@ -82,10 +82,10 @@ class TestTorchAppTunerResult(TorchAppSetUp):
                 any(c2.qos > c1.qos and c2.perf > c1.perf for c2 in configs)
             )
 
-    def test_dummy_calib(self):
+    def test_dummy_testset(self):
         configs = self.tuner.best_configs
         for c in configs:
-            self.assertAlmostEqual(c.calib_qos, c.qos)
+            self.assertAlmostEqual(c.test_qos, c.qos)
 
 
 class TestModeledTuning(TorchAppSetUp):