diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..2da9be9ac4ecfec68edac79f118cfe0be5e99862
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "examples/stand_count/pytorch-ssd"]
+    path = examples/stand_count/pytorch-ssd
+    url = git@github.com:qfgaohao/pytorch-ssd.git
diff --git a/README.md b/README.md
index 41c48098915389597067dccba0c3ddb02c608155..2ec57bbf2e55fe7e61edf4da212cbb944cb62c42 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,42 @@
 # Autotuning and Predictive Autotuning
 
-Performs autotuning on program approximation knobs using an error-predictive proxy in place of the
-original program, to greatly speedup autotuning while getting results comparable in quality.
+`predtuner` performs autotuning on program approximation knobs using an error-predictive proxy
+in place of the original program, to greatly speed up autotuning while getting results
+comparable in quality.
 
 Work in progress.
 
 ## Requirements
 
-Prerequisite packages are listed in `./env.yaml`. Conda is the validated and recommended way to set
-up a working environment. If you're using conda, do
+`pip` is needed to install this package. At the root directory of this repo, do:
 
 ```bash
-conda env create -n predtuner -f env.yaml
-conda activate predtuner
+pip install -e .
 ```
+
+`-e` can be omitted if you don't intend to modify the code in this package.
+
+## Getting Started
+
+The documentation contains a full tutorial.
+Build the documentation with:
+
+```bash
+pip install sphinx sphinx_rtd_theme sphinx_autodoc_typehints
+cd doc
+make html
+```
+
+The documentation page will be created as `doc/build/html/index.html`.
+You can open this in a browser and go to the "Getting Started" section.
+
+### Model Data for Example / Testing
+
+`predtuner` contains 10 demo models which are also used in the tests.
+
+- Download and extract [this](https://drive.google.com/file/d/1V_yd9sKcZQ7zhnO5YhRpOsaBPLEEvM9u/view?usp=sharing) file containing all 10 models, for testing purposes.
+- The "Getting Started" example on the documentation page only uses VGG16-CIFAR10.
+  If you don't need the other models, get the data for VGG16-CIFAR10
+  [here](https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing).
+
+In either case, there should be a `model_params/` folder at the root of the repo after extraction.
diff --git a/doc/getting_started.rst b/doc/getting_started.rst
index 75381aa720fdee717d20db115ecbc74e73deff4c..5d8472a5a224b6677df4f080bf6309bff6bae016 100644
--- a/doc/getting_started.rst
+++ b/doc/getting_started.rst
@@ -1,2 +1,164 @@
 Getting Started
 ===================
+
+This guide can help you start working with PredTuner.
+
+Installation
+------------
+
+Install PredTuner from source using `pip`:
+
+.. code-block:: shell
+
+    pip install -e .
+
+PredTuner will also be available on PyPI after we publish the first release.
+
+Tuning a PyTorch DNN
+--------------------
+
+PredTuner can tune any user-defined application,
+but it is optimized for tuning DNN applications defined in PyTorch.
+
+We will use models predefined in PredTuner for demonstration purposes.
+Download the pretrained VGG16 model parameters and the CIFAR10 dataset from `here
+<https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing>`_.
+After extraction, there should be a `model_params/` folder in the current directory.
+
+Load the tuning and test subsets of the CIFAR10 dataset, and create a pretrained VGG16 model:
+
+.. code-block:: python
+
+    from pathlib import Path
+
+    import predtuner as pt
+    import torch
+    from predtuner.model_zoo import CIFAR, VGG16Cifar10
+    from torch.utils.data.dataloader import DataLoader
+
+    prefix = Path("model_params/vgg16_cifar10")
+    tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
+    tune_loader = DataLoader(tune_set, batch_size=500)
+    test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
+    test_loader = DataLoader(test_set, batch_size=500)
+
+    module = VGG16Cifar10()
+    module.load_state_dict(torch.load("model_params/vgg16_cifar10.pth.tar"))
+
+PredTuner provides a logging mechanism.
+While not required, it's recommended that you set up the logger to output into a file:
+
+.. code-block:: python
+
+    msg_logger = pt.config_pylogger(output_dir="vgg16_cifar10/", verbose=True)
+
+For each tuning task, both a tuning dataset and a test dataset are required.
+The tuning dataset is used to evaluate the accuracy of the application in the autotuning stage,
+while the test dataset is used to evaluate configurations found in autotuning.
+This is similar to the split between training and validation sets in machine learning tasks.
+In this case, both the tuning and test datasets contain 5000 images.
+
+Create an instance of `TorchApp` for tuning a PyTorch DNN:
+
+.. code-block:: python
+
+    app = pt.TorchApp(
+        "TestTorchApp",  # Application name -- can be anything
+        module,
+        tune_loader,
+        test_loader,
+        knobs=pt.get_knobs_from_file(),
+        tensor_to_qos=pt.accuracy,
+        model_storage_folder="vgg16_cifar10/",
+    )
+
+PredTuner provides `TorchApp`, which is specialized for the use scenario of tuning PyTorch DNNs.
+In addition, two more functions from PredTuner are used:
+
+`pt.accuracy` is the *classification accuracy* metric,
+which receives the probability distribution output from the VGG16 model,
+compares it to the groundtruth in the dataset,
+and returns a scalar between 0 and 100 for the classification accuracy.
+
+`pt.get_knobs_from_file()` returns a set of approximations preloaded in PredTuner,
+which are applied to `torch.nn.Conv2d` layers.
+See ??? for these approximations and how to define custom approximations.
+
+Now we can obtain a tuner object from the application and start tuning.
+We will keep configurations that don't exceed 3% loss of accuracy,
+but encourage the tuner to find configurations with loss of accuracy below 2.1%.
+
+.. code-block:: python
+
+    tuner = app.get_tuner()
+    tuner.tune(
+        max_iter=500,
+        qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
+        qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the configurations
+        is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+        take_best_n=50,
+        cost_model="cost_linear",  # Use linear cost predictor
+    )
+
+**QoS** (quality of service) is a general term for the quality of the application after approximations are applied;
+e.g., here it refers to the accuracy of the DNN over the given datasets.
+We will be using the term QoS throughout the tutorials.
+
+`max_iter` defines the number of iterations to use in autotuning.
+Within 500 iterations, PredTuner should find about 200 valid configurations.
+PredTuner will also automatically mark out `Pareto-optimal
+<https://en.wikipedia.org/wiki/Pareto_efficiency>`_
+configurations.
+These are called "best" configurations (`tuner.best_configs`),
+in contrast to "valid" configurations, which are those that satisfy our accuracy requirements
+(`tuner.kept_configs`).
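+
+As a quick check, you can also inspect these configurations directly in Python.
+The following is an illustrative sketch (not required for tuning); it assumes the
+tuning session above has finished, and uses the `qos`, `speedup`, and `knobs`
+attributes of each configuration object:
+
+.. code-block:: python
+
+    # Print the Pareto-optimal ("best") configurations found by the tuner.
+    # `knobs` records which approximation was chosen for each tunable operator.
+    for config in tuner.best_configs:
+        print(f"QoS = {config.qos:.2f}, speedup = {config.speedup:.2f}x")
+        print(f"  knobs: {config.knobs}")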
+`take_best_n` allows taking some extra near-optimal configurations in addition to the Pareto-optimal ones.
+
+500 iterations is used here for demonstration; in practice,
+at least 10000 iterations are necessary for VGG16-sized models to converge to a good set of configurations.
+Depending on hardware performance, this tuning should take several minutes to several tens of minutes.
+
+Saving Tuning Results
+---------------------
+
+Now that the `tuner` object holds the tuning results,
+we can export them to a JSON file
+and visualize all configurations in a figure:
+
+.. code-block:: python
+
+    tuner.dump_configs("vgg16_cifar10/configs.json", best_only=False)
+    fig = tuner.plot_configs(show_qos_loss=True)
+    fig.savefig("vgg16_cifar10/configs.png")
+
+The generated figure should look like this:
+
+.. image:: tuning_result.png
+
+where the blue points show the QoS and speedup of all valid configurations,
+and the "best" configurations are marked in orange.
+
+Autotuning with a QoS Model
+-------------------------------------
+
+The tuning session shown above is already slow,
+and will be much slower with larger models, more iterations, and multiple tuning thresholds.
+Instead, we can use a *QoS prediction model*, which predicts the QoS
+with some inaccuracy but is much faster than running the application.
+To do that, simply use the argument `qos_model` when calling `tuner.tune()`:
+
+.. code-block:: python
+
+    tuner = app.get_tuner()
+    tuner.tune(
+        max_iter=500,
+        qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
+        qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the configurations
+        is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+        take_best_n=50,
+        cost_model="cost_linear",  # Use linear cost predictor
+        qos_model="qos_p1",
+    )
+
+The QoS model will first undergo an initialization stage (which takes a bit of time),
+during which it learns the behavior of each knob on each operator (DNN layer).
+Because the configurations will end up with predicted QoS values after tuning,
+a *validation* stage is added at the end of tuning, where the QoS of the best configurations
+is empirically measured and the bad ones are removed.
diff --git a/doc/index.rst b/doc/index.rst
index 6866a98b3d779063329219cd2c49f8b2c6feb41d..b79d28a42bd60549b7e2921b29854948e0aece48 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -9,27 +9,21 @@
 PredTuner performs autotuning on approximation choices for a program
 using an error-predictive proxy instead of executing the program,
 to greatly speedup autotuning while getting results of comparable quality.
 
-PredTuner is a contribution of [ApproxTuner]
-(https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations).
-
-Short-term Goals
-- Measure accuracy impact of approximations
-- Obtain a tuned, approximated CNN in <5 lines of code
-- Easy to manage multiple approximation configs
-- Easy to load and manage prior tuning results
-- Flexible retraining support
-Possible Long-term Goals
-- High-performance implementations of approximate layers
-- Allow users to register their own approximations
-- Support for other frameworks: TF, ONNX, JAX
+PredTuner is a main component of `ApproxTuner
+<https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations>`_.
 
-Documentation
--------------
-.. only:: html
+Solution for Efficient Approximation Autotuning
+-----------------------------------------------
+
+- Start a tuning session in 10 lines of code
+- Deep integration with PyTorch for DNN support
+- Multiple levels of APIs for generality and ease-of-use
+- Effective accuracy prediction models
+- Easily store and visualize tuning results in many formats
 
-   :Release: |version|
-   :Date: |today|
+Documentation
+-------------
 
 .. toctree::
    :maxdepth: 1
@@ -41,6 +35,3 @@ Indices and tables
 ------------------
 
 * :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-* :ref:`glossary`
diff --git a/doc/tuning_result.png b/doc/tuning_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..6210102982aefa78a796a7a8593fe766e802dbf9
Binary files /dev/null and b/doc/tuning_result.png differ
diff --git a/examples/stand_count/metric.py b/examples/stand_count/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..101b0792dc29de78b66afe877b56ed91212b9362
--- /dev/null
+++ b/examples/stand_count/metric.py
@@ -0,0 +1,38 @@
+import numpy as np
+import torch
+from mean_average_precision import MeanAveragePrecision2d
+
+
+def calculate_mAP(gt_boxes, pred_boxes, pred_scores, style="pascal"):
+    if style == "pascal":
+        settings = {
+            "iou_thresholds": 0.5,
+            "recall_thresholds": np.arange(0.0, 1.1, 0.1),
+        }
+    elif style == "coco":
+        settings = {
+            "iou_thresholds": np.arange(0.5, 1.0, 0.05),
+            "recall_thresholds": np.arange(0.0, 1.01, 0.01),
+            "mpolicy": "soft",
+        }
+    else:
+        raise ValueError(f"Unrecognized style {style}")
+
+    map_calc = MeanAveragePrecision2d(num_classes=1)
+    torch2np = lambda x: x.detach().cpu().numpy()
+    for gt_boxes_, pred_boxes_, pred_scores_ in zip(gt_boxes, pred_boxes, pred_scores):
+        # Both groundtruth and predicted boxes need to be ndarrays.
+ # Format for groundtruth: [xmin, ymin, xmax, ymax, class_id, difficult, crowd] + # Format for predicted: [xmin, ymin, xmax, ymax, class_id, confidence] + n_gt, n_pred = len(gt_boxes_), len(pred_boxes_) + gt_np = np.zeros((n_gt, 7)) + gt_np[:, :4] = torch2np(gt_boxes_) + gt_np[:, 4] = 0 # Class ID + pred_np = np.zeros((n_pred, 6)) + pred_np[:, :4] = torch2np(pred_boxes_) + pred_np[:, 4] = 0 # Class ID + pred_np[:, 5] = torch2np(pred_scores_) + map_calc.add(pred_np, gt_np) + # Calculate mAP + mAP = map_calc.value(**settings)["mAP"] + return torch.tensor(mAP) diff --git a/examples/stand_count/pytorch-ssd b/examples/stand_count/pytorch-ssd new file mode 160000 index 0000000000000000000000000000000000000000..f61ab424d09bf3d4bb3925693579ac0a92541b0d --- /dev/null +++ b/examples/stand_count/pytorch-ssd @@ -0,0 +1 @@ +Subproject commit f61ab424d09bf3d4bb3925693579ac0a92541b0d diff --git a/examples/stand_count/ssd.py b/examples/stand_count/ssd.py new file mode 100644 index 0000000000000000000000000000000000000000..bfca965ca1846fd7843ca83ad3fa749a75d0f1ab --- /dev/null +++ b/examples/stand_count/ssd.py @@ -0,0 +1,211 @@ +from pathlib import Path +from typing import Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch.nn.modules.module import Module +from torch.utils.data.dataloader import default_collate + +from .vision.nn.multibox_loss import MultiboxLoss +from .vision.ssd.ssd import SSD, MatchPrior +from .vision.utils import box_utils + +# common across ssd configs: +center_variance = 0.1 +size_variance = 0.2 + + +class SSDWrapper(Module): + def __init__( + self, + net: SSD, + size: int, + image_mean: np.ndarray, + image_std: float, + priors: torch.Tensor, + ) -> None: + Module.__init__(self) + self.net = net.eval() + self.input_shape = 1, 3, size, size + mean = torch.as_tensor(image_mean).unsqueeze(-1).unsqueeze(-1) + self.register_buffer("mean", mean) + self.transform = lambda img: (F.interpolate(img, size) - self.mean) / image_std + self.register_buffer("priors", priors) + self.pprocessor = BoxPostprocessor() + self._scripting = False + + def step0(self, image: torch.Tensor): + scores, locations = self.net.forward(self.transform(image)) + return scores, locations + + def step1(self, scores: torch.Tensor, locations: torch.Tensor): + confidences = F.softmax(scores, dim=2) + locations = box_utils.convert_locations_to_boxes( + locations, + self.priors, + center_variance, + size_variance, + ) + boxes = box_utils.center_form_to_corner_form(locations) + return confidences, boxes + + def step2(self, confidences: torch.Tensor, boxes: torch.Tensor): + postprocessed = [ + self.pprocessor(image_scores, image_boxes) + for image_scores, image_boxes in zip(confidences, boxes) + ] + boxes, labels, confidences = zip(*postprocessed) + return boxes, labels, confidences + + def forward(self, image: torch.Tensor): + scores, locations = self.step0(image) + if self.training: + return scores, locations + confidences, boxes = self.step1(scores, locations) + if self._scripting: + return confidences, boxes + boxes, labels, confidences = self.step2(confidences, boxes) + return boxes, labels, confidences, (scores, locations) + + +def get_mobilenetv2_lite(prefix: str) -> SSDWrapper: + from .vision.ssd.config import mobilenetv1_ssd_config as config + from .vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite + + ssd_model = create_mobilenetv2_ssd_lite( + 2, onnx_compatible=True, is_test=False + ).eval() + ssd_model.init_from_pretrained_ssd(Path(prefix) / 
"mb2-ssd-lite-mp-0_686.pth") + wrapper = SSDWrapper( + ssd_model, config.image_size, config.image_mean, config.image_std, config.priors + ) + return wrapper + + +def get_vgg16(prefix: str) -> SSDWrapper: + from .vision.ssd.config import vgg_ssd_config as config + from .vision.ssd.vgg_ssd import create_vgg_ssd + + ssd_model = create_vgg_ssd(2, is_test=False).eval() + ssd_model.init_from_pretrained_ssd(Path(prefix) / "vgg16-ssd-mp-0_7726.pth") + wrapper = SSDWrapper( + ssd_model, config.image_size, config.image_mean, config.image_std, config.priors + ) + return wrapper + + +class LossAndMAP: + def __init__(self, priors: torch.Tensor, device: Union[str, torch.device]) -> None: + priors = priors.cpu() + self.loss_fn = MultiboxLoss( + priors, + iou_threshold=0.5, + neg_pos_ratio=3, + center_variance=0.1, + size_variance=0.2, + device=device, + ) + self.dataset_target_tr = MatchPrior(priors, center_variance, size_variance, 0.5) + self.device = device + + def __call__(self, model_output, targets): + if len(model_output) == 2: + # Training outputs. + scores, locations = model_output + boxes = confidences = [] + else: + # Evaluation outputs. + boxes, _, confidences, (scores, locations) = model_output + matched_labels, matched_boxes, gt_boxes = self._process_targets(targets) + reg_loss, cls_loss = self.loss_fn( + scores, locations, matched_labels, matched_boxes + ) + return reg_loss, cls_loss, gt_boxes, boxes, confidences + + def avg_reg_loss(self, loss_and_map): + reg_losses = list(zip(*loss_and_map))[0] + return torch.stack(reg_losses, 0).mean() + + def avg_cls_loss(self, loss_and_map): + cls_losses = list(zip(*loss_and_map))[1] + return torch.stack(cls_losses, 0).mean() + + def all_losses(self, loss_and_map): + return self.avg_reg_loss(loss_and_map) + self.avg_cls_loss(loss_and_map) + + def combine_map(self, loss_and_map): + from .metric import calculate_mAP + + flatten = lambda xss: [x for xs in xss for x in xs] + _, _, gt_boxes, boxes, confidences = zip(*loss_and_map) + gt_boxes = flatten(gt_boxes) + boxes, confidences = flatten(boxes), flatten(confidences) + return -calculate_mAP(gt_boxes, boxes, confidences) + + def _process_targets(self, targets): + gt_boxes, matched_boxes, matched_labels = [], [], [] + for t in targets: + this_boxes, this_labels = t["boxes"], t["labels"] + gt_boxes.append(this_boxes) + this_boxes_, this_labels_ = self.dataset_target_tr( + this_boxes.cpu(), this_labels.cpu() + ) + matched_boxes.append(this_boxes_) + matched_labels.append(this_labels_) + matched_boxes = default_collate(matched_boxes).to(self.device) + matched_labels = default_collate(matched_labels).to(self.device) + return matched_labels, matched_boxes, gt_boxes + + +class BoxPostprocessor(Module): + def __init__( + self, + nms_method=None, + iou_threshold=0.45, + filter_threshold=0.50, + candidate_size=200, + sigma=0.5, + ): + super().__init__() + self.iou_threshold = iou_threshold + self.filter_threshold = filter_threshold + self.candidate_size = candidate_size + self.nms_method = nms_method + self.sigma = sigma + + def forward(self, scores, boxes, top_k=-1, prob_threshold=None): + cpu_device = torch.device("cpu") + prob_threshold = prob_threshold or self.filter_threshold + # this version of nms is slower on GPU, so we move data to CPU. 
+ boxes = boxes.to(cpu_device) + scores = scores.to(cpu_device) + picked_box_probs = [] + picked_labels = [] + for class_index in range(1, scores.size(1)): + probs = scores[:, class_index] + mask = probs > prob_threshold + probs = probs[mask] + if probs.size(0) == 0: + continue + subset_boxes = boxes[mask, :] + box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1) + box_probs = box_utils.nms( + box_probs, + self.nms_method, + score_threshold=prob_threshold, + iou_threshold=self.iou_threshold, + sigma=self.sigma, + top_k=top_k, + candidate_size=self.candidate_size, + ) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.size(0)) + if not picked_box_probs: + return torch.zeros(0, 4), torch.tensor([]), torch.tensor([]) + picked_box_probs = torch.cat(picked_box_probs) + return ( + picked_box_probs[:, :4], + torch.tensor(picked_labels), + picked_box_probs[:, 4], + ) # boxes, labels, scores diff --git a/examples/stand_count/vision b/examples/stand_count/vision new file mode 120000 index 0000000000000000000000000000000000000000..c69be8baf28e36e160cc6935b5cb768b7754b6a7 --- /dev/null +++ b/examples/stand_count/vision @@ -0,0 +1 @@ +pytorch-ssd/vision \ No newline at end of file diff --git a/examples/tune_vgg16_cifar10.py b/examples/tune_vgg16_cifar10.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8e5a8a019f988fe578c7983d0cd0c60e6ee108 --- /dev/null +++ b/examples/tune_vgg16_cifar10.py @@ -0,0 +1,55 @@ +import site +from pathlib import Path + +import torch +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import Subset + +site.addsitedir(Path(__file__).parent.parent.absolute().as_posix()) +from predtuner import TorchApp, accuracy, config_pylogger, get_knobs_from_file +from predtuner.model_zoo import CIFAR, VGG16Cifar10 + +# Set up logger to put log file in /tmp +msg_logger = config_pylogger(output_dir="/tmp", verbose=True) + +# Load "tuning" dataset and "test" dataset, +# and use only the first 500 images from each dataset as an example +# TODO: you should use all (5000) images for actual tuning. +prefix = Path("model_params/vgg16_cifar10") +tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin") +tune_loader = DataLoader(tune_set, batch_size=500) +test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin") +test_loader = DataLoader(test_set, batch_size=500) + +# Load checkpoint for VGG16 (CIFAR10) +module = VGG16Cifar10() +module.load_state_dict(torch.load("model_params/vgg16_cifar10.pth.tar")) +app = TorchApp( + "TestTorchApp", # name -- can be anything + module, + tune_loader, + test_loader, + get_knobs_from_file(), # default knobs -- see "predtuner/approxes/default_approx_params.json" + accuracy, # the QoS metric to use -- classification accuracy + # Where to serialize prediction models if they are used + # For example, if you use p1 (see below), this will leave you a + # tuner_results/vgg16_cifar10/p1.pkl + # which can be quickly reloaded the next time you do tuning with + model_storage_folder="tuner_results/vgg16_cifar10", +) +# This is how to measure baseline accuracy -- {} means no approximation +baseline, _ = app.measure_qos_cost({}, False) +# Get a tuner object and start tuning! 
+tuner = app.get_tuner() +tuner.tune( + max_iter=500, # TODO: In practice, use at least 5000, or 10000 + qos_tuner_threshold=2.1, # QoS threshold to guide tuner into + qos_keep_threshold=3.0, # QoS threshold for which we actually keep the configurations + is_threshold_relative=True, # Thresholds are relative to baseline -- baseline_acc - 2.1 + cost_model="cost_linear", # Use linear performance predictor + qos_model="qos_p1", # Use P1 QoS predictor +) +# Save configs here when you're done +tuner.dump_configs("tuner_results/vgg16_cifar10_configs.json") +fig = tuner.plot_configs(show_qos_loss=True) +fig.savefig("tuner_results/vgg16_cifar10_configs.png") \ No newline at end of file diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py index b420d8deaf35ad3ac25f8d3a6ef507997c9d3116..06717bd15011564f33a555931e30e3bcb2b34f7a 100644 --- a/predtuner/approxapp.py +++ b/predtuner/approxapp.py @@ -116,10 +116,6 @@ class Config: @property def speedup(self): return 1 / self.cost - - @property - def qos_speedup(self): - return self.qos, self.speedup T = TypeVar("T", bound=Config) @@ -151,7 +147,7 @@ class ApproxTuner(Generic[T]): is_threshold_relative: bool = False, take_best_n: Optional[int] = None, test_configs: bool = True, - **kwargs + app_kwargs: dict = None # TODO: more parameters + opentuner param forwarding ) -> List[T]: from opentuner.tuningrunmain import TuningRunMain @@ -173,7 +169,7 @@ class ApproxTuner(Generic[T]): qos_tuner_threshold, qos_keep_threshold, is_threshold_relative, - self._get_app_kwargs(**kwargs), + app_kwargs or {}, ) assert self.keep_threshold is not None trm = TuningRunMain(tuner, opentuner_args) @@ -206,27 +202,41 @@ class ApproxTuner(Generic[T]): len(self.best_configs), ) if test_configs: - msg_logger.info("Checking configurations on test inputs") - self.test_configs_(self.best_configs) + msg_logger.info("Calibrating configurations on test inputs") + self.best_configs = self.test_configs(self.best_configs) return self.best_configs - def test_configs_(self, configs: List[T]): + def test_configs(self, configs: List[Config]): + from copy import deepcopy + from tqdm import tqdm + assert self.keep_threshold is not None + if not configs: + return [] + ret_configs = [] + total_error = 0 for cfg in tqdm(configs, leave=False): - cfg: T - if cfg.test_qos is not None: - continue + cfg = deepcopy(cfg) + assert cfg.test_qos is None cfg.test_qos, _ = self.app.measure_qos_cost(cfg.knobs, True) msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {cfg.test_qos} (mean)") + total_error += abs(cfg.qos - cfg.test_qos) + if cfg.test_qos > self.keep_threshold: + ret_configs.append(cfg) + else: + msg_logger.debug("Config removed") + mean_err = total_error / len(configs) + msg_logger.info("QoS mean abs difference of calibration: %f", mean_err) + return ret_configs @staticmethod def take_best_configs(configs: List[T], n: Optional[int] = None) -> List[T]: - points = np.array([c.qos_speedup for c in configs]) + points = np.array([(c.qos, c.speedup) for c in configs]) taken_idx = is_pareto_efficient(points, take_n=n) return [configs[i] for i in taken_idx] - def dump_configs(self, filepath: PathLike): + def dump_configs(self, filepath: PathLike, best_only: bool = True): import os from jsonpickle import encode @@ -237,20 +247,27 @@ class ApproxTuner(Generic[T]): ) filepath = Path(filepath) os.makedirs(filepath.parent, exist_ok=True) + confs = self.best_configs if best_only else self.kept_configs with filepath.open("w") as f: - f.write(encode(self.best_configs, indent=2)) + f.write(encode(confs, 
indent=2)) def plot_configs( - self, show_qos_loss: bool = False, connect_best_points: bool = False + self, + show_qos_loss: bool = False, + connect_best_points: bool = False, + use_test_qos: bool = False, ) -> plt.Figure: if not self.tuned: raise RuntimeError( f"No tuning session has been run; call self.tune() first." ) + def qos_speedup(conf): + return conf.test_qos if use_test_qos else conf.qos, conf.speedup + def get_points(confs): sorted_points = np.array( - sorted([c.qos_speedup for c in confs], key=lambda p: p[0]) + sorted([qos_speedup(c) for c in confs], key=lambda p: p[0]) ).T if show_qos_loss: sorted_points[0] = self.baseline_qos - sorted_points[0] @@ -297,9 +314,6 @@ class ApproxTuner(Generic[T]): **app_kwargs, ) - def _get_app_kwargs(self, **kwargs): - return {} - @classmethod def _get_config_class(cls) -> Type[Config]: return Config diff --git a/predtuner/modeledapp.py b/predtuner/modeledapp.py index adeb53383a81d0d03db9642e0dfde17896462ee9..a5d904a6a4234be6a609afad16def14795f260b8 100644 --- a/predtuner/modeledapp.py +++ b/predtuner/modeledapp.py @@ -5,6 +5,7 @@ import pickle from pathlib import Path from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, Union +import matplotlib.pyplot as plt import numpy as np import pandas as pd import torch @@ -192,6 +193,8 @@ class QoSModelP1(IQoSModel): qos_metric: Callable[[torch.Tensor], float], storage: PathLike = None, ) -> None: + from torch.nn.functional import softmax + super().__init__() self.app = app self.output_f = tensor_output_getter @@ -387,32 +390,89 @@ class ApproxModeledTuner(ApproxTuner): qos_keep_threshold=qos_keep_threshold, is_threshold_relative=is_threshold_relative, take_best_n=take_best_n, - test_configs=test_configs, - cost_model=cost_model, - qos_model=qos_model, + test_configs=False, # Test configs below by ourselves + app_kwargs={"cost_model": cost_model, "qos_model": qos_model}, ) if validate_configs is None and qos_model != "none": msg_logger.info( 'Validating configurations due to using qos model "%s"', qos_model ) - self.validate_configs_(self.best_configs) + self.best_configs = self._update_configs(self.best_configs, False) elif validate_configs: msg_logger.info("Validating configurations as user requested") - self.validate_configs_(self.best_configs) + self.best_configs = self._update_configs(self.best_configs, False) + if test_configs: + msg_logger.info("Calibrating configurations on test inputs") + self.best_configs = self._update_configs(self.best_configs, True) return ret - def validate_configs_(self, configs: List[ValConfig]): + def _update_configs(self, configs: List[ValConfig], test_mode: bool): + from copy import deepcopy + from tqdm import tqdm + assert self.keep_threshold is not None + if not configs: + msg_logger.info("No configurations found.") + return [] + ret_configs = [] + total_error = 0 for cfg in tqdm(configs, leave=False): - cfg: ValConfig - if cfg.validated_qos is not None: - continue - cfg.validated_qos, _ = self.app.measure_qos_cost(cfg.knobs, False) - msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {cfg.test_qos} (mean)") + cfg = deepcopy(cfg) + qos, _ = self.app.measure_qos_cost(cfg.knobs, test_mode) + if test_mode: + assert cfg.test_qos is None + cfg.test_qos = qos + msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {qos} (mean)") + else: + assert cfg.validated_qos is None + cfg.validated_qos = qos + msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {qos} (mean)") + total_error += abs(cfg.qos - qos) + if qos > self.keep_threshold: + 
ret_configs.append(cfg) + else: + msg_logger.debug("Config removed") + mean_err = total_error / len(configs) + if test_mode: + msg_logger.info("QoS mean abs difference of calibration: %f", mean_err) + else: + msg_logger.info("QoS mean abs difference of validation: %f", mean_err) + msg_logger.info("%d of %d configs remain", len(ret_configs), len(configs)) + return ret_configs + + def plot_configs( + self, show_qos_loss: bool = False, connect_best_points: bool = False + ) -> plt.Figure: + if not self.tuned: + raise RuntimeError( + f"No tuning session has been run; call self.tune() first." + ) - def _get_app_kwargs(self, cost_model: str, qos_model: str): - return {"cost_model": cost_model, "qos_model": qos_model} + def get_points(confs, validated): + def qos_speedup(conf): + return conf.validated_qos if validated else conf.qos, conf.speedup + + sorted_points = np.array( + sorted([qos_speedup(c) for c in confs], key=lambda p: p[0]) + ).T + if show_qos_loss: + sorted_points[0] = self.baseline_qos - sorted_points[0] + return sorted_points + + fig, ax = plt.subplots() + kept_confs = get_points(self.kept_configs, False) + best_confs = get_points(self.best_configs, False) + best_confs_val = get_points(self.best_configs, True) + ax.plot(kept_confs[0], kept_confs[1], "o", label="valid") + mode = "-o" if connect_best_points else "o" + ax.plot(best_confs[0], best_confs[1], mode, label="best") + mode = "-o" if connect_best_points else "o" + ax.plot(best_confs_val[0], best_confs_val[1], mode, label="best_validated") + ax.set_xlabel("QoS Loss" if show_qos_loss else "QoS") + ax.set_ylabel("Speedup (x)") + ax.legend() + return fig @classmethod def _get_config_class(cls) -> Type[Config]: diff --git a/predtuner/torchapp.py b/predtuner/torchapp.py index 33ceaad991b005d632b48497230ee28744c9b2df..dac293f4201fa5bc5930a32f846937f7643ddc4e 100644 --- a/predtuner/torchapp.py +++ b/predtuner/torchapp.py @@ -153,6 +153,7 @@ class TorchApp(ModeledApp, abc.ABC): target = move_to_device_recursively(target, self.device) qos = self.tensor_to_qos(tensor_output[begin:end], target) qoses.append(qos) + begin = end return self.combine_qos(np.array(qoses)) p1_storage = self.model_storage / "p1.pkl" if self.model_storage else None
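
Note on the `predtuner/torchapp.py` hunk above: the added `begin = end` advances the slicing
window between batches, so each batch's QoS is computed from its own rows of the concatenated
model output instead of the first batch's rows every time. A minimal, self-contained sketch of
the corrected pattern (illustrative only, with made-up shapes; not the predtuner code itself):

```python
import torch

# Suppose the model produced one concatenated output for 3 batches of 4 samples each.
tensor_output = torch.randn(12, 10)
batch_sizes = [4, 4, 4]

begin = 0
per_batch = []
for n in batch_sizes:
    end = begin + n
    per_batch.append(tensor_output[begin:end])  # rows belonging to this batch only
    begin = end  # advance the window -- the line the patch adds
assert sum(s.shape[0] for s in per_batch) == tensor_output.shape[0]
```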