diff --git a/doc/getting_started.rst b/doc/getting_started.rst
index 75381aa720fdee717d20db115ecbc74e73deff4c..867707bf1fdcb9178f0320e757b3e8a3ca09b1e7 100644
--- a/doc/getting_started.rst
+++ b/doc/getting_started.rst
@@ -1,2 +1,133 @@
 Getting Started
 ===================
+
+This guide will help you get started with PredTuner.
+
+Installation
+------------
+
+Install PredTuner from source using :code:`pip`:
+
+.. code-block:: shell
+
+    pip install -e .
+
+PredTuner will also be available on PyPI once we publish the first release.
+
+Tuning a PyTorch DNN
+--------------------
+
+PredTuner can tune any user-defined application,
+but it is optimized for tuning DNN applications defined in PyTorch.
+
+We will use models predefined in PredTuner for demonstration purposes.
+Download the pretrained VGG16 model parameters and the CIFAR10 dataset from `here
+<https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing>`_.
+After extraction, there should be a :code:`model_params/` folder in the current directory.
+
+Load the tuning and test subsets of the CIFAR10 dataset, and create a pretrained VGG16 model:
+
+.. code-block:: python
+
+    from pathlib import Path
+
+    import torch
+    from torch.utils.data import DataLoader
+
+    import predtuner as pt
+    from predtuner.model_zoo import CIFAR, VGG16Cifar10
+
+    prefix = Path("model_params/vgg16_cifar10")
+    tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
+    tune_loader = DataLoader(tune_set, batch_size=500)
+    test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
+    test_loader = DataLoader(test_set, batch_size=500)
+
+    module = VGG16Cifar10()
+    module.load_state_dict(torch.load("model_params/vgg16_cifar10.pth.tar"))
+
+PredTuner provides a logging mechanism.
+While not required, we recommend directing the logger output to a file:
+
+.. code-block:: python
+
+    msg_logger = pt.config_pylogger(output_dir="vgg16_cifar10/", verbose=True)
+
+For each tuning task, both a tuning dataset and a test dataset are required.
+The tuning dataset is used to evaluate the accuracy of the application in the autotuning stage,
+while the test dataset is used to evaluate configurations found in autotuning.
+This is similar to the split between training and validation sets in machine learning tasks.
+In this case, both the tuning and test datasets contain 5000 images.
+
+Create an instance of :code:`TorchApp` for tuning a PyTorch DNN:
+
+.. code-block:: python
+
+    app = pt.TorchApp(
+        "TestTorchApp",  # Application name -- can be anything
+        module,
+        tune_loader,
+        test_loader,
+        knobs=pt.get_knobs_from_file(),
+        tensor_to_qos=pt.accuracy,
+        model_storage_folder="vgg16_cifar10/",
+    )
+
+PredTuner provides :code:`TorchApp`, which is specialized for tuning PyTorch DNNs.
+In addition, two more functions from PredTuner are used:
+
+:code:`pt.accuracy` is the *classification accuracy* metric,
+which receives the probability distribution output from the VGG16 model,
+compares it to the ground truth in the dataset,
+and returns a scalar between 0 and 100 for the classification accuracy.
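+
+To make that contract concrete, here is a minimal sketch of what such a QoS
+metric computes. The function name and tensor shapes are illustrative
+assumptions, not part of the PredTuner API; use :code:`pt.accuracy` in practice:
+
+.. code-block:: python
+
+    import torch
+
+    def classification_accuracy(output: torch.Tensor, label: torch.Tensor) -> float:
+        """A hypothetical stand-in for pt.accuracy.
+
+        Assumes `output` has shape (N, n_classes) and `label` has shape (N,).
+        """
+        pred = output.argmax(dim=1)  # Most likely class for each sample
+        return (pred == label).float().mean().item() * 100  # Scalar in [0, 100]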
+
+:code:`pt.get_knobs_from_file()` returns a set of approximations preloaded in PredTuner,
+which are applied to :code:`torch.nn.Conv2d` layers.
+See ??? for these approximations and how to define your own approximations.
+
+Now we can obtain a tuner object from the application and start tuning.
+We will keep configurations that don't exceed 3% loss of accuracy,
+but encourage the tuner to find configurations with loss of accuracy below 2.1%.
+
+.. code-block:: python
+
+    tuner = app.get_tuner()
+    tuner.tune(
+        max_iter=100,
+        qos_tuner_threshold=2.1,  # QoS threshold that the tuner aims for
+        qos_keep_threshold=3.0,  # Keep configurations within this QoS threshold
+        is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+        cost_model="cost_linear",  # Use the linear cost predictor
+    )
+
+:code:`max_iter` defines the number of iterations to use in autotuning.
+We use 100 iterations for demonstration; in practice,
+at least 10000 iterations are necessary on VGG16-sized models to converge to a set of good configurations.
+
+Saving Tuning Results
+---------------------
+
+Now that the :code:`tuner` object holds the tuning results,
+we can export them into a JSON file
+and visualize all configurations in a figure:
+
+.. code-block:: python
+
+    tuner.dump_configs("vgg16_cifar10/configs.json", best_only=False)
+    fig = tuner.plot_configs(show_qos_loss=True)
+    fig.savefig("vgg16_cifar10/configs.png")
+
+PredTuner also automatically marks out `Pareto-optimal
+<https://en.wikipedia.org/wiki/Pareto_efficiency>`_
+configurations.
+These are called "best" configurations (:code:`tuner.best_configs`),
+in contrast to "valid" configurations, which are those that satisfy our accuracy requirement
+(:code:`tuner.kept_configs`).
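+
+For intuition, the following sketch shows how a Pareto front over
+(QoS, speedup) points can be computed. This illustrates the concept only;
+it is an assumption for demonstration, not PredTuner's internal implementation:
+
+.. code-block:: python
+
+    def pareto_front(points):
+        """Keep the (qos, speedup) pairs that no other pair dominates.
+
+        A pair is dominated if another pair has both higher QoS
+        and higher speedup.
+        """
+        return [
+            (qos, speedup)
+            for qos, speedup in points
+            if not any(q > qos and s > speedup for q, s in points)
+        ]
+
+    # The middle point loses to the first on both axes and is dropped:
+    print(pareto_front([(92.0, 1.5), (91.0, 1.4), (90.5, 2.0)]))
+    # [(92.0, 1.5), (90.5, 2.0)]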
+
+Within 100 iterations, PredTuner should find 30 to 50 valid configurations.
+The generated figure should look like this:
+
+.. image:: tuning_result.png
+
+
+Loading Tuning Results
+----------------------
+
+TODO: TODO
+
diff --git a/doc/index.rst b/doc/index.rst
index 6866a98b3d779063329219cd2c49f8b2c6feb41d..b79d28a42bd60549b7e2921b29854948e0aece48 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -9,27 +9,21 @@
 PredTuner performs autotuning on approximation choices for a program
 using an error-predictive proxy instead of executing the program,
 to greatly speedup autotuning while getting results of comparable quality.
-PredTuner is a contribution of [ApproxTuner]
-(https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations).
-
-Short-term Goals
-- Measure accuracy impact of approximations
-- Obtain a tuned, approximated CNN in <5 lines of code
-- Easy to manage multiple approximation configs
-- Easy to load and manage prior tuning results
-- Flexible retraining support
-Possible Long-term Goals
-- High-performance implementations of approximate layers
-- Allow users to register their own approximations
-- Support for other frameworks: TF, ONNX, JAX
+PredTuner is a main component of `ApproxTuner
+<https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations>`_.
 
-Documentation
--------------
-.. only:: html
+Solution for Efficient Approximation Autotuning
+-----------------------------------------------
+
+- Start a tuning session in 10 lines of code
+- Deep integration with PyTorch for DNN support
+- Multiple levels of APIs for generality and ease-of-use
+- Effective accuracy prediction models
+- Easily store and visualize tuning results in many formats
 
-   :Release: |version|
-   :Date: |today|
+Documentation
+-------------
 
 .. toctree::
    :maxdepth: 1
@@ -41,6 +35,3 @@ Indices and tables
 ------------------
 
 * :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-* :ref:`glossary`
diff --git a/doc/tuning_result.png b/doc/tuning_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..d3854041d97ab51be43aa08e563f1850bba19f48
Binary files /dev/null and b/doc/tuning_result.png differ
diff --git a/examples/tune_vgg16_cifar10.py b/examples/tune_vgg16_cifar10.py
index eca7082b92b6f80ea674c5bffec855d37a35f9cf..9530bc0c66a38d2607a120dab7352d0988a669ce 100644
--- a/examples/tune_vgg16_cifar10.py
+++ b/examples/tune_vgg16_cifar10.py
@@ -17,9 +17,9 @@ msg_logger = config_pylogger(output_dir="/tmp", verbose=True)
 # TODO: you should use all (5000) images for actual tuning.
 prefix = Path("model_params/vgg16_cifar10")
 tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
-tune_loader = DataLoader(Subset(tune_set, range(500)), batch_size=500)
+tune_loader = DataLoader(tune_set, batch_size=500)
 test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
-test_loader = DataLoader(Subset(test_set, range(500)), batch_size=500)
+test_loader = DataLoader(test_set, batch_size=500)
 
 # Load checkpoint for VGG16 (CIFAR10)
 module = VGG16Cifar10()
@@ -38,17 +38,17 @@ app = TorchApp(
     model_storage_folder="tuner_results/vgg16_cifar10",
 )
 # This is how to measure baseline accuracy -- {} means no approximation
-baseline, _ = app.measure_qos_perf({}, False)
+baseline, _ = app.measure_qos_cost({}, False)
 # Get a tuner object and start tuning!
 tuner = app.get_tuner()
 tuner.tune(
-    max_iter=100,  # TODO: In practice, use at least 5000, or 10000
+    max_iter=500,  # TODO: In practice, use at least 5000, or 10000
     qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
     qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the thresholds
     is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
-    take_best_n=50,  # Take 50 "best" configs
-    perf_model="perf_linear",  # Use linear performance predictor
-    qos_model="qos_p1",  # Use P1 QoS predictor
+    cost_model="cost_linear",  # Use the linear cost predictor
 )
 # Save configs here when you're done
 tuner.dump_configs("tuner_results/vgg16_cifar10_configs.json")
+fig = tuner.plot_configs(show_qos_loss=True)
+fig.savefig("tuner_results/vgg16_cifar10_configs.png")
\ No newline at end of file
diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py
index a6513cf19dea61fa17d6e541d8fa56b568b1c336..b34b1a1aa13b83c610b86506be859d9c6294e9a5 100644
--- a/predtuner/approxapp.py
+++ b/predtuner/approxapp.py
@@ -222,7 +222,7 @@ class ApproxTuner(Generic[T]):
         taken_idx = is_pareto_efficient(points, take_n=n)
         return [configs[i] for i in taken_idx]
 
-    def dump_configs(self, filepath: PathLike):
+    def dump_configs(self, filepath: PathLike, best_only: bool = True):
         import os
 
         from jsonpickle import encode
@@ -233,8 +233,9 @@ class ApproxTuner(Generic[T]):
         )
         filepath = Path(filepath)
         os.makedirs(filepath.parent, exist_ok=True)
+        confs = self.best_configs if best_only else self.kept_configs
         with filepath.open("w") as f:
-            f.write(encode(self.best_configs, indent=2))
+            f.write(encode(confs, indent=2))
 
     def plot_configs(
         self, show_qos_loss: bool = False, connect_best_points: bool = False