diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..cf1ad99736a9c4de6840f67327ea63a627acad1b
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,12 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+sphinx:
+  configuration: doc/conf.py
+python:
+  version: 3.8
+  install:
+    - requirements: doc/requirements.txt
diff --git a/LICENSE b/LICENSE
index f263dcbd49fa3b2cc5b1339adb116109fb970863..7eeac83a8c557b8cc45979a47e1238c6be38b653 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,34 @@
-MIT License
+University of Illinois/NCSA Open Source License
 
-Copyright (c) 2021 Yifan Zhao
+Copyright (c) 2020 Illinois LLVM Group. All rights reserved.
 
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
+Developed by: The Illinois LLVM Group
+              University of Illinois at Urbana Champaign
+              https://hpvm.cs.illinois.edu
 
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal with the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimers.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimers in the
+  documentation and/or other materials provided with the distribution.
+
+* Neither the names of the Illinois LLVM Group, the University of Illinois
+  at Urbana-Champaign, nor the names of its contributors may be used to endorse
+  or promote products derived from this Software without specific prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+THE SOFTWARE.
diff --git a/README.md b/README.md
deleted file mode 100644
index 784aebba1fc2bdeef0c6b7fbdd2070330d75f53f..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Autotuning and Predictive Autotuning
-
-`predtuner` performs autotuning on program approximation knobs using an error-predictive proxy
-in place of the original program, to greatly speedup autotuning while getting results
-comparable in quality. `current_version == 0.3`.
-
-## Requirements
-
-`predtuner` requires `python >= 3.7` and `pip`, preferrably `pip >= 20`.
-To install from PyPI (currently TestPyPI), use
-
-```bash
-python -m pip install -i https://test.pypi.org/simple/ predtuner
-```
-
-### Install from Source
-
-Alternatively, you can install this package from source.
-At the root directory of this repository, do:
-
-```bash
-python -m pip install -e ./
-```
-
-With the flag `-e`, any changes to code in this repo is reflected on the installed version automatically.
-It can be omitted if you don't intend to modify the code in this package.
-
-## Getting Started
-
-The documentation page contains a full tutorial.
-Build the documentation by:
-
-```bash
-pip install sphinx sphinx_rtd_theme sphinx_autodoc_typehints
-cd doc
-make html
-```
-
-The documentation page will be created as `doc/build/html/index.html`.
-You can open this in the browser and browse to "Getting Started" section.
-
-### Model Data for Example / Testing
-
-`predtuner` contains 10 demo models which are also used in tests.
-
-- Download and extract [this](https://drive.google.com/file/d/1V_yd9sKcZQ7zhnO5YhRpOsaBPLEEvM9u/view?usp=sharing) file containing all 10 models, for testing purposes.
-- The "Getting Started" example on the documentation page only uses VGG16-CIFAR10.
-  If you don't need the other models, get the data for VGG16-CIFAR10
-  [here](https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing).
-
-In either case, there should be a `model_params/` folder at the root of repo after extraction.
-
-## Tuning with HPVM Binary
-
-This branch (`hpvm`) contains beta support for HPVM binaries.
-Please refer to `examples/tune_hpvm_bin.py` for an example with explanations.
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1e27bb95303c3accbae16bcb849c7ab6ceccbe8f
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,15 @@
+Autotuning and Predictive Autotuning
+====================================
+
+PredTuner performs autotuning on program approximation knobs using an error-predictive proxy
+in place of the original program, to greatly speed up autotuning while getting results
+comparable in quality. ``current_version == 0.3``.
+
+Read our `documentation here <https://predtuner.readthedocs.io/en/latest/index.html>`_
+for how to install and use PredTuner.
+
+Tuning with HPVM Binary
+-----------------------
+
+This branch (``hpvm``) contains beta support for HPVM binaries.
+Please refer to ``examples/tune_hpvm_bin.py`` for an example with explanations.
diff --git a/doc/README.md b/doc/README.md
index 5e0af1e3f90895ee5cdbc2927ccedf8970053c32..cd9b7b68846b9c164978a10f8e8ea9bc9d846ab1 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -6,7 +6,7 @@ We use Sphinx for generating the API and reference documentation.
 Install the following Python packages needed to build the documentation by entering:
 
 ```bash
-pip install sphinx sphinx-autodoc-typehints sphinx-rtd-theme
+pip install -r requirements.txt
 ```
 
 To build the HTML documentation, enter::
diff --git a/doc/_static/result_no_model.png b/doc/_static/result_no_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..6f29f7d6bc8aa8237cd0588506d4eb7a285a9d9a
Binary files /dev/null and b/doc/_static/result_no_model.png differ
diff --git a/doc/_static/result_with_model.png b/doc/_static/result_with_model.png
new file mode 100644
index 0000000000000000000000000000000000000000..095b6b76981e230a7d60c16f31928e56baaeeef6
Binary files /dev/null and b/doc/_static/result_with_model.png differ
diff --git a/doc/conf.py b/doc/conf.py
index ad8cee166b4df9acec13bca5dd73eccd4db31a9e..3a97e1483cb8b0a91ebdfaafd15e6858c0e2f01e 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -1,14 +1,13 @@
 from datetime import date
-import sphinx_rtd_theme
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
-import os
 import sys
-
-sys.path.insert(0, os.path.abspath(".."))
+from pathlib import Path
+this_folder = Path(__file__).parent
+sys.path.insert(0, (this_folder / "..").absolute().as_posix())
 
 # General configuration
 # ---------------------
@@ -18,16 +17,15 @@ sys.path.insert(0, os.path.abspath(".."))
 extensions = [
     "sphinx.ext.autosummary",
     "sphinx.ext.autodoc",
-    "sphinx_autodoc_typehints",
     "sphinx.ext.coverage",
     "sphinx.ext.doctest",
     "sphinx.ext.intersphinx",
     "sphinx.ext.mathjax",
     "sphinx.ext.todo",
     "sphinx.ext.viewcode",
-    "numpydoc",
 ]
-always_document_param_types = True
+
+autodoc_typehints = "description"
 
 # generate autosummary pages
 autosummary_generate = True
@@ -48,48 +46,27 @@ master_doc = "index"
 project = "PredTuner"
 copyright = f"2020-{date.today().year}, University of Illinois"
 
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-# today = ''
-# Else, today_fmt is used as the format for a strftime call.
-# today_fmt = '%B %d, %Y'
-
-# List of documents that shouldn't be included in the build.
-# unused_docs = ['']
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-# add_function_parentheses = True
-
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
 add_module_names = False
 
-# show_authors = True
-
 # The name of the Pygments (syntax highlighting) style to use.
 # pygments_style = 'friendly'
 pygments_style = "sphinx"
 
 # A list of prefixs that are ignored when creating the module index. (new in Sphinx 0.6)
-# modindex_common_prefix = ["networkx."]
-
-# doctest_global_setup = "import networkx as nx"
+# modindex_common_prefix = []
 
 # Options for HTML output
 # -----------------------
 
-
-html_theme = "sphinx_rtd_theme"
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-
+html_theme = "pydata_sphinx_theme"
 html_theme_options = {
-    "canonical_url": "https://networkx.org/documentation/stable/",
-    "navigation_depth": 3,
-    "logo_only": True,
+    # "github_url": "https://gitlab.engr.illinois.edu/llvm/hpvm-beta",
+    "show_prev_next": False,
+    "search_bar_position": "sidebar",
 }
 
-# html_logo = "_static/networkx_logo.svg"
-
 # The style sheet to use for HTML and HTML Help pages. A file of that name
 # must exist either in Sphinx' static/ path, or in one of the custom paths
 # given in html_static_path.
@@ -104,20 +81,6 @@ html_static_path = ["_static"]
 # using the given strftime format.
 html_last_updated_fmt = "%b %d, %Y"
 
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-# html_use_smartypants = True
-
-# Content template for the index page.
-# html_index = 'index.html'
-
-# Custom sidebar templates, maps page names to templates.
-# html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# templates.
-# html_additional_pages = {'': ''}
-
 # If true, the reST sources are included in the HTML build as _sources/<name>.
 html_copy_source = False
 
@@ -129,9 +92,6 @@ latex_engine = "xelatex"
 # The paper size ('letter' or 'a4').
 latex_paper_size = "letter"
 
-# The font size ('10pt', '11pt' or '12pt').
-# latex_font_size = '10pt'
-
 latex_appendices = ["tutorial"]
 
 # Intersphinx mapping
@@ -147,10 +107,3 @@ intersphinx_mapping = {
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
 default_role = "obj"
-
-numpydoc_show_class_members = False
-
-
-def setup(app):
-    app.add_css_file("custom.css")
-    app.add_js_file("copybutton.js")
diff --git a/doc/getting_started.rst b/doc/getting_started.rst
index 5d8472a5a224b6677df4f080bf6309bff6bae016..d0e1305cb601f9ae6a2b26d869e38091851302e2 100644
--- a/doc/getting_started.rst
+++ b/doc/getting_started.rst
@@ -6,24 +6,43 @@ This guide can help you start working with PredTuner.
 Installation
 ------------
 
-Install PredTuner from source using `pip`:
+* PredTuner requires ``python >= 3.6`` and ``pip``, preferably ``pip >= 20``.
+
+To install this package from source, run the following at the root of this repository:
 
 .. code-block:: shell
 
-   pip install -e .
+   python3 -m pip install -e ./
 
 PredTuner will also be available on PyPi in the future after we publish the first release.
 
+* With the flag ``-e``, any changes to the code in this repo are reflected in the installed version automatically.
+  It can be omitted if you don't intend to modify the code in this package.
+
+Model Data for Example / Testing
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PredTuner contains 10 demo models which are also used in tests.
+
+* Download and extract `this <https://drive.google.com/file/d/1V_yd9sKcZQ7zhnO5YhRpOsaBPLEEvM9u/view?usp=sharing>`_ file containing all 10 models, for testing purposes.
+* In the tutorial below, we will only use VGG16-CIFAR10.
+  If you don't need the other models, get the data for VGG16-CIFAR10
+  `here <https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing>`_.
+
+In either case, there should be a ``model_params/`` folder at the root of the repository after extraction.
+
 Tuning a PyTorch DNN
 --------------------
 
+* The code used in the following example can be found at ``examples/tune_vgg16_cifar10.py``.
+
 PredTuner can tune any user-defined application,
 but it is optimized for tuning DNN applications defined in PyTorch.
 
 We will use models predefined in PredTuner for demonstration purposes.
 Download pretrained VGG16 model parameters and CIFAR10 dataset from `here
 <https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing>`_.
-After extraction, there should be a `model_params/` folder in current directory.
+After extraction, there should be a ``model_params/`` folder in the current directory.
 
 Load the tuning and test subsets of CIFAR10 dataset, and create a pretrained VGG16 model:
 
@@ -55,7 +74,7 @@ while the test dataset is used to evaluate configurations found in autotuning.
 This is similar to the split between training and validation set in machine learning tasks.
 In this case, both tuning and test datasets contain 5000 images.
 
-Create an instance of `TorchApp` for tuning PyTorch DNN:
+Create an instance of `~predtuner.torchapp.TorchApp` for tuning PyTorch DNN:
 
 .. code-block:: python
 
@@ -69,31 +88,33 @@ Create an instance of `TorchApp` for tuning PyTorch DNN:
     model_storage_folder="vgg16_cifar10/",
   )
 
-PredTuner provides `TorchApp`, which is specialized for the use scenario of tuning PyTorch DNNs.
+PredTuner provides `~predtuner.torchapp.TorchApp`,
+which is specialized for the use scenario of tuning PyTorch DNNs.
 In addition, two more functions from PredTuner are used:
 
-`pt.accuracy` is the *classification accuracy* metric,
+:py:meth:`pt.accuracy <predtuner.torchutil.accuracy>`
+is the *classification accuracy* metric,
 which receives the probability distribution output from the VGG16 model,
 compare it to the groundtruth in the dataset,
-and returns a scalar between 0 and 100 for the classification accuracy
+and returns a scalar between 0 and 100 for the classification accuracy.
 
-`pt.get_knobs_from_file()` returns a set of approximations preloaded in PredTuner,
+:py:meth:`pt.get_knobs_from_file <predtuner.approxes.get_knobs_from_file>`
+returns a set of approximations preloaded in PredTuner,
 which are applied to `torch.nn.Conv2d` layers.
-See ??? for these approximations and how to define custom approximations.
 
 Now we can obtain a tuner object from the application and start tuning.
 We will keep configurations that don't exceed 3% loss of accuracy,
-but encourage the tuner to find configurations with loss of accuracy below 2.1%.
+but encourage the tuner to find configurations with loss of accuracy below 2.0%.
 
 .. code-block:: python
 
   tuner = app.get_tuner()
   tuner.tune(
-    max_iter=500,
-    qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
+    max_iter=1000,
+    qos_tuner_threshold=2.0,  # QoS threshold to guide tuner into
     qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the configurations
-    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
-    take_best_n=50,
+    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.0
+    take_best_n=20,
     cost_model="cost_linear",  # Use linear cost predictor
   )
 
@@ -101,8 +122,11 @@ but encourage the tuner to find configurations with loss of accuracy below 2.1%.
 e.g., here it refers to the accuracy of DNN over given datasets.
 We will be using the term QoS throughout the tutorials.
 
+:py:meth:`tuner.tune <predtuner.modeledapp.ApproxModeledTuner.tune>`
+is the main method for running a tuning session.
+It accepts a few parameters that control the behavior of tuning.
 `max_iter` defines the number of iterations to use in autotuning.
-Within 500 iterations, PredTuner should find about 200 valid configurations.
+Within 1000 iterations, PredTuner should find about 200 valid configurations.
 PredTuner will also automatically mark out `Pareto-optimal
 <https://en.wikipedia.org/wiki/Pareto_efficiency>`_
 configurations.
@@ -111,7 +135,7 @@ in contrast to "valid" configurations which are the configurations that satisfy
 (`tuner.kept_configs`).
 `take_best_n` allows taking some extra close-optimal configurations in addition to Pareto-optimal ones.
 
-500 iterations is for demonstration; in practice,
+The 1000 iterations here are for demonstration only; in practice,
 at least 10000 iterations are necessary on VGG16-sized models to converge to a set of good configurations.
 Depending on hardware performance, this tuning should take several minutes to several tens of minutes.
 
@@ -130,7 +154,8 @@ and visualize all configurations in a figure:
 
 The generated figure should look like this:
 
-.. image:: tuning_result.png
+.. image:: _static/result_no_model.png
+   :target: _static/result_no_model.png
 
 where the blue points shows the QoS and speedup of all valid configurations,
 and the "best" configurations are marked out in orange.
@@ -148,11 +173,11 @@ To do that, simply use the argument `qos_model` when calling `tuner.tune()`:
 
   tuner = app.get_tuner()
   tuner.tune(
-    max_iter=500,
-    qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
+    max_iter=1000,
+    qos_tuner_threshold=2.0,  # QoS threshold to guide tuner into
     qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the configurations
-    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
-    take_best_n=50,
+    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.0
+    take_best_n=20,
     cost_model="cost_linear",  # Use linear cost predictor
     qos_model="qos_p1"
   )
@@ -162,3 +187,17 @@ when it learns about the behavior of each knob on each operator (DNN layer).
 Because the configurations will end up with predicted QoS values after tuning,
 this will add a *validation* stage at the end of tuning where the QoS of best configurations are empirically measured,
 and the bad ones are removed.
+
+If you follow the procedure above to plot a figure of the configurations,
+the generated figure should look like this,
+with one extra subfigure (middle) comparing the predicted and measured QoS.
+
+.. image:: _static/result_with_model.png
+   :target: _static/result_with_model.png
+
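+Finally, you can save the tuned configurations to a JSON file
+(the file name here is only an example):
+
+.. code-block:: python
+
+  tuner.dump_configs("configs.json", best_only=True)
+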
+----------------------------------------------------------
+
+This concludes the tutorial for installing and using PredTuner.
+What we have just used is the PyTorch API of PredTuner.
+:doc:`reference/index` shows the reference of this API along with two sets of lower-level APIs
+that allow tuning applications that are not PyTorch DNNs.
diff --git a/doc/reference/approx-app.rst b/doc/reference/approx-app.rst
new file mode 100644
index 0000000000000000000000000000000000000000..20d6e321b7526cad7062c90687073b33d569e76e
--- /dev/null
+++ b/doc/reference/approx-app.rst
@@ -0,0 +1,14 @@
+General Application Autotuning API
+==================================
+
+.. autoclass:: predtuner.approxapp.ApproxApp
+   :members:
+
+.. autoclass:: predtuner.approxapp.ApproxTuner
+   :members:
+
+.. autoclass:: predtuner.approxapp.ApproxKnob
+   :members:
+
+.. autoclass:: predtuner.approxapp.Config
+   :members:
diff --git a/doc/reference/index.rst b/doc/reference/index.rst
index 2a3a0a68b70eb678e956900f9a7291e11bc8fbe6..bcffe6c75b357705c9bc85fc1daac0323282b1cc 100644
--- a/doc/reference/index.rst
+++ b/doc/reference/index.rst
@@ -1,11 +1,15 @@
-PyTorch Autotuning API
-======================
+PredTuner Autotuning API
+========================
 
-.. autoclass:: predtuner.torchapp.TorchApp
-   :members:
-   :undoc-members:
+:doc:`pytorch-app` documents a high-level API for autotuning PyTorch Modules.
 
-.. autoclass:: predtuner.modeledapp.ApproxModeledTuner
-   :members:
-   :inherited-members:
-   :undoc-members:
+PredTuner also supports predictive tuning of general applications that are not PyTorch Modules,
+or even empirical tuning of general applications that don't support predictive models.
+These lower-level APIs are documented in :doc:`modeled-app` and :doc:`approx-app` respectively.
+
+.. toctree::
+   :maxdepth: 1
+
+   pytorch-app
+   modeled-app
+   approx-app
diff --git a/doc/reference/modeled-app.rst b/doc/reference/modeled-app.rst
new file mode 100644
index 0000000000000000000000000000000000000000..458ffc38bb158559b2e75d0deacdec8e61effe19
--- /dev/null
+++ b/doc/reference/modeled-app.rst
@@ -0,0 +1,41 @@
+Predictive (Modeled) Autotuning API
+===================================
+
+.. autoclass:: predtuner.modeledapp.ModeledApp
+   :show-inheritance:
+   :members:
+
+.. autoclass:: predtuner.modeledapp.ApproxModeledTuner
+   :show-inheritance:
+   :members:
+
+.. autoclass:: predtuner.modeledapp.ValConfig
+   :show-inheritance:
+   :members:
+
+Predictive Model Interface
+----------------------------
+
+.. autoclass:: predtuner.modeledapp.IQoSModel
+   :members:
+
+.. autoclass:: predtuner.modeledapp.ICostModel
+   :members:
+
+Predefined Predictive Models
+----------------------------
+
+Below is a list of cost and QoS models already defined:
+
+* `predtuner.modeledapp.LinearCostModel`
+* `predtuner.modeledapp.QoSModelP1`
+* `predtuner.modeledapp.QoSModelP2`
+
+.. autoclass:: predtuner.modeledapp.LinearCostModel
+   :show-inheritance:
+
+.. autoclass:: predtuner.modeledapp.QoSModelP1
+   :show-inheritance:
+
+.. autoclass:: predtuner.modeledapp.QoSModelP2
+   :show-inheritance:
diff --git a/doc/reference/pytorch-app.rst b/doc/reference/pytorch-app.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4bd7e3d8f24d12c9fe53b54145c586d2d6a6f0e1
--- /dev/null
+++ b/doc/reference/pytorch-app.rst
@@ -0,0 +1,17 @@
+PyTorch Autotuning API
+======================
+
+.. autoclass:: predtuner.torchapp.TorchApp
+   :show-inheritance:
+   :members: get_tuner
+
+.. autofunction:: predtuner.approxes.get_knobs_from_file
+
+.. autofunction:: predtuner.torchutil.accuracy
+
+Defining New Approximation Knobs
+--------------------------------
+
+.. autoclass:: predtuner.torchapp.TorchApproxKnob
+   :show-inheritance:
+   :members:
diff --git a/doc/requirements.txt b/doc/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d4dc1a6039b99571b2800077062ae9236d39c710
--- /dev/null
+++ b/doc/requirements.txt
@@ -0,0 +1,3 @@
+sphinx>=3.5
+pydata-sphinx-theme==0.5.2
+numpydoc>=1.1
\ No newline at end of file
diff --git a/doc/tuning_result.png b/doc/tuning_result.png
deleted file mode 100644
index 6210102982aefa78a796a7a8593fe766e802dbf9..0000000000000000000000000000000000000000
Binary files a/doc/tuning_result.png and /dev/null differ
diff --git a/examples/tune_vgg16_cifar10.py b/examples/tune_vgg16_cifar10.py
index ce8e5a8a019f988fe578c7983d0cd0c60e6ee108..63f030550cd5f0e531d3a58853cf0693a5fed91c 100644
--- a/examples/tune_vgg16_cifar10.py
+++ b/examples/tune_vgg16_cifar10.py
@@ -42,10 +42,11 @@ baseline, _ = app.measure_qos_cost({}, False)
 # Get a tuner object and start tuning!
 tuner = app.get_tuner()
 tuner.tune(
-    max_iter=500,  # TODO: In practice, use at least 5000, or 10000
-    qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
+    max_iter=1000,  # TODO: In practice, use at least 5000, or 10000
+    qos_tuner_threshold=2.0,  # QoS threshold to guide tuner into
     qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the configurations
-    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
+    is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.0
+    take_best_n=20,  # Take the best 20 configs (not just the "strictly" best ones)
     cost_model="cost_linear",  # Use linear performance predictor
     qos_model="qos_p1",  # Use P1 QoS predictor
 )
diff --git a/predtuner/__init__.py b/predtuner/__init__.py
index babe3e013b099b78047d3cea48fea562ac834c8e..8e36a00f81476b7de70bec8d412fbd9f0b83b60b 100644
--- a/predtuner/__init__.py
+++ b/predtuner/__init__.py
@@ -2,9 +2,9 @@ from ._logging import config_pylogger
 from .approxapp import ApproxApp, ApproxKnob, ApproxTuner
 from .approxes import get_knobs_from_file
 from .modeledapp import (
-    IPerfModel,
+    ICostModel,
     IQoSModel,
-    LinearPerfModel,
+    LinearCostModel,
     ModeledApp,
     QoSModelP1,
     QoSModelP2,
diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py
index db856fb1ca898e9109e323ed73a6f9096f27695f..6ada08df00eb84051e9d98e510d3017b3132e14c 100644
--- a/predtuner/approxapp.py
+++ b/predtuner/approxapp.py
@@ -27,6 +27,16 @@ TunerConfigT = Dict[int, int]
 
 
 class ApproxKnob:
+    r"""Basic definition of an approximation knob.
+    An approximation knob is an instance of a type of approximation;
+    for example, Perforated Convolution is a type of approximation,
+    while row-perforated convolution with stride 2 is a knob.
+
+    :param name: The name of this approximation knob. Must be unique throughout.
+    :param devices: The devices this knob can be applied on.
+           Default is `None` which means all devices are supported.
+    """
+
     def __init__(
         self, name: str, devices: List[str] = None, baseline_priority: int = None
     ):
@@ -35,6 +45,10 @@ class ApproxKnob:
         self.baseline_priority = baseline_priority
 
     def exists_on_device(self, device: str) -> bool:
+        """Returns True if this knob can be applied to an `ApproxApp` on device `device`.
+
+        :param device: The device to check for.
+        """
         if self.devices is None:
             return True
         return device in self.devices
@@ -57,50 +71,94 @@ class ApproxKnob:
 
 class ApproxApp(abc.ABC):
     """Generic approximable application with operator & knob enumeration,
-    and measures its own QoS and performance given a configuration.
-
-    Parameters
-    ----------
-    op_knobs:
-        a mapping from each operator (identified by str) to a list of applicable knobs.
+    and measures its own QoS and cost given a configuration.
+    (A configuration is a dictionary from operator name to a knob name.)
+    To use this class, inherit from it and implement `name` and `measure_qos_cost`.
+
+    :param op_knobs: a mapping from each operator (identified by str) to a list of applicable knobs.
+    :type op_knobs: Dict[str, List[ApproxKnob]]
+    :param target_device: the target device that this application should be tuned on.
+           Each knob has a number of devices it is supported on
+           (see `ApproxKnob.exists_on_device`)
+           and only knobs supported on `target_device` will be used for this application.
+    :type target_device: Optional[str]
+
+    :var baseline_knob: The baseline knob of this application.
+         This is derived by looking at all knobs defined in `op_knobs`
+         and deciding which is the baseline.
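+
+    A minimal subclass sketch (the name, QoS value, and cost value below are
+    hypothetical placeholders)::
+
+        class MyApp(ApproxApp):
+            @property
+            def name(self) -> str:
+                return "my_app"
+
+            def measure_qos_cost(self, with_approxes, is_test):
+                cfg = self.add_baseline_to_knobs(with_approxes)
+                # run the real program with `cfg` applied and measure it;
+                # placeholder numbers are returned here
+                return 90.0, 1.0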
     """
 
     def __init__(
-        self, op_knobs: Dict[str, List[ApproxKnob]], tuning_device: str = None
+        self, op_knobs: Dict[str, List[ApproxKnob]], target_device: Optional[str] = None
     ) -> None:
         super().__init__()
         self.op_knobs = op_knobs
-        if tuning_device:
-            self.op_knobs = self._filter_knob_by_device(self.op_knobs, tuning_device)
+        if target_device:
+            self.op_knobs = self._filter_knob_by_device(self.op_knobs, target_device)
         # Also modifies self.op_knobs in place.
         self.baseline_knob = self._check_get_baseline_knob_(self.op_knobs)
 
-    @abc.abstractmethod
-    def measure_qos_cost(
-        self, with_approxes: KnobsT, is_test: bool
-    ) -> Tuple[float, float]:
-        pass
+    @property
+    def ops(self) -> List[str]:
+        """A list of operators in this application.
+
+        :rtype: List[str]
+        """
+        return list(self.op_knobs)
+
+    @property
+    def knobs(self) -> List[ApproxKnob]:
+        """A list of all unique knobs (see `ApproxKnob`)
+        applicable to operators in this application.
+
+        :rtype: List[ApproxKnob]
+        """
+        knob_sets = [set(knobs) for knobs in self.op_knobs.values()]
+        return list(set.union(*knob_sets))
 
     def get_tuner(self) -> "ApproxTuner":
-        """We implement this function. Sets up an ApproxTuner instance
-        which the user can directly call `tune()` on with opentuner parameters."""
+        """Sets up an ApproxTuner instance which the user can directly call
+        `tune()` on with opentuner parameters."""
         return ApproxTuner(self)
 
     @property
     @abc.abstractmethod
     def name(self) -> str:
-        """Name of application. Acts as an identifier in many places, so
-        the user should try to make it unique."""
+        """The name of this application.
+        Acts as an identifier in many places, so the user should try to make it unique.
+
+        :rtype: str
+        """
         return ""
 
-    @property
-    def ops(self) -> List[str]:
-        return list(self.op_knobs)
+    @abc.abstractmethod
+    def measure_qos_cost(
+        self, with_approxes: KnobsT, is_test: bool
+    ) -> Tuple[float, float]:
+        """Measures the QoS and cost (time, energy, ...) of a given configuration.
 
-    @property
-    def knobs(self) -> List[ApproxKnob]:
-        knob_sets = [set(knobs) for knobs in self.op_knobs.values()]
-        return list(set.union(*knob_sets))
+        :param with_approxes: The approximation configuration to measure QoS and cost for.
+        :param is_test: If True, uses a "test" dataset/mode that is held away from the tuner
+               during tuning; otherwise uses the "tune" dataset.
+               How the "tune" and "test" modes behave is up to the user to define.
+        """
+        pass
+
+    def add_baseline_to_knobs(self, approxes: KnobsT) -> KnobsT:
+        """For each operator not appearing in the keys of configuration `approxes`
+        (a dictionary), map it to the baseline (see `ApproxApp.baseline_knob`).
+
+        `measure_qos_cost` should call this on the incoming config
+        if you wish to be able to abbreviate the configuration
+        (for example, you can write `measure_qos_cost({})` to get the baseline QoS).
+        This ensures all operators are present when the config is sent to the tuner.
+
+        :param approxes: the config to add baseline knobs to.
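+
+        A minimal sketch (operator and knob names here are hypothetical)::
+
+            # suppose self.ops == ["conv1", "conv2"] and the baseline knob is named "baseline"
+            self.add_baseline_to_knobs({"conv1": "perf_2"})
+            # -> {"conv1": "perf_2", "conv2": "baseline"}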
+        """
+        return {
+            op_name: approxes.get(op_name, self.baseline_knob.name)
+            for op_name in self.ops
+        }
 
     @staticmethod
     def _check_get_baseline_knob_(
@@ -124,14 +182,19 @@ class ApproxApp(abc.ABC):
             for op, knobs in op_knobs.items()
         }
 
-    def add_baseline_to_knobs(self, approxes: KnobsT):
-        return {
-            op_name: approxes.get(op_name, self.baseline_knob.name)
-            for op_name in self.ops
-        }
-
 
 class Config:
+    """An approximation configuration with its measurement results, including QoS and cost.
+
+    :param qos: The QoS of this config (measured on tuning mode, see `ApproxApp.measure_qos_cost`).
+    :param cost: The *relative* cost (time, energy, etc.) of this config
+           compared to the baseline config. This is essentially :math:`1 / speedup`.
+    :param knobs: The op-knob mapping in this configuration.
+    :param test_qos: The QoS of this config on test mode (see `ApproxApp.measure_qos_cost`).
+           This is optional as it is filled in only after the config-testing phase
+           (which can be opted out of). See `ApproxTuner.tune`.
+    """
+
     def __init__(
         self, qos: float, cost: float, knobs: KnobsT, test_qos: Optional[float] = None
     ) -> None:
@@ -148,22 +211,32 @@ class Config:
 T = TypeVar("T", bound=Config)
 
 
-# IOpenTuner is generic over the type of the config
+# ApproxTuner is generic over the type of the config
 # So that the user can use custom Config inherited from Config
 # (in which case they need to override `get_all_configs_from_db`).
 class ApproxTuner(Generic[T]):
+    """Supports tuning and holds all tuning results.
+    `ApproxTuner.tune` is the main method for tuning.
+
+    An instance of `ApproxTuner` can be obtained from `ApproxApp.get_tuner`.
+
+    :param app: the application to tune.
+    """
+
     def __init__(self, app: ApproxApp) -> None:
         self.app = app
         self._tuned = False
         self.all_configs = []
         self.kept_configs = []
+        self.best_configs_prefilter = []
         self.best_configs = []
         # The following will be filled after self.tune() is called
-        self.keep_threshold = None
-        self.baseline_qos = None
+        self.baseline_tune_qos, self.baseline_test_qos = None, None
+        self.tune_keep_threshold, self.test_keep_threshold = None, None
 
     @property
     def tuned(self) -> bool:
+        """Returns True if `tune` has been called at least once."""
         return self._tuned
 
     def tune(
@@ -177,6 +250,30 @@ class ApproxTuner(Generic[T]):
         app_kwargs: dict = None
         # TODO: more parameters + opentuner param forwarding
     ) -> List[T]:
+        """Runs a tuning session.
+
+        :param max_iter: Number of iterations to use in tuning.
+        :param qos_tuner_threshold: The QoS threshold that the tuner should aim for.
+               QoS is assumed to be a higher-better quantity.
+               This should be slightly tighter than `qos_keep_threshold`
+               to account for extra error when running on test dataset.
+        :param qos_keep_threshold: The QoS threshold above which a configuration is kept.
+               By default it is equal to `qos_tuner_threshold`.
+        :param is_threshold_relative: If True, the actual thresholds are considered to be
+               ``baseline_qos - given_threshold``.
+               This applies to `qos_tuner_threshold` and `qos_keep_threshold`.
+        :param take_best_n: Take the best :math:`n` configurations after tuning.
+               "Best" is defined as the configurations closest to the Pareto curve
+               of the QoS-cost tradeoff space.
+               If `take_best_n` is None, only the configurations strictly on the
+               Pareto curve are taken.
+        :param test_configs: If True, runs the configs on the test dataset,
+               filters the taken configs by `qos_keep_threshold`,
+               and fills in the `test_qos` field of each `Config`.
+        :param app_kwargs: Additional arguments to pass to
+               `ApproxApp.measure_qos_cost` during tuning.
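+
+        A typical invocation (the threshold values here are arbitrary examples)::
+
+            configs = tuner.tune(
+                max_iter=1000,
+                qos_tuner_threshold=2.0,
+                qos_keep_threshold=3.0,
+                is_threshold_relative=True,
+                take_best_n=20,
+            )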
+        """
+
         from opentuner.tuningrunmain import TuningRunMain
 
         from ._dbloader import read_opentuner_db
@@ -200,7 +297,7 @@ class ApproxTuner(Generic[T]):
             is_threshold_relative,
             app_kwargs or {},
         )
-        assert self.keep_threshold is not None
+        assert self.tune_keep_threshold is not None
         trm = TuningRunMain(tuner, opentuner_args)
         # TuningRunMain.__init__ initializes its own logger, so we'll override it and use ours
         override_opentuner_config()
@@ -219,53 +316,36 @@ class ApproxTuner(Generic[T]):
             for result, configuration in read_opentuner_db(opentuner_args.database)
         ]
         self.kept_configs = [
-            cfg for cfg in self.all_configs if cfg.qos > self.keep_threshold
+            cfg for cfg in self.all_configs if cfg.qos > self.tune_keep_threshold
         ]
-        self.best_configs = self.take_best_configs(self.kept_configs, take_best_n)
+        self.best_configs_prefilter = self._take_best_configs(
+            self.kept_configs, take_best_n
+        )
         msg_logger.info(
             "Tuning finished with %d configs in total, "
             "%d configs above keeping threshold, "
             "and %d configs selected on tradeoff curve",
             len(self.all_configs),
             len(self.kept_configs),
-            len(self.best_configs),
+            len(self.best_configs_prefilter),
         )
         if test_configs:
-            msg_logger.info("Calibrating configurations on test inputs")
-            self.best_configs = self.test_configs(self.best_configs)
+            msg_logger.info("Running configurations on test inputs")
+            # Also fills in the test QoS of self.best_configs_prefilter
+            self.best_configs = self._test_configs_(self.best_configs_prefilter)
+        else:
+            self.best_configs = self.best_configs_prefilter
         return self.best_configs
 
-    def test_configs(self, configs: List[Config]):
-        from copy import deepcopy
-
-        from tqdm import tqdm
-
-        assert self.keep_threshold is not None
-        if not configs:
-            return []
-        ret_configs = []
-        total_error = 0
-        for cfg in tqdm(configs, leave=False):
-            cfg = deepcopy(cfg)
-            assert cfg.test_qos is None
-            cfg.test_qos, _ = self.app.measure_qos_cost(cfg.knobs, True)
-            msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {cfg.test_qos} (mean)")
-            total_error += abs(cfg.qos - cfg.test_qos)
-            if cfg.test_qos > self.keep_threshold:
-                ret_configs.append(cfg)
-            else:
-                msg_logger.debug("Config removed")
-        mean_err = total_error / len(configs)
-        msg_logger.info("QoS mean abs difference of calibration: %f", mean_err)
-        return ret_configs
+    def dump_configs(self, filepath: PathLike, best_only: bool = True):
+        """Writes configuration to a JSON file.
 
-    @staticmethod
-    def take_best_configs(configs: List[T], n: Optional[int] = None) -> List[T]:
-        points = np.array([(c.qos, c.speedup) for c in configs])
-        taken_idx = is_pareto_efficient(points, take_n=n)
-        return [configs[i] for i in taken_idx]
+        :param filepath: The JSON file to write into.
+        :param best_only: If True, writes only the "best" configurations
+               (filtered after running on the test dataset, if enabled).
+               Otherwise, writes all configurations within the given QoS threshold.
+        """
 
-    def dump_configs(self, filepath: PathLike, best_only: bool = True):
         import os
 
         from jsonpickle import encode
@@ -284,34 +364,134 @@ class ApproxTuner(Generic[T]):
         self,
         show_qos_loss: bool = False,
         connect_best_points: bool = False,
-        use_test_qos: bool = False,
     ) -> plt.Figure:
+        """Plots 1 or 2 QoS-vs-speedup scatter plot of configurations.
+
+        All kept configurations and all "best" configurations (before test-set filtering if any)
+        are always plotted in the first subplot.
+        If test-set filtering was used, the second subplot contains the "best" configurations
+        plotted twice, with tune-set and test-set QoS loss respectively.
+
+        :param show_qos_loss: If True, uses the loss of QoS (compared to the baseline)
+               instead of the absolute QoS in the first graph.
+               *This does not apply to the second graph* if it exists,
+               which always uses QoS loss for ease of comparison.
+        :param connect_best_points: If True, connects the "best" configuration points
+               with a line in the scatter plot.
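+
+        A usage sketch (the output file name is arbitrary)::
+
+            fig = tuner.plot_configs(show_qos_loss=True)
+            fig.savefig("configs.png", dpi=300)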
+        """
+
         if not self.tuned:
             raise RuntimeError(
                 f"No tuning session has been run; call self.tune() first."
             )
+        # `plot_test_phase()`, when called without an `ax` argument, returns
+        # whether the second (test-phase) plot can be drawn.
+        dot_format = "-o" if connect_best_points else "o"
+        if self.plot_test_phase():
+            fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 6), dpi=300)
+            self.plot_kept_and_best(ax0, show_qos_loss)
+            self.plot_test_phase(ax1, dot_format)
+        else:
+            fig, ax0 = plt.subplots(1, 1, figsize=(6, 6), dpi=300)
+            self.plot_kept_and_best(ax0, show_qos_loss)
+        fig.tight_layout()
+        return fig
+
+    def plot_kept_and_best(self, ax: plt.Axes, show_qos_loss: bool):
+        kept_confs = self._config_qos_speedups(
+            self.kept_configs, "qos", show_qos_loss, False
+        )
+        best_confs = self._config_qos_speedups(
+            self.best_configs_prefilter, "qos", show_qos_loss, False
+        )
+        ax.plot(kept_confs[0], kept_confs[1], "o", label="Kept Configs")
+        ax.plot(best_confs[0], best_confs[1], "o", label="Best Configs")
+        self._set_xy_limit(ax, show_qos_loss)
+        if show_qos_loss:
+            rthres = self.baseline_tune_qos - self.tune_keep_threshold
+            self._draw_qos_line(ax, rthres, f"Relative threshold: {rthres:.2f}")
+            ax.set_xlabel("QoS Loss (tune dataset)")
+        else:
+            bqos, thres = self.baseline_tune_qos, self.tune_keep_threshold
+            self._draw_qos_line(ax, bqos, f"Baseline QoS: {bqos:.2f}")
+            self._draw_qos_line(ax, thres, f"Threshold: {thres:.2f}")
+            ax.set_xlabel("QoS (tune dataset)")
+        ax.set_ylabel("Speedup (x)")
+        ax.legend()
 
-        def qos_speedup(conf):
-            return conf.test_qos if use_test_qos else conf.qos, conf.speedup
-
-        def get_points(confs):
-            sorted_points = np.array(
-                sorted([qos_speedup(c) for c in confs], key=lambda p: p[0])
-            ).T
-            if show_qos_loss:
-                sorted_points[0] = self.baseline_qos - sorted_points[0]
-            return sorted_points
-
-        fig, ax = plt.subplots()
-        kept_confs = get_points(self.kept_configs)
-        best_confs = get_points(self.best_configs)
-        ax.plot(kept_confs[0], kept_confs[1], "o", label="valid")
-        mode = "-o" if connect_best_points else "o"
-        ax.plot(best_confs[0], best_confs[1], mode, label="best")
-        ax.set_xlabel("QoS Loss" if show_qos_loss else "QoS")
+    def plot_test_phase(
+        self, ax: plt.Axes = None, dot_format: str = "o", _tune_key: str = "qos"
+    ):
+        configs = self.best_configs_prefilter
+        tested = [conf.test_qos is not None for conf in configs]
+        can_plot = all(tested)
+        if not ax:
+            return can_plot
+        assert can_plot
+        tune_x, tune_y = self._config_qos_speedups(configs, _tune_key, True, False)
+        test_x, test_y = self._config_qos_speedups(configs, "test_qos", True, True)
+        ax.plot(tune_x, tune_y, dot_format, label="Tune-set QoS")
+        ax.plot(test_x, test_y, dot_format, label="Test-set QoS")
+        self._set_xy_limit(ax)
+        rthres = self.baseline_tune_qos - self.tune_keep_threshold
+        self._draw_qos_line(ax, rthres, f"Relative threshold: {rthres:.2f}")
+        ax.set_xlabel("QoS Loss")
         ax.set_ylabel("Speedup (x)")
         ax.legend()
-        return fig
+
+    def _set_xy_limit(self, ax: plt.Axes, show_qos_loss: bool = True):
+        xmin, _ = ax.get_xlim()
+        ymin, _ = ax.get_ylim()
+        if show_qos_loss:
+            ax.set_xlim(xmin=min(0, xmin))
+        ax.set_ylim(ymin=min(1, ymin))
+
+    def _config_qos_speedups(
+        self,
+        configs: List[Config],
+        qos_attr: str,
+        qos_loss: bool,
+        baseline_is_test: bool,
+    ):
+        def qos_speedup(conf: Config):
+            qos = getattr(conf, qos_attr)
+            bqos = (
+                self.baseline_test_qos if baseline_is_test else self.baseline_tune_qos
+            )
+            return bqos - qos if qos_loss else qos, conf.speedup
+
+        if not configs:
+            return np.zeros((2, 0))
+        sorted_points = np.array(
+            sorted([qos_speedup(c) for c in configs], key=lambda p: p[0])
+        ).T
+        return sorted_points
+
+    @staticmethod
+    def _draw_qos_line(ax: plt.Axes, qos: float, text: str):
+        ymin, ymax = ax.get_ylim()
+        ymid = (ymin + ymax) / 2
+        ax.axvline(qos)
+        ax.annotate(text, (qos, ymid), rotation=90, verticalalignment="center")
+
+    @staticmethod
+    def _take_best_configs(configs: List[T], n: Optional[int] = None) -> List[T]:
+        points = np.array([(c.qos, c.speedup) for c in configs])
+        taken_idx = is_pareto_efficient(points, take_n=n)
+        return [configs[i] for i in taken_idx]
+
+    def _test_configs_(self, configs: List[Config]):
+        from tqdm import tqdm
+
+        assert self.test_keep_threshold is not None
+        if not configs:
+            return []
+        total_error = 0
+        for cfg in tqdm(configs, leave=False):
+            assert cfg.test_qos is None
+            cfg.test_qos, _ = self.app.measure_qos_cost(cfg.knobs, True)
+            msg_logger.debug(f"Test dataset: {cfg.qos:.3f} -> {cfg.test_qos:.3f}")
+            total_error += abs(cfg.qos - cfg.test_qos)
+        mean_err = total_error / len(configs)
+        msg_logger.debug("QoS changed by %f on test dataset (mean abs diff)", mean_err)
+        return [cfg for cfg in configs if cfg.test_qos > self.test_keep_threshold]
 
     def _get_tuner_interface(
         self,
@@ -323,22 +503,33 @@ class ApproxTuner(Generic[T]):
         app_kwargs: dict,
     ) -> "TunerInterface":
         # By default, keep_threshold == tuner_threshold
-        self.keep_threshold = qos_keep_threshold or qos_tuner_threshold
+        keep_threshold = qos_keep_threshold or qos_tuner_threshold
         if is_threshold_relative:
-            self.baseline_qos, _ = self.app.measure_qos_cost({}, False)
-            qos_tuner_threshold = self.baseline_qos - qos_tuner_threshold
-            self.keep_threshold = self.baseline_qos - self.keep_threshold
+            self.baseline_tune_qos, _ = self.app.measure_qos_cost({}, False)
+            self.baseline_test_qos, _ = self.app.measure_qos_cost({}, True)
+            # Now abs threshold
+            qos_tuner_threshold = self.baseline_tune_qos - qos_tuner_threshold
+            # These are also abs thresholds
+            self.tune_keep_threshold = self.baseline_tune_qos - keep_threshold
+            self.test_keep_threshold = self.baseline_test_qos - keep_threshold
+            msg_logger.info(
+                "Using relative thresholds: baseline QoS = %f (tune set) and %f (test set)",
+                self.baseline_tune_qos,
+                self.baseline_test_qos,
+            )
+        else:
+            self.tune_keep_threshold = self.test_keep_threshold = keep_threshold
         opentuner_args.test_limit = max_iter
         msg_logger.info(
-            "Tuner QoS threshold: %f; keeping configurations with QoS >= %f",
+            "Tuner QoS threshold: %f; keeping configurations with QoS >= %f (tune dataset)",
             qos_tuner_threshold,
-            self.keep_threshold,
+            self.tune_keep_threshold,
         )
         return TunerInterface(
             opentuner_args,
             self.app,
             qos_tuner_threshold,
-            self.keep_threshold,
+            self.tune_keep_threshold,
             max_iter,
             **app_kwargs,
         )
@@ -400,7 +591,7 @@ class TunerInterface(MeasurementInterface):
         return manipulator
 
     def run(self, desired_result, input_, limit):
-        """Run a given configuration then return performance and accuracy."""
+        """Run a given configuration then return cost and QoS."""
         from opentuner.resultsdb.models import Result
 
         cfg = desired_result.configuration.data
diff --git a/predtuner/approxes/approxes.py b/predtuner/approxes/approxes.py
index 0125de74f71a8a57e0148682e528ac687450da4a..f81265231aae55a76594c0cd7adc7395a3723c4e 100644
--- a/predtuner/approxes/approxes.py
+++ b/predtuner/approxes/approxes.py
@@ -393,6 +393,21 @@ def get_knobs_from_file(
     filepath: PathLike = default_knob_file,
     extra_name_to_class: Dict[str, Type[TorchApproxKnob]] = None,
 ) -> Set[TorchApproxKnob]:
+    """get_knobs_from_file(filepath=default_knob_file, extra_name_to_class=None)
+
+    Constructs and returns a set of `TorchApproxKnob` from a knob declaration file.
+    `default_knob_file` points to a file that is contained in the predtuner package,
+    so just calling ``get_knobs_from_file()`` should provide a set of predefined knobs already.
+
+    :param filepath: the knob declaration file (JSON) to read from.
+    :param extra_name_to_class: a mapping from the name of the approximation to the
+           class (implementation) of the approximation.
+           If not given, only the builtin approximations will be considered
+           when parsing the declaration file.
+    :type extra_name_to_class: Dict[str, Type[TorchApproxKnob]]
+    :rtype: Set[TorchApproxKnob]
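+
+    For example, to get the set of predefined knobs shipped with PredTuner::
+
+        conv_knobs = get_knobs_from_file()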
+    """
+
     import json
 
     extra_name_to_class = extra_name_to_class or {}
diff --git a/predtuner/modeledapp.py b/predtuner/modeledapp.py
index fa30d8e7ddd831c3fabc728867f06ca024f03cac..ce1e0d82cea0b9dc162883bf4d44bdb02c8843e1 100644
--- a/predtuner/modeledapp.py
+++ b/predtuner/modeledapp.py
@@ -1,6 +1,7 @@
 import abc
 import json
 import logging
+import os
 import pickle
 from pathlib import Path
 from typing import Callable, Dict, Iterator, List, Optional, Tuple, Type, Union
@@ -17,70 +18,102 @@ msg_logger = logging.getLogger(__name__)
 
 
 class ModeledApp(ApproxApp, abc.ABC):
-    """Approximable application that inherits at least 1 interface for performance/QoS modeling.
+    """Like `.approxapp.ApproxApp`, but uses a model for QoS/cost measurement.
 
-    It's invalid to inherit from this class without also implementing at least 1 interface
-    provided in this set of API;
-    for non-modeling application, inherit from `ApproxApp` instead.
+    To use this class, inherit from it and implement `get_models`,
+    `empirical_measure_qos_cost`, and `.approxapp.ApproxApp.name`.
+    (This class provides an implementation of `.approxapp.ApproxApp.measure_qos_cost`.)
+
+    :param op_knobs: a mapping from each operator (identified by str) to a list of applicable knobs.
+    :type op_knobs: Dict[str, List[ApproxKnob]]
+    :param target_device: the target device that this application should be tuned on.
+           See `.approxapp.ApproxApp` constructor.
+    :type target_device: Optional[str]
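+
+    A minimal sketch of `get_models` in a subclass (the operator cost, knob
+    speedup, and the two helper attributes used below are hypothetical)::
+
+        def get_models(self):
+            return [
+                LinearCostModel(self, {"conv1": 1.0}, {"perf_2": 2.0}),
+                QoSModelP1(self, self._get_raw_output, self._qos_metric),
+            ]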
     """
 
-    def __init__(self, op_knobs: Dict[str, List[ApproxKnob]], tuning_device: str = None) -> None:
-        super().__init__(op_knobs, tuning_device)
+    def __init__(
+        self, op_knobs: Dict[str, List[ApproxKnob]], target_device: str = None
+    ) -> None:
+        super().__init__(op_knobs, target_device)
         models = self.get_models()
         self._name_to_model = {m.name: m for m in models}
         if len(self._name_to_model) != len(models):
             raise ValueError("Name conflict in models")
         self._cost_models = {
-            model.name: model for model in models if isinstance(model, IPerfModel)
+            model.name: model for model in models if isinstance(model, ICostModel)
         }
         self._qos_models = {
             model.name: model for model in models if isinstance(model, IQoSModel)
         }
 
     @abc.abstractmethod
-    def get_models(self) -> List[Union["IPerfModel", "IQoSModel"]]:
-        """Get QoS/Performance prediction models for this application."""
+    def get_models(self) -> List[Union["ICostModel", "IQoSModel"]]:
+        """A list of QoS/Cost prediction models for this application.
+
+        Cost models should inherit from `ICostModel`
+        while QoS models should inherit from `IQoSModel`.
+
+        :rtype: List[Union[ICostModel, IQoSModel]]
+        """
         pass
 
+    @abc.abstractmethod
     def empirical_measure_qos_cost(
         self, with_approxes: KnobsT, is_test: bool
     ) -> Tuple[float, float]:
-        """Measures QoS and performance by running the program with approximation.
+        """Empirically measures QoS and cost by actually
+        running the program with approximation (as opposed to using a model).
 
-        An implementation is not necessary if empirical measurement is never intended.
+        :param with_approxes: The approximation configuration to measure QoS and cost for.
+        :param is_test: If True, uses a "test" dataset/mode that is held away from the tuner
+               during tuning.
         """
-        raise NotImplementedError()
 
     def measure_qos_cost(
         self,
         with_approxes: KnobsT,
         is_test: bool,
-        qos_model: str = "none",
-        cost_model: str = "none",
+        qos_model: Optional[str] = None,
+        cost_model: Optional[str] = None,
     ) -> Tuple[float, float]:
-        """We provide this with the right qos and cost function.
-
-        Empirical measurement will be called once if either `cost_model` or `qos_model`
-        is "none", otherwise only use model indicated by model name.
+        """Returns the QoS and cost (time, energy, ...) of a given configuration,
+        *potentially using models*.
+
+        If either of `cost_model` or `qos_model` is None,
+        this will perform one empirical measurement to obtain the value(s) not covered by a model.
+        Otherwise, no empirical measurement will be used.
+
+        Note that when running on the test set (``is_test == True``), no modeling is allowed
+        (this raises a `ValueError`).
+
+        :param with_approxes: The approximation configuration to measure QoS and cost for.
+        :param is_test: If True, uses a "test" dataset/mode that is held away from the tuner
+               during tuning; otherwise uses the "tune" dataset.
+        :param qos_model: The QoS model to use in this measurement, keyed by model's name
+               (See `IQoSModel.name`).
+        :param cost_model: The Cost model to use in this measurement, keyed by model's name
+               (See `ICostModel.name`).
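+
+        For example, using the predefined model names::
+
+            qos, cost = app.measure_qos_cost({}, False, "qos_p1", "cost_linear")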
         """
         # Testset measurement is always empirical
         if is_test:
+            if qos_model is not None or cost_model is not None:
+                raise ValueError("Test dataset measurement is always empirical")
             return self.empirical_measure_qos_cost(with_approxes, is_test)
         # Run empirical measurement once if either cost or qos needs it
         qos, cost = None, None
-        if qos_model == "none" or cost_model == "none":
+        if qos_model is None or cost_model is None:
             qos, cost = self.empirical_measure_qos_cost(with_approxes, is_test)
         # If we're asked to use some qos_model, overwrite `qos` value
-        # even if we already get it from empirical measure (i.e., even if cost_model == "none")
-        if qos_model != "none":
+        # even if we already get it from empirical measure (i.e., even if cost_model is None)
+        if qos_model is not None:
             if qos_model not in self._qos_models:
                 raise ValueError(
                     f'"{qos_model}" is an invalid value for qos_model '
                     f"(choose from {list(self._qos_models.keys())})"
                 )
             qos = self._qos_models[qos_model].measure_qos(with_approxes)
-        # Same goes for perf
-        if cost_model != "none":
+        # Same goes for cost
+        if cost_model is not None:
             if cost_model not in self._cost_models:
                 raise ValueError(
                     f'"{cost_model}" is an invalid value for cost_model '
@@ -91,14 +124,23 @@ class ModeledApp(ApproxApp, abc.ABC):
         return qos, cost
 
     def get_tuner(self) -> "ApproxModeledTuner":
+        """Sets up an ApproxTuner instance which the user can directly call
+        `tune()` on with opentuner parameters.
+
+        This returns an `ApproxModeledTuner`, different from `.approxapp.ApproxApp.get_tuner`
+        which returns an `ApproxTuner`.
+
+        :rtype: ApproxModeledTuner
+        """
+
         return ApproxModeledTuner(self)
 
     def init_model(self, model_name: str):
         self._name_to_model[model_name]._init()
 
 
-class IPerfModel(abc.ABC):
-    """Abstract base class for models that provide performance prediction."""
+class ICostModel(abc.ABC):
+    """Abstract base class for models that provide cost prediction."""
 
     def __init__(self) -> None:
         self._inited = False
@@ -111,7 +153,10 @@ class IPerfModel(abc.ABC):
 
     @abc.abstractmethod
     def measure_cost(self, with_approxes: KnobsT) -> float:
-        """Predict the performance of application."""
+        """Predict the cost of application.
+
+        :param with_approxes: The configuration to predict cost for.
+        """
         pass
 
     def _init(self):
@@ -133,7 +178,10 @@ class IQoSModel(abc.ABC):
 
     @abc.abstractmethod
     def measure_qos(self, with_approxes: KnobsT) -> float:
-        """Predict the qos of application."""
+        """Predict the QoS of application.
+
+        :param with_approxes: The configuration to predict QoS for.
+        """
         pass
 
     def _init(self):
@@ -141,8 +189,16 @@ class IQoSModel(abc.ABC):
         self._inited = True
 
 
-class LinearPerfModel(IPerfModel):
-    """Weighted linear performance predictor based on cost of each operator."""
+class LinearCostModel(ICostModel):
+    """Weighted linear cost predictor based on cost of each operator.
+
+    This predictor computes a weighted sum that combines the cost of each operator
+    with the speedup of the knob applied to that operator.
+
+    :param app: The `ModeledApp` to predict cost for.
+    :param op_costs: A mapping from operator name to its (baseline) cost.
+    :param knob_speedups: A mapping from knob name to its (expected) speedup.
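+
+    A construction sketch (operator/knob names and numbers are illustrative;
+    conceptually, the predicted cost sums per-operator costs scaled down by the
+    speedup of the knob chosen for each operator)::
+
+        model = LinearCostModel(
+            app,
+            op_costs={"conv1": 10.0, "fc1": 1.0},            # e.g., FLOPs per operator
+            knob_speedups={"fp16": 1.5, "perforated": 2.0},  # expected speedup per knob
+        )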
+    """
 
     def __init__(
         self,
@@ -167,7 +223,6 @@ class LinearPerfModel(IPerfModel):
         return "cost_linear"
 
     def measure_cost(self, with_approxes: KnobsT) -> float:
-        """We implement this using a weighted linear performance model."""
         with_approxes = self.app.add_baseline_to_knobs(with_approxes)
         return float(
             sum(self.cost_df.loc[layer, knob] for layer, knob in with_approxes.items())
@@ -177,13 +232,17 @@ class LinearPerfModel(IPerfModel):
 class QoSModelP1(IQoSModel):
     """QoS model `P1` in ApproxTuner.
 
-    tensor_output_getter: Run the tensor-based application with config `with_approxes` applied,
-        and return a single tensor result.
+    :param app: The `ModeledApp` to predict QoS for.
+    :param tensor_output_getter: A function that can run the
+           tensor-based application with a config and return a single tensor result.
 
-        Note that while we require the return value to be a PyTorch tensor,
-        user is free to implement this on non-PyTorch applications.
+           Note that here we require the return value to be a PyTorch tensor.
 
-    qos_metric: Compute a Quality of Service level from the tensor output of application
+    :param qos_metric: A function that computes a QoS level from the return value
+           of `tensor_output_getter`.
+    :param storage: A PyTorch-format file to store this model into if the file doesn't exist,
+           or to load the model from if it exists.
+           If not given, the model will not be stored.
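+
+    A construction sketch (``my_app``, ``run_to_tensor`` and ``tensor_to_accuracy``
+    are placeholder user objects/callables; ``Path`` is `pathlib.Path`)::
+
+        model = QoSModelP1(
+            my_app,
+            tensor_output_getter=run_to_tensor,  # KnobsT -> torch.Tensor
+            qos_metric=tensor_to_accuracy,       # torch.Tensor -> float
+            storage=Path("models/p1.pkl"),
+        )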
     """
 
     def __init__(
@@ -210,7 +269,6 @@ class QoSModelP1(IQoSModel):
         return "qos_p1"
 
     def measure_qos(self, with_approxes: KnobsT) -> float:
-        """Implementation of model."""
         assert self.baseline_tensor is not None
         with_approxes = self.app.add_baseline_to_knobs(with_approxes)
         delta_sum = sum(
@@ -220,45 +278,31 @@ class QoSModelP1(IQoSModel):
         return float(self.qos_metric(ret))
 
     def _init(self):
+        if self.storage and self.storage.is_file():
+            self.delta_tensors, self.baseline_tensor = torch.load(self.storage)
         dt = self.delta_tensors
         btensor = self.baseline_tensor = self.output_f({})
-        if self.storage and self.storage.is_file():
-            for op, knob, delta_tensor in self._load(self.storage):
-                dt[op][knob] = delta_tensor
+        updated = False
         for op, knob in barred_ravel_knobs(self.app):
             if dt[op][knob] is not None:
                 continue
+            updated = True
             delta_tensor = self.output_f({op: knob}) - btensor
             dt[op][knob] = delta_tensor
-            self._try_append_save(self.storage, op, knob, delta_tensor)
+        if self.storage and updated:
+            os.makedirs(self.storage.parent, exist_ok=True)
+            torch.save((dt, btensor), self.storage)
         super()._init()
 
-    def _load(self, path: Path) -> Iterator[Tuple[str, str, torch.Tensor]]:
-        msg_logger.info(f"Model {self.name} found saved model at {path}")
-        with path.open("rb") as f:
-            while True:
-                try:
-                    op_name, knob_name, tensor = pickle.load(f)
-                    yield op_name, knob_name, tensor
-                except EOFError:
-                    return
-
-    @staticmethod
-    def _try_append_save(
-        path: Optional[Path], op_name: str, knob_name: str, tensor: torch.Tensor
-    ):
-        import os
-
-        if not path:
-            return
-        if not path.parent.is_dir():
-            os.makedirs(path.parent)
-        with path.open("ab") as f:
-            pickle.dump((op_name, knob_name, tensor), f)
-
 
 class QoSModelP2(IQoSModel):
-    """QoS model `P2` in ApproxTuner."""
+    """QoS model `P1` in ApproxTuner.
+
+    :param app: The `ModeledApp` to predict QoS for.
+    :param storage: A JSON file to store this model into if the file doesn't exist,
+           or to load the model from if it exists.
+           If not given, the model will not be stored.
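+
+    A construction sketch (the storage path is illustrative; ``Path`` is `pathlib.Path`)::
+
+        model = QoSModelP2(my_app, storage=Path("models/p2.json"))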
+    """
 
     def __init__(self, app: ModeledApp, storage: PathLike = None) -> None:
         super().__init__()
@@ -326,8 +370,6 @@ class QoSModelP2(IQoSModel):
         self.baseline_qos = float(data["bqos"])
 
     def _save(self, path: Path):
-        import os
-
         if not path.parent.is_dir():
             os.makedirs(path.parent)
         with path.open("w") as f:
@@ -342,6 +384,22 @@ class QoSModelP2(IQoSModel):
 
 
 class ValConfig(Config):
+    """An `.approxapp.Config` that also optionally stores the "validation QoS".
+
+    Validation QoS is the empirically measured QoS in the "validation phase"
+    at the end of tuning (see `ApproxModeledTuner.tune`).
+
+    :param qos: The (possibly model-predicted) QoS of this config.
+           (If tuning is empirical, this is an empirically measured QoS.)
+           This is in contrast to `Config.qos`, which is always measured empirically on the tuning dataset.
+    :param cost: The *relative* cost (time, energy, etc.) of this config
+           compared to the baseline config. This is essentially :math:`1 / speedup`.
+    :param knobs: The op-knob mapping in this configuration.
+    :param test_qos: The empirically measured QoS of this config on the test dataset.
+    :param validated_qos: The empirically measured QoS of this config on the tuning dataset,
+           in the validation phase. See `ApproxModeledTuner.tune`.
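+
+    For example, one may inspect the configs returned by `ApproxModeledTuner.tune`
+    like this (sketch, with ``tuner`` being an `ApproxModeledTuner` after tuning)::
+
+        for cfg in tuner.best_configs:
+            print(cfg.knobs, cfg.qos, cfg.validated_qos, cfg.test_qos, cfg.cost)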
+    """
+
     def __init__(
         self,
         qos: float,
@@ -366,25 +424,51 @@ class ApproxModeledTuner(ApproxTuner):
         take_best_n: Optional[int] = None,
         test_configs: bool = True,
         validate_configs: Optional[bool] = None,
-        cost_model: str = "none",
-        qos_model: str = "none",
+        cost_model: Optional[str] = None,
+        qos_model: Optional[str] = None,
     ) -> List[ValConfig]:
+        """Runs a tuning session.
+
+        :param max_iter: Number of iterations to use in tuning.
+        :param qos_tuner_threshold: The QoS threshold that the tuner should aim for.
+               QoS is assumed to be a higher-better quantity.
+               This should be slightly tighter than `qos_keep_threshold`
+               to account for extra error when running on test dataset.
+        :param qos_keep_threshold: The QoS threshold above which a configuration is kept.
+               By default it is equal to `qos_tuner_threshold`.
+        :param is_threshold_relative: If True, the actual thresholds are considered to be
+               ``baseline_qos - given_threshold``.
+               This applies to `qos_tuner_threshold` and `qos_keep_threshold`.
+        :param take_best_n: Take the best :math:`n` configurations after tuning.
+               "Best" is defined as the configurations closest to the Pareto curve
+               of the QoS-cost tradeoff space.
+               If `take_best_n` is None, only the configurations strictly on the
+               Pareto curve are taken.
+        :param test_configs: If True, runs the configs on the test dataset,
+               filters the taken configs by `qos_keep_threshold`,
+               and fills the `test_qos` field of `ValConfig`.
+        :param validate_configs: If True, runs a validation step that empirically measures
+               the QoS of the configs, filters the taken configs by `qos_keep_threshold`,
+               and fills the `validated_qos` field of `ValConfig`.
+        :param cost_model: The cost model to use for this tuning session.
+        :param qos_model: The QoS model to use for this tuning session.
+               This and `cost_model` are relayed down the line to `ModeledApp.measure_qos_cost`.
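+
+        A typical predictive-tuning call (sketch, with ``tuner`` obtained from
+        `ModeledApp.get_tuner`; thresholds and iteration count are illustrative)::
+
+            configs = tuner.tune(
+                max_iter=1000,
+                qos_tuner_threshold=2.1,  # slightly tighter than the keep threshold
+                qos_keep_threshold=3.0,
+                is_threshold_relative=True,
+                cost_model="cost_linear",
+                qos_model="qos_p1",
+            )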
+        """
+
         qos_desc = (
-            "no model for qos" if qos_model == "none" else f'qos model "{qos_model}"'
+            "no model for qos" if qos_model is None else f'qos model "{qos_model}"'
         )
         cost_desc = (
-            "no model for performance"
-            if cost_model == "none"
-            else f'performance model "{cost_model}"'
+            "no model for cost" if cost_model is None else f'cost model "{cost_model}"'
         )
         msg_logger.info("Starting tuning with %s and %s", qos_desc, cost_desc)
-        if qos_model != "none":
+        if qos_model is not None:
             msg_logger.info("Initializing qos model %s", qos_model)
             self.app.init_model(qos_model)
-        if cost_model != "none":
-            msg_logger.info("Initializing performance model %s", cost_model)
+        if cost_model is not None:
+            msg_logger.info("Initializing cost model %s", cost_model)
             self.app.init_model(cost_model)
-        ret = super().tune(
+        super().tune(
             max_iter=max_iter,
             qos_tuner_threshold=qos_tuner_threshold,
             qos_keep_threshold=qos_keep_threshold,
@@ -393,93 +477,136 @@ class ApproxModeledTuner(ApproxTuner):
             test_configs=False,  # Test configs below by ourselves
             app_kwargs={"cost_model": cost_model, "qos_model": qos_model},
         )
-        if validate_configs is None and qos_model != "none":
+        if validate_configs is None and qos_model is not None:
             msg_logger.info(
                 'Validating configurations due to using qos model "%s"', qos_model
             )
-            self.best_configs = self._update_configs(self.best_configs, False)
+            self._update_configs_(self.best_configs_prefilter, False)
         elif validate_configs:
             msg_logger.info("Validating configurations as user requested")
-            self.best_configs = self._update_configs(self.best_configs, False)
+            self._update_configs_(self.best_configs_prefilter, False)
         if test_configs:
             msg_logger.info("Calibrating configurations on test inputs")
-            self.best_configs = self._update_configs(self.best_configs, True)
-        return ret
-
-    def _update_configs(self, configs: List[ValConfig], test_mode: bool):
-        from copy import deepcopy
+            self._update_configs_(self.best_configs_prefilter, True)
+        self.best_configs = self._filter_configs(self.best_configs_prefilter)
+        return self.best_configs
 
+    def _update_configs_(self, configs: List[ValConfig], test_mode: bool):
         from tqdm import tqdm
 
-        assert self.keep_threshold is not None
         if not configs:
             msg_logger.info("No configurations found.")
-            return []
+            return
         ret_configs = []
         total_error = 0
         for cfg in tqdm(configs, leave=False):
-            cfg = deepcopy(cfg)
             qos, _ = self.app.measure_qos_cost(cfg.knobs, test_mode)
             if test_mode:
                 assert cfg.test_qos is None
                 cfg.test_qos = qos
-                msg_logger.debug(f"Calibration: {cfg.qos} (mean) -> {qos} (mean)")
+                msg_logger.debug(f"Test: {cfg.qos} (mean) -> {qos} (mean)")
             else:
                 assert cfg.validated_qos is None
                 cfg.validated_qos = qos
                 msg_logger.debug(f"Validation: {cfg.qos} (mean) -> {qos} (mean)")
             total_error += abs(cfg.qos - qos)
-            if qos > self.keep_threshold:
-                ret_configs.append(cfg)
-            else:
-                msg_logger.debug("Config removed")
         mean_err = total_error / len(configs)
-        if test_mode:
-            msg_logger.info("QoS mean abs difference of calibration: %f", mean_err)
-        else:
-            msg_logger.info("QoS mean abs difference of validation: %f", mean_err)
-        msg_logger.info("%d of %d configs remain", len(ret_configs), len(configs))
+        dataset_name = "test" if test_mode else "tune"
+        msg_logger.info(
+            "QoS changed by %f on %s dataset (mean abs diff)", mean_err, dataset_name
+        )
+
+    def _filter_configs(self, configs: List[ValConfig]):
+        ret_configs = [
+            cfg
+            for cfg in configs
+            # A None QoS means the corresponding phase was skipped; such configs are kept.
+            if (cfg.validated_qos is None or cfg.validated_qos >= self.tune_keep_threshold)
+            and (cfg.test_qos is None or cfg.test_qos >= self.test_keep_threshold)
+        ]
+        msg_logger.info(
+            "%d of %d configs remain after validation and testing",
+            len(ret_configs),
+            len(configs),
+        )
         return ret_configs
 
     def plot_configs(
-        self, show_qos_loss: bool = False, connect_best_points: bool = False
+        self,
+        show_qos_loss: bool = False,
+        connect_best_points: bool = False,
     ) -> plt.Figure:
+        """Plots 1 to 3 QoS-vs-speedup scatter plot of configurations.
+
+        All kept configurations and all "best" configurations (before test-set filtering if any)
+        are always plotted in the first subplot.
+
+        If there was a validation phase during tuning,
+        the second subplot contains the "best" configurations plotted twice,
+        with predicted and empirically measured QoS (on tune set) respectively.
+
+        If both validation and test-set filtering were used,
+        the last subplot contains the "best" configurations
+        with *empirically measured* tune-set and test-set QoS loss respectively.
+
+        :param show_qos_loss: If True, uses the loss of QoS (compared to the baseline)
+               instead of the absolute QoS in the first 2 graphs.
+               *This does not apply to the third graph* if it exists,
+               which always uses QoS loss for ease of comparison.
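+
+        For example (sketch; the output filename is illustrative)::
+
+            fig = tuner.plot_configs(show_qos_loss=True)
+            fig.savefig("configs.png")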
+        """
+
         if not self.tuned:
             raise RuntimeError(
                 f"No tuning session has been run; call self.tune() first."
             )
+        dot_format = "-o" if connect_best_points else "o"
+        # Without the `ax` argument, these functions only report whether the
+        # second/third plot can be drawn.
+        # If plot_test_phase returns True, plot_validation_phase also returns True.
+        val_phase = self.plot_validation_phase()
+        test_phase = self.plot_test_phase()
+        n_subplots = 1 + int(val_phase) + int(test_phase)
+        fig, axes = plt.subplots(
+            1, n_subplots, squeeze=False, figsize=(6 + 4 * n_subplots, 6), dpi=300
+        )
+
+        i = 1
+        self.plot_kept_and_best(axes[0, 0], show_qos_loss)
+        if val_phase:
+            ax = axes[0, i]
+            self.plot_validation_phase(ax, show_qos_loss, dot_format)
+            i += 1
+        if test_phase:
+            ax = axes[0, i]
+            tuneset_key = "validated_qos" if val_phase else "qos"
+            self.plot_test_phase(ax, dot_format, tuneset_key)
+            i += 1
+        fig.tight_layout()
+        return fig
 
-        # For empirical tuning there's no `validated_qos`.
-        # We first check, and if that's the case, we pass on to our parent class instead.
-        val_qos_nones = [conf.validated_qos is None for conf in self.kept_configs]
-        if any(val_qos_nones):
-            assert all(val_qos_nones)
-            return super().plot_configs(show_qos_loss, connect_best_points, False)
-
-        def get_points(confs, validated):
-            def qos_speedup(conf):
-                return conf.validated_qos if validated else conf.qos, conf.speedup
-
-            sorted_points = np.array(
-                sorted([qos_speedup(c) for c in confs], key=lambda p: p[0])
-            ).T
-            if show_qos_loss:
-                sorted_points[0] = self.baseline_qos - sorted_points[0]
-            return sorted_points
-
-        fig, ax = plt.subplots()
-        kept_confs = get_points(self.kept_configs, False)
-        best_confs = get_points(self.best_configs, False)
-        best_confs_val = get_points(self.best_configs, True)
-        ax.plot(kept_confs[0], kept_confs[1], "o", label="valid")
-        mode = "-o" if connect_best_points else "o"
-        ax.plot(best_confs[0], best_confs[1], mode, label="best")
-        mode = "-o" if connect_best_points else "o"
-        ax.plot(best_confs_val[0], best_confs_val[1], mode, label="best_validated")
-        ax.set_xlabel("QoS Loss" if show_qos_loss else "QoS")
+    def plot_validation_phase(
+        self, ax: plt.Axes = None, show_qos_loss: bool = False, dot_format: str = "o"
+    ):
+        configs = self.best_configs_prefilter
+        validated = [conf.validated_qos is not None for conf in configs]
+        can_plot = all(validated)
+        if not ax:
+            return can_plot
+        assert can_plot
+        pred_x, pred_y = self._config_qos_speedups(configs, "qos", show_qos_loss, False)
+        measured_x, measured_y = self._config_qos_speedups(
+            configs, "validated_qos", show_qos_loss, False
+        )
+        ax.plot(pred_x, pred_y, dot_format, label="Predicted QoS")
+        ax.plot(measured_x, measured_y, dot_format, label="Validated QoS")
+        self._set_xy_limit(ax, show_qos_loss)
+        if show_qos_loss:
+            ax.set_xlabel("QoS Loss (tune dataset)")
+            rthres = self.baseline_tune_qos - self.tune_keep_threshold
+            self._draw_qos_line(ax, rthres, f"Relative threshold: {rthres:.2f}")
+        else:
+            ax.set_xlabel("QoS (tune dataset)")
         ax.set_ylabel("Speedup (x)")
         ax.legend()
-        return fig
 
     @classmethod
     def _get_config_class(cls) -> Type[Config]:
diff --git a/predtuner/torchapp.py b/predtuner/torchapp.py
index 00a0376da0f7b24db45a615c13f8d0c2c740040c..3e0678d0cc5d66969bce31926fda4fd1a67bf4fa 100644
--- a/predtuner/torchapp.py
+++ b/predtuner/torchapp.py
@@ -10,9 +10,9 @@ from torch.utils.data.dataloader import DataLoader
 from ._logging import PathLike
 from .approxapp import ApproxKnob, KnobsT
 from .modeledapp import (
-    IPerfModel,
+    ICostModel,
     IQoSModel,
-    LinearPerfModel,
+    LinearCostModel,
     ModeledApp,
     QoSModelP1,
     QoSModelP2,
@@ -34,15 +34,27 @@ class TorchApproxKnob(ApproxKnob):
     @property
     @abc.abstractmethod
     def expected_speedup(self) -> float:
+        """The speedup this knob is expected to provide. Used for cost prediction."""
         pass
 
     @abc.abstractmethod
     def is_applicable(self, op: Module) -> bool:
+        """Returns True if this knob can be applied to this Module.
+        
+        :param op: the module to check availability for.
+        :type op: torch.nn.Module
+        :rtype: torch.nn.Module
+        """
         pass
 
     @abc.abstractmethod
     def apply(self, op: Module) -> Module:
-        """Applies knob to `module` and returns an approximated `module`."""
+        """Applies knob to a Module and returns an approximated Module.
+
+        :param op: the module to apply approximation on.
+        :type op: torch.nn.Module
+        :rtype: torch.nn.Module
+        """
         pass
 
 
@@ -53,40 +65,35 @@ class TorchApp(ModeledApp, abc.ABC):
     r"""Adaptor for approximable PyTorch Modules with tensor output.
 
     A TorchApp stores the PyTorch Module, datasets for tuning and calibration,
-    set of available TorchApproxKnob each of which may be applied to some layer in the Module,
+    a set of available `TorchApproxKnob` knobs, each of which may be applied to some layer in the Module,
     and the quality of service (QoS) metric of application (e.g., accuracy).
-    It provides empirical tuning and predictive tuning capability (see `TorchApp.tune()`),
-
-    Parameters
-    ----------
-    app_name:
-        Name of the application, which is used as an identifier in tuning sessions, etc.
-    module:
-        The PyTorch module to tune.
-    tune_dataloader:
-        A dataset to use as inputs to module during tuning. (PyTorch DataLoader is conceptually
-        an enumerable, batched dataset.)
-    test_dataloader:
-        A input dataset used for QoS testing (see `test_config` parameter of `ApproxModeledTuner.tune`).
-    knobs:
-        A set of knobs to be considered. Each knob has an `is_applicable()` method
-        which is used to determine which layer it can apply to.
-    tensor_to_qos:
-        QoS metric function which computes QoS from the module's output.
-    combine_qos:
-        A function to combine each batch's QoS into one value.
-        When QoS is accuracy this will most likely be `mean()` (which is the default).
-    device:
-        The device to store module and perform inference on. By default is "cuda"
-        if CUDA is available, otherwise "cpu".
-    model_storage_folder:
-        A folder to store the serialized QoS models into.
-        `QoSModelP1` will be serialized into `model_storage_folder / "p1.pkl"`,
-        and `QoSModelP2` into `model_storage_folder / "p2.json"`.
-        See `QoSModelP1` and `QoSModelP2` for details.
-
-    Attributes
-    ----------
+    It provides empirical tuning and predictive tuning capability,
+    automatically supporting `.modeledapp.LinearCostModel`,
+    `.modeledapp.QoSModelP1`, and `.modeledapp.QoSModelP2`.
+
+    In contrast to `.approxapp.ApproxApp` and `.modeledapp.ModeledApp`,
+    there should be no need to inherit from `TorchApp` in most use cases.
+
+    :param app_name: Name of the application, which is used as an identifier in tuning sessions, etc.
+    :param module: The PyTorch module to tune.
+    :param tune_dataloader: A `torch.utils.data.DataLoader` providing inputs to the module during tuning.
+    :param test_dataloader: A `torch.utils.data.DataLoader` used for QoS testing
+           (see `test_configs` parameter of `ApproxModeledTuner.tune`).
+    :param knobs: A set of `TorchApproxKnob` to be considered. Each knob has an `is_applicable()` method
+           which is used to determine which layer it can apply to.
+           `.approxes.get_knobs_from_file` returns a set of builtin knobs that will exactly fit here.
+    :param tensor_to_qos: QoS metric function which computes QoS from the module's output.
+           `.torchutil.accuracy` computes the classification accuracy which can be applied here.
+    :param combine_qos: A function to combine each batch's QoS into one value.
+           When QoS is Classification Accuracy, this will most likely be `numpy.mean`
+           (which is the default value).
+    :param target_device: The target device that this application should be tuned on.
+    :param torch_device: The PyTorch device where the model inference is run on.
+           This device should be able to run the implementations of the knobs
+           available for this app on `target_device`.
+    :param model_storage_folder: A folder to store the serialized QoS models into.
+           `QoSModelP1` will be serialized into ``model_storage_folder / "p1.pkl"``,
+           and `QoSModelP2` into ``model_storage_folder / "p2.json"``.
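+
+    A construction sketch (``module``, ``tune_loader`` and ``test_loader`` are
+    user-provided; `get_knobs_from_file` and `accuracy` are the helpers mentioned
+    above, assumed imported)::
+
+        app = TorchApp(
+            "myapp", module, tune_loader, test_loader,
+            knobs=get_knobs_from_file(),
+            tensor_to_qos=accuracy,
+        )
+        tuner = app.get_tuner()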
     """
 
     def __init__(
@@ -98,7 +105,7 @@ class TorchApp(ModeledApp, abc.ABC):
         knobs: Set[TorchApproxKnob],
         tensor_to_qos: Callable[[torch.Tensor, Any], float],
         combine_qos: Callable[[np.ndarray], float] = np.mean,
-        tuning_device: str = None,
+        target_device: str = None,
         torch_device: Union[torch.device, str] = _default_device,
         model_storage_folder: Optional[PathLike] = None,
     ) -> None:
@@ -107,7 +114,7 @@ class TorchApp(ModeledApp, abc.ABC):
         self.tune_loader = tune_dataloader
         self.test_loader = test_dataloader
         self.name_to_knob = {
-            k.name: k for k in self._check_and_filter_knob(knobs, tuning_device)
+            k.name: k for k in self._check_and_filter_knob(knobs, target_device)
         }
         self.tensor_to_qos = tensor_to_qos
         self.combine_qos = combine_qos
@@ -132,17 +139,17 @@ class TorchApp(ModeledApp, abc.ABC):
             self._op_costs[op_name] = summary.loc[op_name, "flops"]
 
         # Init parent class last
-        super().__init__(op_knobs, tuning_device)
+        super().__init__(op_knobs, target_device)
 
     @property
     def name(self) -> str:
         """Returns the name of application."""
         return self.app_name
 
-    def get_models(self) -> List[Union[IPerfModel, IQoSModel]]:
+    def get_models(self) -> List[Union[ICostModel, IQoSModel]]:
         """Returns a list of predictive tuning models.
 
-        TorchApp in particular derives 1 performance model (LinearPerfModel)
+        TorchApp in particular derives 1 cost model (LinearCostModel)
         and 2 QoS models (QoSModelP1, QoSModelP2) automatically.
         """
 
@@ -162,7 +169,7 @@ class TorchApp(ModeledApp, abc.ABC):
         p1_storage = self.model_storage / "p1.pkl" if self.model_storage else None
         p2_storage = self.model_storage / "p2.json" if self.model_storage else None
         return [
-            LinearPerfModel(self, self._op_costs, self._knob_speedups),
+            LinearCostModel(self, self._op_costs, self._knob_speedups),
             QoSModelP1(
                 self, self._get_raw_output_valset, batched_valset_qos, p1_storage
             ),
diff --git a/predtuner/torchutil/common_qos.py b/predtuner/torchutil/common_qos.py
index a6a19f6625453bcd381cdd392f1c8a0206c206f8..b9d8dbf5a0945f274a1fa4b50724fcde5887d45a 100644
--- a/predtuner/torchutil/common_qos.py
+++ b/predtuner/torchutil/common_qos.py
@@ -2,6 +2,12 @@ from torch import Tensor
 
 
 def accuracy(output: Tensor, target: Tensor) -> float:
+    """The "classification accuracy" metric (return value is between 0 and 100).
+
+    :param output: A 2d tensor of per-class scores (e.g., probabilities) output from the model.
+    :param target: A 1d-tensor of labels, one for each input image, from the dataset.
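+
+    For example::
+
+        >>> import torch
+        >>> accuracy(torch.tensor([[0.1, 0.9], [0.8, 0.2]]), torch.tensor([1, 1]))
+        50.0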
+    """
+
     _, pred_labels = output.max(1)
     n_correct = (pred_labels == target).sum().item()
     return n_correct / len(output) * 100
diff --git a/setup.py b/setup.py
index 3377b9ce5084c3f82ba1d8bf28903eb462ec44ae..512eb5a5f75141eba5ea7a0e057c65771fd741ec 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
 import setuptools
 
-with open("README.md", "r", encoding="utf-8") as fh:
+with open("README.rst", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 
 setuptools.setup(
@@ -10,14 +10,18 @@ setuptools.setup(
     author_email="yifanz16@illinois.edu",
     description="A package for predictive and empirical approximation autotuning",
     long_description=long_description,
-    long_description_content_type="text/markdown",
+    long_description_content_type="text/x-rst",
     url="https://github.com/Evan-Zhao/predictive-tuner",
-    packages=["predtuner"],
+    packages=setuptools.find_packages(),
+    package_data={
+        "predtuner.approxes": ["default_approx_params.json"]
+    },
+    include_package_data=True,
     install_requires=[
         "matplotlib>=3.3",
         "networkx>=2.5",
-        "torch==1.7.1",
-        "torchvision==0.8.2",
+        "torch>=1.5.1",
+        "torchvision>=0.6",
         "tqdm>=4.50",
         "pandas>=1.1",
         "jsonpickle>=1.5",
diff --git a/test/test_torchapp.py b/test/test_torchapp.py
index f43fdb45e5b574c00cdd748fe506ee50e2c1a5b9..e4b7359fbd213635088559c966cacd634785d8e9 100644
--- a/test/test_torchapp.py
+++ b/test/test_torchapp.py
@@ -41,7 +41,7 @@ class TestTorchAppTuning(TorchAppSetUp):
         self.assertEqual(self.app.baseline_knob.name, "11")
 
     def test_cpu_knobs(self):
-        app = TorchApp(**self.app_args, tuning_device="cpu")
+        app = TorchApp(**self.app_args, target_device="cpu")
         n_knobs = {op: len(ks) for op, ks in app.op_knobs.items()}
         for op_name, op in app.midx.name_to_module.items():
             nknob = 28 if isinstance(op, Conv2d) else 1
@@ -49,7 +49,7 @@ class TestTorchAppTuning(TorchAppSetUp):
         self.assertEqual(app.baseline_knob.name, "11")
 
     def test_gpu_knobs(self):
-        app = TorchApp(**self.app_args, tuning_device="gpu")
+        app = TorchApp(**self.app_args, target_device="gpu")
         n_knobs = {op: len(ks) for op, ks in app.op_knobs.items()}
         for op_name, op in app.midx.name_to_module.items():
             nknob = 28 if isinstance(op, Conv2d) else 1