diff --git a/doc/getting_started.rst b/doc/getting_started.rst
index 75381aa720fdee717d20db115ecbc74e73deff4c..867707bf1fdcb9178f0320e757b3e8a3ca09b1e7 100644
--- a/doc/getting_started.rst
+++ b/doc/getting_started.rst
@@ -1,2 +1,133 @@
 Getting Started
 ===================
+
+This guide will help you get started with PredTuner.
+
+Installation
+------------
+
+Install PredTuner from source using :code:`pip`:
+
+.. code-block:: shell
+
+   pip install -e .
+
+PredTuner will also be available on PyPI once we publish the first release.
+
+Tuning a PyTorch DNN
+--------------------
+
+PredTuner can tune any user-defined application,
+but it is optimized for tuning DNN applications defined in PyTorch.
+
+We will use models predefined in PredTuner for demonstration purposes.
+Download the pretrained VGG16 model parameters and the CIFAR10 dataset from `here
+<https://drive.google.com/file/d/1Z84z-nsv_nbrr8t9i28UoxSJg-Sd_Ddu/view?usp=sharing>`_.
+After extraction, there should be a :code:`model_params/` folder in the current directory.
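+
+For instance, assuming the download is a zip archive (a hypothetical filename is used below;
+adjust it to match the file you actually downloaded), extraction can look like:
+
+.. code-block:: shell
+
+   unzip vgg16_cifar10.zip  # should produce a model_params/ folder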
+
+Load the tuning and test subsets of the CIFAR10 dataset, and create a pretrained VGG16 model:
+
+.. code-block:: python
+
+  from pathlib import Path
+
+  import torch
+  from torch.utils.data import DataLoader
+
+  import predtuner as pt
+  from predtuner.model_zoo import CIFAR, VGG16Cifar10
+
+  prefix = Path("model_params/vgg16_cifar10")
+  tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
+  tune_loader = DataLoader(tune_set, batch_size=500)
+  test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
+  test_loader = DataLoader(test_set, batch_size=500)
+
+  module = VGG16Cifar10()
+  module.load_state_dict(torch.load("model_params/vgg16_cifar10.pth.tar"))
+
+PredTuner provides a logging mechanism.
+While not required, it is recommended that you set up the logger to write its output to a file:
+
+.. code-block:: python
+
+  msg_logger = pt.config_pylogger(output_dir="vgg16_cifar10/", verbose=True)
+
+Each tuning task requires both a tuning dataset and a test dataset.
+The tuning dataset is used to evaluate the accuracy of the application during the autotuning stage,
+while the test dataset is used to evaluate the configurations found by autotuning.
+This is similar to the split between training and validation sets in machine learning tasks.
+In this case, the tuning and test datasets each contain 5000 images.
+
+Create an instance of :code:`TorchApp` for tuning a PyTorch DNN:
+
+.. code-block:: python
+
+  app = pt.TorchApp(
+    "TestTorchApp",  # Application name -- can be anything
+    module,
+    tune_loader,
+    test_loader,
+    knobs=pt.get_knobs_from_file(),
+    tensor_to_qos=pt.accuracy,
+    model_storage_folder="vgg16_cifar10/",
+  )
+
+PredTuner provides :code:`TorchApp`, which is specialized for tuning PyTorch DNNs.
+In addition, two other PredTuner functions are used here:
+
+:code:`pt.accuracy` is the *classification accuracy* metric.
+It receives the probability distribution output from the VGG16 model,
+compares it to the ground truth in the dataset,
+and returns a scalar between 0 and 100 giving the classification accuracy.
+
+:code:`pt.get_knobs_from_file()` returns a set of approximations preloaded in PredTuner,
+which are applied to :code:`torch.nn.Conv2d` layers.
+See ??? for a list of these approximations and how to define your own.
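+
+As a quick sanity check, you can measure the baseline QoS (accuracy) of the application
+before tuning, as done in the bundled example script, using :code:`measure_qos_cost`;
+an empty configuration :code:`{}` means no approximation is applied:
+
+.. code-block:: python
+
+  # Baseline accuracy of the unapproximated model -- {} means no approximation
+  baseline_acc, _ = app.measure_qos_cost({}, False)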
+
+Now we can obtain a tuner object from the application and start tuning.
+We will keep configurations that don't exceed 3% loss of accuracy,
+but encourage the tuner to find configurations with loss of accuracy below 2.1%.
+
+.. code-block:: python
+
+  tuner = app.get_tuner()
+  tuner.tune(
+      max_iter=100,
+      qos_tuner_threshold=2.1,  # QoS threshold that guides the tuner's search
+      qos_keep_threshold=3.0,  # QoS threshold for actually keeping configurations
+      is_threshold_relative=True,  # Thresholds are relative to baseline (baseline_acc - 2.1)
+      cost_model="cost_linear",  # Use linear cost (performance) predictor
+  )
+
+:code:`max_iter` defines the number of autotuning iterations.
+100 iterations is only for demonstration purposes; in practice,
+at least 10000 iterations are needed for VGG16-sized models to converge to a good set of configurations.
+
+Saving Tuning Results
+---------------------
+
+Now that the :code:`tuner` object holds the tuning results,
+we can export them into a JSON file
+and visualize all configurations in a figure:
+
+.. code-block:: python
+
+  tuner.dump_configs("vgg16_cifar10/configs.json", best_only=False)
+  fig = tuner.plot_configs(show_qos_loss=True)
+  fig.savefig("vgg16_cifar10/configs.png")
+
+PredTuner also automatically marks out `Pareto-optimal
+<https://en.wikipedia.org/wiki/Pareto_efficiency>`_
+configurations.
+These are called "best" configurations (:code:`tuner.best_configs`),
+in contrast to "valid" configurations, which are those that satisfy our accuracy requirement
+(:code:`tuner.kept_configs`).
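+
+For example, a minimal sketch for comparing the two sets
+(assuming both attributes are list-like, which matches their use above):
+
+.. code-block:: python
+
+  # Count configurations that meet the QoS threshold vs. Pareto-optimal ones
+  print(f"{len(tuner.kept_configs)} valid configs, {len(tuner.best_configs)} best configs")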
+
+Within 100 iterations, PredTuner should find 30 to 50 valid configurations.
+The generated figure should look like this:
+
+.. image:: tuning_result.png
+
+
+Loading Tuning Results
+----------------------
+
+TODO: TODO
+
diff --git a/doc/index.rst b/doc/index.rst
index 6866a98b3d779063329219cd2c49f8b2c6feb41d..b79d28a42bd60549b7e2921b29854948e0aece48 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -9,27 +9,21 @@ PredTuner performs autotuning on approximation choices for a program
 using an error-predictive proxy instead of executing the program,
 to greatly speedup autotuning while getting results of comparable quality.
 
-PredTuner is a contribution of [ApproxTuner]
-(https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations).
-
-Short-term Goals
-- Measure accuracy impact of approximations
-- Obtain a tuned, approximated CNN in <5 lines of code
-- Easy to manage multiple approximation configs
-- Easy to load and manage prior tuning results
-- Flexible retraining support
-Possible Long-term Goals
-- High-performance implementations of approximate layers
-- Allow users to register their own approximations
-- Support for other frameworks: TF, ONNX, JAX
+PredTuner is a main component of `ApproxTuner
+<https://ppopp21.sigplan.org/details/PPoPP-2021-main-conference/41/ApproxTuner-A-Compiler-and-Runtime-System-for-Adaptive-Approximations>`_.
 
-Documentation
--------------
 
-.. only:: html
+Solution for Efficient Approximation Autotuning
+-----------------------------------------------
+
+- Start a tuning session in 10 lines of code
+- Deep integration with PyTorch for DNN support
+- Multiple levels of APIs for generality and ease-of-use
+- Effective accuracy prediction models
+- Easily store and visualize tuning results in many formats
 
-    :Release: |version|
-    :Date: |today|
+Documentation
+-------------
 
 .. toctree::
    :maxdepth: 1
@@ -41,6 +35,3 @@ Indices and tables
 ------------------
 
 * :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-* :ref:`glossary`
diff --git a/doc/tuning_result.png b/doc/tuning_result.png
new file mode 100644
index 0000000000000000000000000000000000000000..d3854041d97ab51be43aa08e563f1850bba19f48
Binary files /dev/null and b/doc/tuning_result.png differ
diff --git a/examples/tune_vgg16_cifar10.py b/examples/tune_vgg16_cifar10.py
index eca7082b92b6f80ea674c5bffec855d37a35f9cf..9530bc0c66a38d2607a120dab7352d0988a669ce 100644
--- a/examples/tune_vgg16_cifar10.py
+++ b/examples/tune_vgg16_cifar10.py
@@ -17,9 +17,9 @@ msg_logger = config_pylogger(output_dir="/tmp", verbose=True)
 # TODO: you should use all (5000) images for actual tuning.
 prefix = Path("model_params/vgg16_cifar10")
 tune_set = CIFAR.from_file(prefix / "tune_input.bin", prefix / "tune_labels.bin")
-tune_loader = DataLoader(Subset(tune_set, range(500)), batch_size=500)
+tune_loader = DataLoader(tune_set, batch_size=500)
 test_set = CIFAR.from_file(prefix / "test_input.bin", prefix / "test_labels.bin")
-test_loader = DataLoader(Subset(test_set, range(500)), batch_size=500)
+test_loader = DataLoader(test_set, batch_size=500)
 
 # Load checkpoint for VGG16 (CIFAR10)
 module = VGG16Cifar10()
@@ -38,17 +38,17 @@ app = TorchApp(
     model_storage_folder="tuner_results/vgg16_cifar10",
 )
 # This is how to measure baseline accuracy -- {} means no approximation
-baseline, _ = app.measure_qos_perf({}, False)
+baseline, _ = app.measure_qos_cost({}, False)
 # Get a tuner object and start tuning!
 tuner = app.get_tuner()
 tuner.tune(
-    max_iter=100,  # TODO: In practice, use at least 5000, or 10000
+    max_iter=500,  # TODO: In practice, use at least 5000, or 10000
     qos_tuner_threshold=2.1,  # QoS threshold to guide tuner into
     qos_keep_threshold=3.0,  # QoS threshold for which we actually keep the thresholds
     is_threshold_relative=True,  # Thresholds are relative to baseline -- baseline_acc - 2.1
-    take_best_n=50,  # Take 50 "best" configs
-    perf_model="perf_linear",  # Use linear performance predictor
-    qos_model="qos_p1",  # Use P1 QoS predictor
+    cost_model="cost_linear",  # Use linear cost (performance) predictor
 )
 # Save configs here when you're done
 tuner.dump_configs("tuner_results/vgg16_cifar10_configs.json")
+fig = tuner.plot_configs(show_qos_loss=True)
+fig.savefig("tuner_results/vgg16_cifar10_configs.png")
\ No newline at end of file
diff --git a/predtuner/approxapp.py b/predtuner/approxapp.py
index a6513cf19dea61fa17d6e541d8fa56b568b1c336..b34b1a1aa13b83c610b86506be859d9c6294e9a5 100644
--- a/predtuner/approxapp.py
+++ b/predtuner/approxapp.py
@@ -222,7 +222,7 @@ class ApproxTuner(Generic[T]):
         taken_idx = is_pareto_efficient(points, take_n=n)
         return [configs[i] for i in taken_idx]
 
-    def dump_configs(self, filepath: PathLike):
+    def dump_configs(self, filepath: PathLike, best_only: bool = True):
         import os
 
         from jsonpickle import encode
@@ -233,8 +233,9 @@ class ApproxTuner(Generic[T]):
             )
         filepath = Path(filepath)
         os.makedirs(filepath.parent, exist_ok=True)
+        confs = self.best_configs if best_only else self.kept_configs
         with filepath.open("w") as f:
-            f.write(encode(self.best_configs, indent=2))
+            f.write(encode(confs, indent=2))
 
     def plot_configs(
         self, show_qos_loss: bool = False, connect_best_points: bool = False