diff --git a/README.md b/README.md
index 0acb8d03b59c197e4e5987fc7fd10ad31f3b25ba..d29e34bdac37cea6c3e4ca16cb7413f908a2b38f 100755
--- a/README.md
+++ b/README.md
@@ -163,7 +163,6 @@ $ source env/bin/activate
 ```
 
 ### Install the package
 
-If you do not use CUDA 9 in your environment, please refer to [Pytorch website](https://pytorch.org/get-started/locally/) to install the compatible build of Pytorch 1.1 and torchvision 0.3, before installing the package.
 Finally, install the Distiller package and its dependencies using ```pip3```:
 ```
@@ -172,7 +171,11 @@ $ pip3 install -e .
 ```
 This installs Distiller in "development mode", meaning any changes made in the code are reflected in the environment without re-running the install command (so no need to re-install after pulling changes from the Git repository).
 
-PyTorch is included in the ```requirements.txt``` file, and will currently download PyTorch version 1.1.0 for CUDA 9.0. This is the setup we've used for testing Distiller.
+### Required PyTorch Version
+
+Distiller is tested with the default installation of PyTorch 1.3.1, which uses CUDA 10.1, and with TorchVision 0.4.2. Both are pinned in Distiller's `requirements.txt` and are installed automatically when you install the Distiller package as described above.
+
+If you do not use CUDA 10.1 in your environment, please refer to the [PyTorch website](https://pytorch.org/get-started/locally/) to install compatible builds of PyTorch 1.3.1 and TorchVision 0.4.2.
 
 ## Getting Started
 
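A quick way to verify that an environment matches the configuration described in the README change above is to check the installed packages at runtime. This is a minimal sketch, assuming the default (CUDA 10.1) builds pinned in `requirements.txt`:

```
import torch
import torchvision

# Expect the versions pinned in requirements.txt: torch 1.3.1 and torchvision 0.4.2.
assert torch.__version__.startswith('1.3.1'), torch.__version__
assert torchvision.__version__.startswith('0.4.2'), torchvision.__version__

# '10.1' for the default build; None on CPU-only builds.
print(torch.version.cuda)
```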
diff --git a/distiller/summary_graph.py b/distiller/summary_graph.py
index 6f6431e804803201a4b3b21539a2a0dc0f3e83c7..292065f217f02ed4691c6d370f2bc9c3715356dc 100755
--- a/distiller/summary_graph.py
+++ b/distiller/summary_graph.py
@@ -89,21 +89,12 @@ class SummaryGraph(object):
         self.dummy_input = dummy_input
         trace, _ = jit.get_trace_graph(model_clone, dummy_input, _force_outplace=True)
 
-        # As of PyTorch 1.1.0, ONNX trace optimization has two issues that result in incorrect scope names
+        # As of PyTorch 1.3.0, ONNX trace optimization has an issue that results in incorrect scope names
         # of nodes in the trace graph.
         # These can make it impossible, in some cases, to derive the connectivity of the model using the original
         # module names. So we try to detect these cases and apply workarounds
-        # Issue #1:
-        #   Gemm ops (aka "Linear" / "addmm" / "FC") get the scope name of the last non-Gemm node
-        #   that came before them.
-        #   Note that if the node prior to the Gemm node isn't the result of a dedicated module call,
-        #   then this issue doesn't occur. For simplicity we just track all Gemms.
-        # TODO: This should be fixed in PyTorch 1.2.0, revisit when it's released
-        aten_addmm_nodes_scope_names = []
-        onnx_gemm_count = 0
-
-        # Issue #2:
+        # The issue is:
         #   Dropout ops are removed by ONNX trace optimization. However, the op BEFORE the original dropout op
         #   gets the scope name of the dropout op
         pre_dropout_nodes_scope_names = OrderedDict()
 
@@ -118,8 +109,6 @@ class SummaryGraph(object):
                     pre_dropout_nodes_scope_names[node.scopeName()] = prev_non_dropout_op.scopeName()
                 else:
                     prev_non_dropout_op = node
-                if kind == 'aten::addmm':
-                    aten_addmm_nodes_scope_names.append(node.scopeName())
 
         # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
         # composing a GEMM operation; etc.
@@ -140,12 +129,6 @@ class SummaryGraph(object):
                 new_op = self.__create_op(node)
 
                 if apply_scope_name_workarounds:
-                    # Here we apply the workaround to the Gemm nodes scope name issue mentioned above
-                    if new_op['type'] == 'Gemm':
-                        new_op['orig-name'] = aten_addmm_nodes_scope_names[onnx_gemm_count]
-                        new_op['name'] = new_op['orig-name']
-                        onnx_gemm_count += 1
-
                     # Here we apply the workaround to the issue of dropout op scope name overriding previous op's
                     # scope name
                     if new_op['name'] in pre_dropout_nodes_scope_names:
@@ -202,11 +185,11 @@ class SummaryGraph(object):
 
             for input_ in node.inputs():
                 self.__add_input(new_op, input_)
-                self.edges.append(SummaryGraph.Edge(input_.uniqueName(), new_op['name']))
+                self.edges.append(SummaryGraph.Edge(input_.debugName(), new_op['name']))
 
             for output in node.outputs():
                 self.__add_output(new_op, output)
-                self.edges.append(SummaryGraph.Edge(new_op['name'], output.uniqueName()))
+                self.edges.append(SummaryGraph.Edge(new_op['name'], output.debugName()))
 
             new_op['attrs'] = OrderedDict([(attr_name, node[attr_name]) for attr_name in node.attributeNames()])
 
@@ -273,16 +256,16 @@ class SummaryGraph(object):
             op['outputs'].append(param['id'])
 
     def __add_param(self, n):
-        if n.uniqueName() not in self.params:
+        if n.debugName() not in self.params:
             param = self.__tensor_desc(n)
-            self.params[n.uniqueName()] = param
+            self.params[n.debugName()] = param
         else:
-            param = self.params[n.uniqueName()]
+            param = self.params[n.debugName()]
         return param
 
     def __tensor_desc(self, n):
         tensor = OrderedDict()
-        tensor['id'] = n.uniqueName()
+        tensor['id'] = n.debugName()
         try:
             # try parsing the FM tensor type. For example: Float(1, 64, 8, 8)
             s = str(n.node())
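The `uniqueName()` to `debugName()` replacements above track a rename in the JIT `Value` API that landed in PyTorch 1.2. A minimal sketch of where these names come from, using the same internal `jit.get_trace_graph` entry point as `SummaryGraph` (this is an internal API, later renamed `_get_trace_graph`, so treat the sketch as illustrative for PyTorch 1.3 only):

```
import torch
import torch.jit as jit
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU())
dummy_input = torch.randn(1, 3, 32, 32)

# Trace the model the same way SummaryGraph does.
trace, _ = jit.get_trace_graph(model, dummy_input, _force_outplace=True)

for node in trace.graph().nodes():
    for output in node.outputs():
        # Value.uniqueName() in PyTorch <= 1.1; Value.debugName() from 1.2 on.
        print(node.kind(), '->', output.debugName())
```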
diff --git a/requirements.txt b/requirements.txt
index b6c115a4c037ddf8a468d57f1aad2f8de8734954..f564616f3d50ba64a21b36fdb00f88c60b80a243 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-torch==1.1.0
+torch==1.3.1
 numpy>=1.16
-torchvision==0.3.0
+torchvision==0.4.2
 scipy>=1.3.0
 gitpython==2.1.11
 torchnet==0.0.4
diff --git a/tests/full_flow_tests.py b/tests/full_flow_tests.py
index 407fe2074cab943685a283802746e4ccf23c9eba..231a7b7a916d36e787c34fe5c2c878727d8ab2e6 100755
--- a/tests/full_flow_tests.py
+++ b/tests/full_flow_tests.py
@@ -152,23 +152,23 @@ TestConfig = namedtuple('TestConfig', ['args', 'dataset', 'checker_fn', 'checker
 
 test_configs = [
     TestConfig('--arch resnet20_cifar_earlyexit --lr=0.3 --epochs=180 --earlyexit_thresholds 0.9 '
                '--earlyexit_lossweights 0.3 --epochs 2 -p 50', DS_CIFAR, earlyexit_accuracy_checker,
-               [(99.115, 100.),
-                (17.235, 64.914),
-                (18.160, 65.310),
-                (17.04, 64.42)]),
-    TestConfig('--arch simplenet_cifar --epochs 2', DS_CIFAR, accuracy_checker, [44.460, 91.230]),
+               [(99.675, 100.),
+                (31.863, 84.985),
+                (36.040, 85.910),
+                (33.50, 85.980)]),
+    TestConfig('--arch simplenet_cifar --epochs 2', DS_CIFAR, accuracy_checker, [47.43, 92.51]),
     TestConfig('-a resnet20_cifar --resume {0} --quantize-eval --evaluate --qe-dynamic --qe-clip-acts avg --qe-no-clip-layers {1}'.
                format(os.path.join(examples_root, 'ssl', 'checkpoints', 'checkpoint_trained_dense.pth.tar'), 'fc'),
-               DS_CIFAR, accuracy_checker, [91.57, 99.62]),
+               DS_CIFAR, accuracy_checker, [91.47, 99.62]),
     TestConfig('-a preact_resnet20_cifar --epochs 2 --compress {0}'.
                format(os.path.join('full_flow_tests', 'preact_resnet20_cifar_pact_test.yaml')),
-               DS_CIFAR, accuracy_checker, [44.370, 89.640]),
+               DS_CIFAR, accuracy_checker, [47.47, 93.87]),
     TestConfig('-a resnet20_cifar --resume {0} --sense=filter --sense-range 0 0.10 0.05'.
                format(os.path.join(examples_root, 'ssl', 'checkpoints', 'checkpoint_trained_dense.pth.tar')),
-               DS_CIFAR, collateral_checker, [('sensitivity.csv', 3175), ('sensitivity.png', 96157)]),
+               DS_CIFAR, collateral_checker, [('sensitivity.csv', 3172), ('sensitivity.png', 96157)]),
     TestConfig('--arch simplenet_mnist --epochs 3 -p=50 --compress={0}'.
                format(os.path.join('full_flow_tests', 'simplenet_mnist_pruning.yaml')),
-               DS_MNIST, accuracy_checker, [98.78, 100.]),
+               DS_MNIST, accuracy_checker, [98.82, 100.00]),
 ]
 
diff --git a/tests/test_learning_rate.py b/tests/test_learning_rate.py
index dd239147be3ba3fa38a93daa4d2842cea4775d1d..f13950f157b57a5b0e142a2855378a344475d7a8 100644
--- a/tests/test_learning_rate.py
+++ b/tests/test_learning_rate.py
@@ -20,6 +20,7 @@ from torch.optim import Optimizer
 from distiller.learning_rate import MultiStepMultiGammaLR
 
 
+@pytest.mark.filterwarnings('ignore:Detected call of')
 def test_multi_step_multi_gamma_lr():
     dummy_tensor = torch.zeros(3, 3, 3, requires_grad=True)
     dummy_optimizer = Optimizer([dummy_tensor], {'lr': 0.1})
diff --git a/tests/test_post_train_quant.py b/tests/test_post_train_quant.py
index 4718b15d4d74c2734b6abc685fa9a09e0b93c87e..a30524f53604889abc34fa7a9be793ce81b60fa8 100644
--- a/tests/test_post_train_quant.py
+++ b/tests/test_post_train_quant.py
@@ -727,6 +727,9 @@ class DummyWordLangModel(nn.Module):
         return self.rnn(self.embedding(x))
 
 
+# Same warning filters as in test_override_no_clip
+@pytest.mark.filterwarnings('ignore:Iterating over a tensor might cause the trace to be incorrect')
+@pytest.mark.filterwarnings('ignore:Converting a tensor to a Python index might cause the trace to be incorrect')
 def test_acts_quant_params_rnn(rnn_model):
     model = DummyWordLangModel(nn.Embedding(41, 20), rnn_model).cuda()
     stats = gen_stats_for_model(model)
diff --git a/tests/test_sim_bn_fold.py b/tests/test_sim_bn_fold.py
index 99fa241a61236c8a62a8ff4501e3d8d89b54f903..9557159e36ba7aa876a953b25a2c332337a88db4 100644
--- a/tests/test_sim_bn_fold.py
+++ b/tests/test_sim_bn_fold.py
@@ -6,7 +6,7 @@ from distiller.quantization.sim_bn_fold import SimulatedFoldedBatchNorm
 from copy import deepcopy
 import pytest
 
-ATOL = 5e-5
+ATOL = 2e-4
 RTOL = 1e-3
 BATCH_SIZE = 32
 LR = 1e-3
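The `filterwarnings` marks added in the test files above use pytest's standard warning-filter syntax: the string after `ignore:` is a regex matched against the beginning of the warning message, so a short prefix is enough to silence a verbose warning that PyTorch 1.3 started emitting. A small self-contained illustration (the warning text here mimics PyTorch's scheduler warning; it is not Distiller code):

```
import warnings

import pytest


# Suppress any warning whose message starts with the given prefix.
@pytest.mark.filterwarnings('ignore:Detected call of')
def test_prefix_filter_suppresses_warning():
    warnings.warn('Detected call of `lr_scheduler.step()` before `optimizer.step()`.')
```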
diff --git a/tests/test_summarygraph.py b/tests/test_summarygraph.py
index 98feba80f7aefed82e3bd3e101e5266a44455d91..0d995c3a505b8a5710bb01958ee54b39a61937ed 100755
--- a/tests/test_summarygraph.py
+++ b/tests/test_summarygraph.py
@@ -72,7 +72,7 @@ def test_connectivity(parallel, denorm_names):
     assert len(op_names) == 80
 
     edges = g.edges
-    assert edges[0].src == '0' and edges[0].dst == 'conv1'
+    assert edges[0].src == 'input.1' and edges[0].dst == 'conv1'
 
     # Test two sequential calls to predecessors (this was a bug once)
     preds = g.predecessors(g.find_op('bn1'), 1, denorm_names=denorm_names)
@@ -90,7 +90,7 @@ def test_connectivity(parallel, denorm_names):
     preds = g.predecessors(g.find_op('bn1'), 10, denorm_names=denorm_names)
     assert preds == []
     preds = g.predecessors(g.find_op('bn1'), 3, denorm_names=denorm_names)
-    assert preds == ['0', '1']
+    assert preds == ['input.1', '1']
 
 
 def test_layer_search(parallel, denorm_names):
@@ -282,31 +282,22 @@ def test_scope_name_workarounds():
 
     dummy_input = distiller.get_dummy_input(input_shape=(1, 100))
     expected_types = ('Gemm', 'Relu', 'Gemm', 'Relu', 'Gemm')
 
-    # We have workarounds for 2 issues:
-    #   1. GEMM ops get the scope name of the op that came before them
-    #   2. Ops that come before a dropout op get the scope name of the dropout op
-    # If both conditions apply, empirically that #2 is the issue that manifests
+    # We have a workaround for the following issue:
+    # (there used to be two issues, but one was fixed in PyTorch 1.2)
+    #   * Ops that come before a dropout op get the scope name of the dropout op
     # For the model above we expect the ops in the graph to be named (in order):
     #   'fc1', 'relu1', 'fc2', 'relu2', 'fc3'
     # (note that dropout ops are dropped)
     #
-    # But without our workarounds in place, we'll get:
-    #   'drop1', 'drop2', 'drop2__1', 'relu2', 'drop3'
-    #
-    # What happens is:
-    #   * 'fc1' - issue #1 applies, so 'fc1' --> 'drop1'
-    #   * 'relu1' - issue #2 applies, so 'relu1' --> 'drop2'
-    #   * 'fc2' - issue #1 applies, so 'fc1' --> 'drop2__1' ('__1' suffix because 'drop2' already exists)
-    #   * 'relu2' should be ok as-is
-    #   * 'fc3' is susceptible to both issues - it's a GEMM op AND it comes before a dropout. As mentioned above,
-    #     issue #2 "wins", so 'fc3' --> 'drop3'
+    # But since 'relu1' and 'fc3' each come before a dropout op, without the workaround in place we'll get:
+    #   'fc1', 'drop2', 'fc2', 'relu2', 'drop3'
 
     # We test without the workarounds as a means to see if the issues still exist. New PyTorch versions
     # may fix them, in which case we can remove the workarounds
     sg = SummaryGraph(m, dummy_input, apply_scope_name_workarounds=False)
     names, types = zip(*[(op_name, op['type']) for op_name, op in sg.ops.items()])
-    assert names == ('drop1', 'drop2', 'drop2_Gemm_1', 'relu2', 'drop3')
+    assert names == ('fc1', 'drop2', 'fc2', 'relu2', 'drop3')
     assert types == expected_types
 
     # Now test with the workarounds
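Finally, to make the scope-name workaround concrete: a sketch of exercising `SummaryGraph` with and without it, using a hypothetical model in the spirit of the one in `test_scope_name_workarounds` (the class and layer names below are illustrative, not taken from the test file):

```
import torch.nn as nn

import distiller
from distiller.summary_graph import SummaryGraph


class ModelWithDropouts(nn.Module):
    """Hypothetical Linear/ReLU/Dropout chain; 'relu1' comes before a dropout,
    so its scope name is the one ONNX trace optimization would clobber."""
    def __init__(self):
        super(ModelWithDropouts, self).__init__()
        self.fc1 = nn.Linear(100, 50)
        self.relu1 = nn.ReLU()
        self.drop2 = nn.Dropout()
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        return self.fc2(self.drop2(self.relu1(self.fc1(x))))


m = ModelWithDropouts()
dummy_input = distiller.get_dummy_input(input_shape=(1, 100))

# Default: workaround applied, so ops keep their module scope names ('fc1', 'relu1', ...).
sg = SummaryGraph(m, dummy_input)

# Without the workaround, 'relu1' would surface under the dropout's scope name.
sg_raw = SummaryGraph(m, dummy_input, apply_scope_name_workarounds=False)

print(list(sg.ops.keys()))
print(list(sg_raw.ops.keys()))
```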