From fdb12eb16100f4964eb9b6afd4b42ed08c29e313 Mon Sep 17 00:00:00 2001
From: Guy Jacob <guy.jacob@intel.com>
Date: Thu, 31 Oct 2019 14:18:58 +0200
Subject: [PATCH] Re-enable NLP quant notebooks after following #402 (#412)

* Add blacklist to quantizer. In PTQ put Dropout on the blacklist.
* Update notebooks to use 2-phase stats collection
* Other small fixes
---
 distiller/quantization/quantizer.py           |   4 +
 distiller/quantization/range_linear.py        |   1 +
 ...tats.yaml => acts_quantization_stats.yaml} | 337 +++++++++++++++++-
 examples/GNMT/quantize_gnmt.ipynb             | 321 ++++++++++-------
 ...tats.yaml => acts_quantization_stats.yaml} |   0
 .../word_language_model/quantize_lstm.ipynb   | 148 +++++---
 6 files changed, 631 insertions(+), 180 deletions(-)
 rename examples/GNMT/{model_stats.yaml => acts_quantization_stats.yaml} (89%)
 rename examples/word_language_model/{manual_lstm_pretrained_stats.yaml => acts_quantization_stats.yaml} (100%)

diff --git a/distiller/quantization/quantizer.py b/distiller/quantization/quantizer.py
index 84d0a24..6f4943c 100644
--- a/distiller/quantization/quantizer.py
+++ b/distiller/quantization/quantizer.py
@@ -177,6 +177,7 @@ class Quantizer(object):
         # Unspecified layer types return None by default.
         self.replacement_factory = OrderedDict([(nn.Identity, None)])
         self.default_repalcement_fn = None
+        self.replacement_blacklist = []
         # Pointer to parameters quantization function, triggered during training process
         # To be populated by child classes
         self.param_quantization_fn = None
@@ -276,6 +277,9 @@ class Quantizer(object):
         # Iterate through model, insert quantization functions as appropriate
         for name, module in container.named_children():
             full_name = prefix + name
+            if isinstance(module, tuple(self.replacement_blacklist)):
+                replace_msg(full_name)
+                continue
             if module in self.modules_processed:
                 previous_name, previous_wrapper = self.modules_processed[module]
                 warnings.warn("Module '{0}' references to same module as '{1}'."
diff --git a/distiller/quantization/range_linear.py b/distiller/quantization/range_linear.py
index 703e872..37101a8 100644
--- a/distiller/quantization/range_linear.py
+++ b/distiller/quantization/range_linear.py
@@ -1210,6 +1210,7 @@ class PostTrainLinearQuantizer(Quantizer):
 
         self.replacement_factory[nn.Embedding] = replace_embedding
         self.default_repalcement_fn = replace_fake_quant
+        self.replacement_blacklist.append(nn.Dropout)
 
         save_dir = msglogger.logdir if hasattr(msglogger, 'logdir') else '.'
         self.save_per_layer_parameters(save_dir)
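The blacklist check added above runs at the top of the per-module loop in `_pre_process_container`, before any replacement-factory lookup, so a blacklisted module type is logged and left in the model untouched. `PostTrainLinearQuantizer` puts `nn.Dropout` on the list because dropout is a no-op at inference time, and wrapping it in a fake-quant module only adds noise. A rough usage sketch, assuming the `prepare_model(dummy_input=...)` entry point of Distiller from this era (the toy model and shapes are illustrative, not part of the patch):

    import torch
    import torch.nn as nn
    from distiller.quantization import PostTrainLinearQuantizer

    # Toy model: Dropout sits between two quantizable layers.
    model = nn.Sequential(nn.Linear(16, 16), nn.Dropout(0.5), nn.Linear(16, 4))
    model.eval()

    quantizer = PostTrainLinearQuantizer(model)
    quantizer.prepare_model(dummy_input=torch.randn(1, 16))

    # The Linear layers are now range-linear wrappers, while the blacklisted
    # Dropout is left as a plain nn.Dropout instead of being fake-quantized.
    print(type(model[1]))

Any module type a subclass appends to `self.replacement_blacklist` gets the same treatment.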
diff --git a/examples/GNMT/model_stats.yaml b/examples/GNMT/acts_quantization_stats.yaml
similarity index 89%
rename from examples/GNMT/model_stats.yaml
rename to examples/GNMT/acts_quantization_stats.yaml
index 32de9d7..cbfec89 100644
--- a/examples/GNMT/model_stats.yaml
+++ b/examples/GNMT/acts_quantization_stats.yaml
@@ -7,6 +7,7 @@ encoder.rnn_layers.0.cells.0.fc_gate_x:
       avg_max: 2.8138245348501125
       mean: -0.0016106524300209288
       std: 0.8805053743938167
+      b: 0.6866011405925363
       shape: (1, 1024)
   output:
     min: -33.63774490356445
@@ -15,6 +16,7 @@ encoder.rnn_layers.0.cells.0.fc_gate_x:
     avg_max: 18.377735952944196
     mean: -1.2778008344058638
     std: 5.574481019638121
+    b: 4.439704778263645
     shape: (1, 4096)
 encoder.rnn_layers.0.cells.0.fc_gate_h:
   inputs:
@@ -25,6 +27,7 @@ encoder.rnn_layers.0.cells.0.fc_gate_h:
       avg_max: 0.8228753872990404
       mean: -0.002536592865063468
       std: 0.23544047089016987
+      b: 0.09098815764509137
       shape: (1, 1024)
   output:
     min: -22.65224838256836
@@ -33,6 +36,7 @@ encoder.rnn_layers.0.cells.0.fc_gate_h:
     avg_max: 6.881520398276833
     mean: -2.373206253470364
     std: 2.8913670592601197
+    b: 2.250662151870849
     shape: (1, 4096)
 encoder.rnn_layers.0.cells.0.eltwiseadd_gate:
   inputs:
@@ -43,6 +47,7 @@ encoder.rnn_layers.0.cells.0.eltwiseadd_gate:
       avg_max: 18.377735952944196
       mean: -1.2778008344058638
       std: 5.574481019638121
+      b: 4.439704778263645
       shape: (1, 4096)
     1:
       min: -22.65224838256836
@@ -51,6 +56,7 @@ encoder.rnn_layers.0.cells.0.eltwiseadd_gate:
       avg_max: 6.881520398276833
       mean: -2.373206253470364
       std: 2.8913670592601197
+      b: 2.250662151870849
       shape: (1, 4096)
   output:
     min: -40.20295333862305
@@ -59,6 +65,7 @@ encoder.rnn_layers.0.cells.0.eltwiseadd_gate:
     avg_max: 18.852929802167523
     mean: -3.6510070722635595
     std: 6.424404119623572
+    b: 5.151854954224653
     shape: (1, 4096)
 encoder.rnn_layers.0.cells.0.act_f:
   inputs:
@@ -69,6 +76,7 @@ encoder.rnn_layers.0.cells.0.act_f:
       avg_max: 11.772601394017986
       mean: -7.432837164007311
       std: 5.768370101504384
+      b: 4.524102567588817
       shape: (1, 1024)
   output:
     min: 3.4680012470380246e-18
@@ -77,6 +85,7 @@ encoder.rnn_layers.0.cells.0.act_f:
     avg_max: 0.9997981336035177
     mean: 0.11436928112791814
     std: 0.2739924793333765
+    b: 0.17960153096657158
     shape: (1, 1024)
 encoder.rnn_layers.0.cells.0.act_i:
   inputs:
@@ -87,6 +96,7 @@ encoder.rnn_layers.0.cells.0.act_i:
      avg_max: 15.238747643933712
      mean: -3.533190907395537
      std: 5.892242776130438
+     b: 4.720861847081462
      shape: (1, 1024)
   output:
     min: 4.37388120446097e-15
@@ -95,6 +105,7 @@ encoder.rnn_layers.0.cells.0.act_i:
     avg_max: 0.9999727739693318
     mean: 0.27593418032485884
     std: 0.3954827759906979
+    b: 0.3464240996079812
     shape: (1, 1024)
 encoder.rnn_layers.0.cells.0.act_o:
   inputs:
@@ -105,6 +116,7 @@ encoder.rnn_layers.0.cells.0.act_o:
       avg_max: 14.524603366545767
       mean: -3.567072719363816
       std: 5.69751302438932
+      b: 4.504203562788745
       shape: (1, 1024)
   output:
     min: 7.870648145337088e-16
@@ -113,6 +125,7 @@ encoder.rnn_layers.0.cells.0.act_o:
     avg_max: 0.9999628556888489
     mean: 0.2618299175561274
     std: 0.3833761347669008
+    b: 0.3292445638791598
     shape: (1, 1024)
 encoder.rnn_layers.0.cells.0.act_g:
   inputs:
@@ -123,6 +136,7 @@ encoder.rnn_layers.0.cells.0.act_g:
       avg_max: 18.48774343457505
       mean: -0.0709275224609481
       std: 6.116932441002841
+      b: 4.936153843000304
       shape: (1, 1024)
   output:
     min: -1.0
@@ -131,6 +145,7 @@ encoder.rnn_layers.0.cells.0.act_g:
     avg_max: 0.9999999999947166
     mean: -0.007298258008836419
     std: 0.9385271485476292
+    b: 0.917864403999836
     shape: (1, 1024)
encoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: inputs: @@ -141,6 +156,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: avg_max: 0.9997981336035177 mean: 0.11436928112791814 std: 0.2739924793333765 + b: 0.17960153096657158 shape: (1, 1024) 1: min: -38.982444763183594 @@ -149,6 +165,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: avg_max: 1.6211083284579975 mean: -0.00908752453454041 std: 0.5260199992435526 + b: 0.28379768503898656 shape: (1, 1024) output: min: -38.97710418701172 @@ -157,6 +174,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: avg_max: 1.0867458244325479 mean: -0.0052387909689317535 std: 0.24397997338840743 + b: 0.040725299804158094 shape: (1, 1024) encoder.rnn_layers.0.cells.0.eltwisemult_cell_input: inputs: @@ -167,6 +185,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_cell_input: avg_max: 0.9999727739693318 mean: 0.27593418032485884 std: 0.3954827759906979 + b: 0.3464240996079812 shape: (1, 1024) 1: min: -1.0 @@ -175,6 +194,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_cell_input: avg_max: 0.9999999999947166 mean: -0.007298258008836419 std: 0.9385271485476292 + b: 0.917864403999836 shape: (1, 1024) output: min: -1.0 @@ -183,6 +203,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_cell_input: avg_max: 0.9998500162423687 mean: -0.004381999838699326 std: 0.4680802328708717 + b: 0.2654821227074416 shape: (1, 1024) encoder.rnn_layers.0.cells.0.eltwiseadd_cell: inputs: @@ -193,6 +214,7 @@ encoder.rnn_layers.0.cells.0.eltwiseadd_cell: avg_max: 1.0867458244325479 mean: -0.0052387909689317535 std: 0.24397997338840743 + b: 0.040725299804158094 shape: (1, 1024) 1: min: -1.0 @@ -201,6 +223,7 @@ encoder.rnn_layers.0.cells.0.eltwiseadd_cell: avg_max: 0.9998500162423687 mean: -0.004381999838699326 std: 0.4680802328708717 + b: 0.2654821227074416 shape: (1, 1024) output: min: -39.97100830078125 @@ -209,6 +232,7 @@ encoder.rnn_layers.0.cells.0.eltwiseadd_cell: avg_max: 1.6904500271956118 mean: -0.009620790795830529 std: 0.5374170427844072 + b: 0.294500598101108 shape: (1, 1024) encoder.rnn_layers.0.cells.0.act_h: inputs: @@ -219,6 +243,7 @@ encoder.rnn_layers.0.cells.0.act_h: avg_max: 1.6904500271956118 mean: -0.009620790795830529 std: 0.5374170427844072 + b: 0.294500598101108 shape: (1, 1024) output: min: -1.0 @@ -227,6 +252,7 @@ encoder.rnn_layers.0.cells.0.act_h: avg_max: 0.9111029639984748 mean: -0.0042562744052456435 std: 0.3862437789687602 + b: 0.23299700498688 shape: (1, 1024) encoder.rnn_layers.0.cells.0.eltwisemult_hidden: inputs: @@ -237,6 +263,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_hidden: avg_max: 0.9999628556888489 mean: 0.2618299175561274 std: 0.3833761347669008 + b: 0.3292445638791598 shape: (1, 1024) 1: min: -1.0 @@ -245,6 +272,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_hidden: avg_max: 0.9111029639984748 mean: -0.0042562744052456435 std: 0.3862437789687602 + b: 0.23299700498688 shape: (1, 1024) output: min: -1.0 @@ -253,6 +281,7 @@ encoder.rnn_layers.0.cells.0.eltwisemult_hidden: avg_max: 0.8561479863309971 mean: -0.0026583670083781055 std: 0.24113962918670837 + b: 0.09507561210187652 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.fc_gate_x: inputs: @@ -263,6 +292,7 @@ encoder.rnn_layers.0.cells_reverse.0.fc_gate_x: avg_max: 2.8138245348501405 mean: -0.0016106524300209281 std: 0.8805050697722259 + b: 0.6866011405925253 shape: (1, 1024) output: min: -33.94660949707031 @@ -271,6 +301,7 @@ encoder.rnn_layers.0.cells_reverse.0.fc_gate_x: avg_max: 18.700496405858367 mean: -0.4969901222221274 std: 5.666540069661909 + b: 
4.608289882615895 shape: (1, 4096) encoder.rnn_layers.0.cells_reverse.0.fc_gate_h: inputs: @@ -281,6 +312,7 @@ encoder.rnn_layers.0.cells_reverse.0.fc_gate_h: avg_max: 0.9217871793855027 mean: -0.0008159191441113994 std: 0.2933630896077876 + b: 0.140212070574075 shape: (1, 1024) output: min: -26.57025146484375 @@ -289,6 +321,7 @@ encoder.rnn_layers.0.cells_reverse.0.fc_gate_h: avg_max: 13.256404327800057 mean: -1.6347749916045144 std: 3.3098913033147026 + b: 2.551923321663966 shape: (1, 4096) encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_gate: inputs: @@ -299,6 +332,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_gate: avg_max: 18.700496405858367 mean: -0.4969901222221274 std: 5.666540069661909 + b: 4.608289882615895 shape: (1, 4096) 1: min: -26.57025146484375 @@ -307,6 +341,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_gate: avg_max: 13.256404327800057 mean: -1.6347749916045144 std: 3.3098913033147026 + b: 2.551923321663966 shape: (1, 4096) output: min: -38.69415283203125 @@ -315,6 +350,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_gate: avg_max: 21.477238427920927 mean: -2.131765110762214 std: 6.632126340207978 + b: 5.37469070170134 shape: (1, 4096) encoder.rnn_layers.0.cells_reverse.0.act_f: inputs: @@ -325,6 +361,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_f: avg_max: 17.50440279642364 mean: -3.8737916625820072 std: 6.84193347529921 + b: 5.51798325095842 shape: (1, 1024) output: min: 2.5389102872982082e-17 @@ -333,6 +370,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_f: avg_max: 0.999999346159746 mean: 0.2890327369231582 std: 0.4023833394248511 + b: 0.3566244444111821 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.act_i: inputs: @@ -343,6 +381,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_i: avg_max: 16.691689617168617 mean: -3.073931181881033 std: 6.2677184148953256 + b: 5.060755877760605 shape: (1, 1024) output: min: 2.3756509732875972e-17 @@ -351,6 +390,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_i: avg_max: 0.9999988585396135 mean: 0.3108025884336769 std: 0.4110379687299371 + b: 0.3706470294060169 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.act_o: inputs: @@ -361,6 +401,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_o: avg_max: 17.492547388038638 mean: -1.5878526072804218 std: 6.0648371330715545 + b: 4.911654308863885 shape: (1, 1024) output: min: 1.5679886799044933e-17 @@ -369,6 +410,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_o: avg_max: 0.9999994433218858 mean: 0.39423968600374154 std: 0.4279581642861166 + b: 0.402148790720451 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.act_g: inputs: @@ -379,6 +421,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_g: avg_max: 20.84152177737822 mean: 0.008514990976159644 std: 6.647000615879316 + b: 5.393258296043739 shape: (1, 1024) output: min: -1.0 @@ -387,6 +430,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_g: avg_max: 1.0 mean: 0.0014180161330502351 std: 0.9442339332189296 + b: 0.9252437063664418 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_forget: inputs: @@ -397,6 +441,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_forget: avg_max: 0.999999346159746 mean: 0.2890327369231582 std: 0.4023833394248511 + b: 0.3566244444111821 shape: (1, 1024) 1: min: -29.373083114624023 @@ -405,6 +450,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_forget: avg_max: 3.9063054220693965 mean: -0.0008279334693150963 std: 0.6366862423327254 + b: 0.38663300925836047 shape: (1, 1024) output: min: -29.373075485229492 @@ -413,6 +459,7 @@ 
encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_forget: avg_max: 3.729673293131146 mean: 0.00015181826198886874 std: 0.3949363630374722 + b: 0.13394309130835125 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_input: inputs: @@ -423,6 +470,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_input: avg_max: 0.9999988585396135 mean: 0.3108025884336769 std: 0.4110379687299371 + b: 0.3706470294060169 shape: (1, 1024) 1: min: -1.0 @@ -431,6 +479,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_input: avg_max: 1.0 mean: 0.0014180161330502351 std: 0.9442339332189296 + b: 0.9252437063664418 shape: (1, 1024) output: min: -1.0 @@ -439,6 +488,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_cell_input: avg_max: 0.9999928301513564 mean: -0.0005279613764160218 std: 0.5003141965724145 + b: 0.29767097203335213 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_cell: inputs: @@ -449,6 +499,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_cell: avg_max: 3.729673293131146 mean: 0.00015181826198886874 std: 0.3949363630374722 + b: 0.13394309130835125 shape: (1, 1024) 1: min: -1.0 @@ -457,6 +508,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_cell: avg_max: 0.9999928301513564 mean: -0.0005279613764160218 std: 0.5003141965724145 + b: 0.29767097203335213 shape: (1, 1024) output: min: -29.373083114624023 @@ -465,6 +517,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwiseadd_cell: avg_max: 4.100828251393332 mean: -0.00037614313566861415 std: 0.6483412291978414 + b: 0.4005083799864954 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.act_h: inputs: @@ -475,6 +528,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_h: avg_max: 4.100828251393332 mean: -0.00037614313566861415 std: 0.6483412291978414 + b: 0.4005083799864954 shape: (1, 1024) output: min: -1.0 @@ -483,6 +537,7 @@ encoder.rnn_layers.0.cells_reverse.0.act_h: avg_max: 0.9818776522137024 mean: -0.0013970042477792044 std: 0.45158473029410123 + b: 0.30759742446588034 shape: (1, 1024) encoder.rnn_layers.0.cells_reverse.0.eltwisemult_hidden: inputs: @@ -493,6 +548,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_hidden: avg_max: 0.9999994433218858 mean: 0.39423968600374154 std: 0.4279581642861166 + b: 0.402148790720451 shape: (1, 1024) 1: min: -1.0 @@ -501,6 +557,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_hidden: avg_max: 0.9818776522137024 mean: -0.0013970042477792044 std: 0.45158473029410123 + b: 0.30759742446588034 shape: (1, 1024) output: min: -1.0 @@ -509,6 +566,7 @@ encoder.rnn_layers.0.cells_reverse.0.eltwisemult_hidden: avg_max: 0.9564541215110065 mean: -0.0005957838928230832 std: 0.2985633225360773 + b: 0.1449251490997063 shape: (1, 1024) encoder.rnn_layers.0.dropout: output: @@ -518,6 +576,7 @@ encoder.rnn_layers.0.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' encoder.rnn_layers.1.cells.0.fc_gate_x: inputs: @@ -528,6 +587,7 @@ encoder.rnn_layers.1.cells.0.fc_gate_x: avg_max: 0.3723408187603086 mean: -0.0006261473494000447 std: 0.1684595854192406 + b: 0.046348249174828215 shape: (128, 2048) output: min: -28.018692016601562 @@ -536,6 +596,7 @@ encoder.rnn_layers.1.cells.0.fc_gate_x: avg_max: 4.8616450890998655 mean: -0.5998025477032523 std: 2.1475245530098204 + b: 1.30492351677527 shape: (128, 4096) encoder.rnn_layers.1.cells.0.fc_gate_h: inputs: @@ -546,6 +607,7 @@ encoder.rnn_layers.1.cells.0.fc_gate_h: avg_max: 0.9067187918312595 mean: -0.00016366511202441184 std: 0.17426053388600266 + b: 0.08171782312204447 shape: (128, 1024) output: min: -22.963897705078125 @@ 
-554,6 +616,7 @@ encoder.rnn_layers.1.cells.0.fc_gate_h: avg_max: 8.383283630046785 mean: -0.9507504072732051 std: 1.9114254594378783 + b: 1.4288094437429024 shape: (128, 4096) encoder.rnn_layers.1.cells.0.eltwiseadd_gate: inputs: @@ -564,6 +627,7 @@ encoder.rnn_layers.1.cells.0.eltwiseadd_gate: avg_max: 4.8616450890998655 mean: -0.5998025477032523 std: 2.1475245530098204 + b: 1.30492351677527 shape: (128, 4096) 1: min: -22.963897705078125 @@ -572,6 +636,7 @@ encoder.rnn_layers.1.cells.0.eltwiseadd_gate: avg_max: 8.383283630046785 mean: -0.9507504072732051 std: 1.9114254594378783 + b: 1.4288094437429024 shape: (128, 4096) output: min: -31.574262619018555 @@ -580,6 +645,7 @@ encoder.rnn_layers.1.cells.0.eltwiseadd_gate: avg_max: 10.518839543297785 mean: -1.5505529552098096 std: 3.1590192373764134 + b: 2.308331574302921 shape: (128, 4096) encoder.rnn_layers.1.cells.0.act_f: inputs: @@ -590,6 +656,7 @@ encoder.rnn_layers.1.cells.0.act_f: avg_max: 8.090042744440224 mean: -3.3116955970157718 std: 3.4230677646590277 + b: 2.4293571345891474 shape: (128, 1024) output: min: 1.9385273957893065e-14 @@ -598,6 +665,7 @@ encoder.rnn_layers.1.cells.0.act_f: avg_max: 0.9984393432169701 mean: 0.16464387024575694 std: 0.2689671071958494 + b: 0.19436692710657527 shape: (128, 1024) encoder.rnn_layers.1.cells.0.act_i: inputs: @@ -608,6 +676,7 @@ encoder.rnn_layers.1.cells.0.act_i: avg_max: 6.228059029819189 mean: -1.7129896603174661 std: 2.7638664904974917 + b: 2.0058580389625518 shape: (128, 1024) output: min: 2.1981662367068222e-13 @@ -616,6 +685,7 @@ encoder.rnn_layers.1.cells.0.act_i: avg_max: 0.9786340831643511 mean: 0.2868795179297321 std: 0.27602027714229127 + b: 0.22691229005787997 shape: (128, 1024) encoder.rnn_layers.1.cells.0.act_o: inputs: @@ -626,6 +696,7 @@ encoder.rnn_layers.1.cells.0.act_o: avg_max: 9.042313837365008 mean: -1.2052700564898609 std: 2.6057113237143055 + b: 1.8401764845288056 shape: (128, 1024) output: min: 6.9000694914722605e-12 @@ -634,6 +705,7 @@ encoder.rnn_layers.1.cells.0.act_o: avg_max: 0.9974105607476547 mean: 0.33604429936415675 std: 0.2886479601419748 + b: 0.24216563336267813 shape: (128, 1024) encoder.rnn_layers.1.cells.0.act_g: inputs: @@ -644,6 +716,7 @@ encoder.rnn_layers.1.cells.0.act_g: avg_max: 8.65820987619275 mean: 0.027743494574744162 std: 2.8342722477735816 + b: 1.8500435286986996 shape: (128, 1024) output: min: -1.0 @@ -652,6 +725,7 @@ encoder.rnn_layers.1.cells.0.act_g: avg_max: 0.9996711071705672 mean: 0.015988719393334825 std: 0.7484585498246807 + b: 0.6760022391875584 shape: (128, 1024) encoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: inputs: @@ -662,6 +736,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: avg_max: 0.9984393432169701 mean: 0.16464387024575694 std: 0.2689671071958494 + b: 0.19436692710657527 shape: (128, 1024) 1: min: -86.69337463378906 @@ -670,6 +745,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: avg_max: 12.752722294005268 mean: -0.023152882641702514 std: 1.2572998992641964 + b: 0.3008873190551628 shape: (128, 1024) output: min: -86.67153930664062 @@ -678,6 +754,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: avg_max: 12.574206966781782 mean: -0.030583657865831754 std: 1.2066110910522718 + b: 0.1395411149794506 shape: (128, 1024) encoder.rnn_layers.1.cells.0.eltwisemult_cell_input: inputs: @@ -688,6 +765,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_cell_input: avg_max: 0.9786340831643511 mean: 0.2868795179297321 std: 0.27602027714229127 + b: 0.22691229005787997 shape: (128, 1024) 1: min: -1.0 @@ 
-696,6 +774,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_cell_input: avg_max: 0.9996711071705672 mean: 0.015988719393334825 std: 0.7484585498246807 + b: 0.6760022391875584 shape: (128, 1024) output: min: -1.0 @@ -704,6 +783,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_cell_input: avg_max: 0.9683758158411752 mean: 0.006315402761504094 std: 0.3088953017599018 + b: 0.18894667950535882 shape: (128, 1024) encoder.rnn_layers.1.cells.0.eltwiseadd_cell: inputs: @@ -714,6 +794,7 @@ encoder.rnn_layers.1.cells.0.eltwiseadd_cell: avg_max: 12.574206966781782 mean: -0.030583657865831754 std: 1.2066110910522718 + b: 0.1395411149794506 shape: (128, 1024) 1: min: -1.0 @@ -722,6 +803,7 @@ encoder.rnn_layers.1.cells.0.eltwiseadd_cell: avg_max: 0.9683758158411752 mean: 0.006315402761504094 std: 0.3088953017599018 + b: 0.18894667950535882 shape: (128, 1024) output: min: -87.62094116210938 @@ -730,6 +812,7 @@ encoder.rnn_layers.1.cells.0.eltwiseadd_cell: avg_max: 13.23383954467387 mean: -0.024268255073244073 std: 1.2882582959130635 + b: 0.30631777404138666 shape: (128, 1024) encoder.rnn_layers.1.cells.0.act_h: inputs: @@ -740,6 +823,7 @@ encoder.rnn_layers.1.cells.0.act_h: avg_max: 13.23383954467387 mean: -0.024268255073244073 std: 1.2882582959130635 + b: 0.30631777404138666 shape: (128, 1024) output: min: -1.0 @@ -748,6 +832,7 @@ encoder.rnn_layers.1.cells.0.act_h: avg_max: 0.982027827093264 mean: 0.005185764388532731 std: 0.3119488266195677 + b: 0.20768493218909959 shape: (128, 1024) encoder.rnn_layers.1.cells.0.eltwisemult_hidden: inputs: @@ -758,6 +843,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_hidden: avg_max: 0.9974105607476547 mean: 0.33604429936415675 std: 0.2886479601419748 + b: 0.24216563336267813 shape: (128, 1024) 1: min: -1.0 @@ -766,6 +852,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_hidden: avg_max: 0.982027827093264 mean: 0.005185764388532731 std: 0.3119488266195677 + b: 0.20768493218909959 shape: (128, 1024) output: min: -0.9999995231628418 @@ -774,6 +861,7 @@ encoder.rnn_layers.1.cells.0.eltwisemult_hidden: avg_max: 0.9199835463058242 mean: -0.00017594442323635664 std: 0.17510466702537447 + b: 0.0827674181820404 shape: (128, 1024) encoder.rnn_layers.1.dropout: output: @@ -783,6 +871,7 @@ encoder.rnn_layers.1.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' encoder.rnn_layers.2.cells.0.fc_gate_x: inputs: @@ -793,6 +882,7 @@ encoder.rnn_layers.2.cells.0.fc_gate_x: avg_max: 0.9199835463058242 mean: -0.00017594442323635664 std: 0.17510466702537447 + b: 0.0827674181820404 shape: (128, 1024) output: min: -17.669666290283203 @@ -801,6 +891,7 @@ encoder.rnn_layers.2.cells.0.fc_gate_x: avg_max: 5.579422310161379 mean: -0.7621265583790381 std: 1.6214516118322957 + b: 1.2066256898628278 shape: (128, 4096) encoder.rnn_layers.2.cells.0.fc_gate_h: inputs: @@ -811,6 +902,7 @@ encoder.rnn_layers.2.cells.0.fc_gate_h: avg_max: 0.8692965302824718 mean: -0.002193973255493973 std: 0.13664756692926694 + b: 0.054064400482378906 shape: (128, 1024) output: min: -12.899184226989746 @@ -819,6 +911,7 @@ encoder.rnn_layers.2.cells.0.fc_gate_h: avg_max: 6.135640713992537 mean: -0.7798955419852971 std: 1.2988276201978577 + b: 1.0101514969362784 shape: (128, 4096) encoder.rnn_layers.2.cells.0.eltwiseadd_gate: inputs: @@ -829,6 +922,7 @@ encoder.rnn_layers.2.cells.0.eltwiseadd_gate: avg_max: 5.579422310161379 mean: -0.7621265583790381 std: 1.6214516118322957 + b: 1.2066256898628278 shape: (128, 4096) 1: min: -12.899184226989746 @@ -837,6 +931,7 @@ encoder.rnn_layers.2.cells.0.eltwiseadd_gate: avg_max: 
6.135640713992537 mean: -0.7798955419852971 std: 1.2988276201978577 + b: 1.0101514969362784 shape: (128, 4096) output: min: -22.86528968811035 @@ -845,6 +940,7 @@ encoder.rnn_layers.2.cells.0.eltwiseadd_gate: avg_max: 10.091681053441096 mean: -1.5420221014644224 std: 2.4497739415101045 + b: 1.9066353796312498 shape: (128, 4096) encoder.rnn_layers.2.cells.0.act_f: inputs: @@ -855,6 +951,7 @@ encoder.rnn_layers.2.cells.0.act_f: avg_max: 9.76669416798307 mean: -1.7373663354206654 std: 2.701828662299363 + b: 2.0874998922182675 shape: (128, 1024) output: min: 1.1741696503975163e-10 @@ -863,6 +960,7 @@ encoder.rnn_layers.2.cells.0.act_f: avg_max: 0.9971977971010819 mean: 0.2779621332383798 std: 0.3120235028974232 + b: 0.2603461732650365 shape: (128, 1024) encoder.rnn_layers.2.cells.0.act_i: inputs: @@ -873,6 +971,7 @@ encoder.rnn_layers.2.cells.0.act_i: avg_max: 5.007397494993483 mean: -2.482964531553934 std: 2.096762127403773 + b: 1.577551453372244 shape: (128, 1024) output: min: 3.632128597885753e-09 @@ -881,6 +980,7 @@ encoder.rnn_layers.2.cells.0.act_i: avg_max: 0.9815354767928449 mean: 0.17026146355457997 std: 0.22359512877822252 + b: 0.16413162317818702 shape: (128, 1024) encoder.rnn_layers.2.cells.0.act_o: inputs: @@ -891,6 +991,7 @@ encoder.rnn_layers.2.cells.0.act_o: avg_max: 6.900344703221481 mean: -1.9338501979173957 std: 2.188371648150393 + b: 1.6447856127549068 shape: (128, 1024) output: min: 1.0016504292664763e-09 @@ -899,6 +1000,7 @@ encoder.rnn_layers.2.cells.0.act_o: avg_max: 0.996341593846912 mean: 0.2302334359294881 std: 0.2644176532615949 + b: 0.20757872787548 shape: (128, 1024) encoder.rnn_layers.2.cells.0.act_g: inputs: @@ -909,6 +1011,7 @@ encoder.rnn_layers.2.cells.0.act_g: avg_max: 6.198857292499591 mean: -0.013907334995555003 std: 2.0268158859398273 + b: 1.5166460454830637 shape: (128, 1024) output: min: -1.0 @@ -917,6 +1020,7 @@ encoder.rnn_layers.2.cells.0.act_g: avg_max: 0.999913128430412 mean: 0.006783831941798703 std: 0.7667732969711837 + b: 0.7053634235156199 shape: (128, 1024) encoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: inputs: @@ -927,6 +1031,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: avg_max: 0.9971977971010819 mean: 0.2779621332383798 std: 0.3120235028974232 + b: 0.2603461732650365 shape: (128, 1024) 1: min: -70.65399169921875 @@ -935,6 +1040,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: avg_max: 11.433578196754656 mean: -0.007533524133234033 std: 0.9619221798317081 + b: 0.2497865464606052 shape: (128, 1024) output: min: -70.47572326660156 @@ -943,6 +1049,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: avg_max: 11.29573609329911 mean: -0.0077121059270617645 std: 0.9119806954490677 + b: 0.13775959665708915 shape: (128, 1024) encoder.rnn_layers.2.cells.0.eltwisemult_cell_input: inputs: @@ -953,6 +1060,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_cell_input: avg_max: 0.9815354767928449 mean: 0.17026146355457997 std: 0.22359512877822252 + b: 0.16413162317818702 shape: (128, 1024) 1: min: -1.0 @@ -961,6 +1069,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_cell_input: avg_max: 0.999913128430412 mean: 0.006783831941798703 std: 0.7667732969711837 + b: 0.7053634235156199 shape: (128, 1024) output: min: -1.0 @@ -969,6 +1078,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_cell_input: avg_max: 0.9338277108983972 mean: 0.00031330419493738533 std: 0.23285220882823307 + b: 0.12565061154356547 shape: (128, 1024) encoder.rnn_layers.2.cells.0.eltwiseadd_cell: inputs: @@ -979,6 +1089,7 @@ 
encoder.rnn_layers.2.cells.0.eltwiseadd_cell: avg_max: 11.29573609329911 mean: -0.0077121059270617645 std: 0.9119806954490677 + b: 0.13775959665708915 shape: (128, 1024) 1: min: -1.0 @@ -987,6 +1098,7 @@ encoder.rnn_layers.2.cells.0.eltwiseadd_cell: avg_max: 0.9338277108983972 mean: 0.00031330419493738533 std: 0.23285220882823307 + b: 0.12565061154356547 shape: (128, 1024) output: min: -71.34825897216797 @@ -995,6 +1107,7 @@ encoder.rnn_layers.2.cells.0.eltwiseadd_cell: avg_max: 11.882522354690009 mean: -0.007398801722887531 std: 0.9852414934516581 + b: 0.2546569724510982 shape: (128, 1024) encoder.rnn_layers.2.cells.0.act_h: inputs: @@ -1005,6 +1118,7 @@ encoder.rnn_layers.2.cells.0.act_h: avg_max: 11.882522354690009 mean: -0.007398801722887531 std: 0.9852414934516581 + b: 0.2546569724510982 shape: (128, 1024) output: min: -1.0 @@ -1013,6 +1127,7 @@ encoder.rnn_layers.2.cells.0.act_h: avg_max: 0.9643497033340547 mean: -0.0026311075007701672 std: 0.2839022589840224 + b: 0.17143513807314364 shape: (128, 1024) encoder.rnn_layers.2.cells.0.eltwisemult_hidden: inputs: @@ -1023,6 +1138,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_hidden: avg_max: 0.996341593846912 mean: 0.2302334359294881 std: 0.2644176532615949 + b: 0.20757872787548 shape: (128, 1024) 1: min: -1.0 @@ -1031,6 +1147,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_hidden: avg_max: 0.9643497033340547 mean: -0.0026311075007701672 std: 0.2839022589840224 + b: 0.17143513807314364 shape: (128, 1024) output: min: -0.9999260902404785 @@ -1039,6 +1156,7 @@ encoder.rnn_layers.2.cells.0.eltwisemult_hidden: avg_max: 0.8823925270396857 mean: -0.002203096514809122 std: 0.1377186248199642 + b: 0.05484113178447183 shape: (128, 1024) encoder.rnn_layers.2.dropout: output: @@ -1048,6 +1166,7 @@ encoder.rnn_layers.2.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' encoder.rnn_layers.3.cells.0.fc_gate_x: inputs: @@ -1058,6 +1177,7 @@ encoder.rnn_layers.3.cells.0.fc_gate_x: avg_max: 1.4346018574915211 mean: -0.0023790409310782876 std: 0.23082689970193895 + b: 0.12216927285224903 shape: (128, 1024) output: min: -18.958541870117188 @@ -1066,6 +1186,7 @@ encoder.rnn_layers.3.cells.0.fc_gate_x: avg_max: 7.386176010906286 mean: -0.7425925570313029 std: 1.9741718173439569 + b: 1.4889423477169665 shape: (128, 4096) encoder.rnn_layers.3.cells.0.fc_gate_h: inputs: @@ -1076,6 +1197,7 @@ encoder.rnn_layers.3.cells.0.fc_gate_h: avg_max: 0.9680624933584159 mean: -5.464283497081485e-05 std: 0.1605789723297388 + b: 0.056299100504511694 shape: (128, 1024) output: min: -17.731414794921875 @@ -1084,6 +1206,7 @@ encoder.rnn_layers.3.cells.0.fc_gate_h: avg_max: 8.483382081558762 mean: -1.0937484681389607 std: 1.8863347454511785 + b: 1.4338658784393215 shape: (128, 4096) encoder.rnn_layers.3.cells.0.eltwiseadd_gate: inputs: @@ -1094,6 +1217,7 @@ encoder.rnn_layers.3.cells.0.eltwiseadd_gate: avg_max: 7.386176010906286 mean: -0.7425925570313029 std: 1.9741718173439569 + b: 1.4889423477169665 shape: (128, 4096) 1: min: -17.731414794921875 @@ -1102,6 +1226,7 @@ encoder.rnn_layers.3.cells.0.eltwiseadd_gate: avg_max: 8.483382081558762 mean: -1.0937484681389607 std: 1.8863347454511785 + b: 1.4338658784393215 shape: (128, 4096) output: min: -25.56599998474121 @@ -1110,6 +1235,7 @@ encoder.rnn_layers.3.cells.0.eltwiseadd_gate: avg_max: 13.726210550973875 mean: -1.8363410236118098 std: 3.220885522048646 + b: 2.500927935644017 shape: (128, 4096) encoder.rnn_layers.3.cells.0.act_f: inputs: @@ -1120,6 +1246,7 @@ encoder.rnn_layers.3.cells.0.act_f: avg_max: 11.459924239973608 mean: 
-1.7886736150289273 std: 3.611997766132868 + b: 2.852256792237028 shape: (128, 1024) output: min: 7.885464850532209e-12 @@ -1128,6 +1255,7 @@ encoder.rnn_layers.3.cells.0.act_f: avg_max: 0.9981180747150038 mean: 0.3085659271068626 std: 0.362517692693201 + b: 0.3166746377811603 shape: (128, 1024) encoder.rnn_layers.3.cells.0.act_i: inputs: @@ -1138,6 +1266,7 @@ encoder.rnn_layers.3.cells.0.act_i: avg_max: 5.387517666390012 mean: -3.4113261673007766 std: 2.452103517547047 + b: 1.8651447216136345 shape: (128, 1024) output: min: 3.070241560987341e-10 @@ -1146,6 +1275,7 @@ encoder.rnn_layers.3.cells.0.act_i: avg_max: 0.9767627610722911 mean: 0.1199813902886455 std: 0.20597901242826788 + b: 0.13689427367376142 shape: (128, 1024) encoder.rnn_layers.3.cells.0.act_o: inputs: @@ -1156,6 +1286,7 @@ encoder.rnn_layers.3.cells.0.act_o: avg_max: 12.915865638912122 mean: -2.1100439747307944 std: 2.7725332585946427 + b: 2.0674156388580402 shape: (128, 1024) output: min: 3.422208905146107e-10 @@ -1164,6 +1295,7 @@ encoder.rnn_layers.3.cells.0.act_o: avg_max: 0.9998594416467933 mean: 0.23564658800694233 std: 0.2900290182410238 + b: 0.23153820739729833 shape: (128, 1024) encoder.rnn_layers.3.cells.0.act_g: inputs: @@ -1174,6 +1306,7 @@ encoder.rnn_layers.3.cells.0.act_g: avg_max: 10.109944212490001 mean: -0.035320331854928055 std: 2.991404617719586 + b: 2.2730739073332047 shape: (128, 1024) output: min: -1.0 @@ -1182,6 +1315,7 @@ encoder.rnn_layers.3.cells.0.act_g: avg_max: 0.9999893591374617 mean: -0.017447813011902232 std: 0.8383612744760685 + b: 0.7895965214810386 shape: (128, 1024) encoder.rnn_layers.3.cells.0.eltwisemult_cell_forget: inputs: @@ -1192,6 +1326,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_cell_forget: avg_max: 0.9981180747150038 mean: 0.3085659271068626 std: 0.362517692693201 + b: 0.3166746377811603 shape: (128, 1024) 1: min: -72.4426498413086 @@ -1200,6 +1335,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_cell_forget: avg_max: 12.621639567200226 mean: 0.001298973259466056 std: 1.3620516853160491 + b: 0.3307023400703073 shape: (128, 1024) output: min: -72.35834503173828 @@ -1208,6 +1344,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_cell_forget: avg_max: 12.483070900776225 mean: -0.0005504995357517113 std: 1.3219009414904979 + b: 0.24167430174629564 shape: (128, 1024) encoder.rnn_layers.3.cells.0.eltwisemult_cell_input: inputs: @@ -1218,6 +1355,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_cell_input: avg_max: 0.9767627610722911 mean: 0.1199813902886455 std: 0.20597901242826788 + b: 0.13689427367376142 shape: (128, 1024) 1: min: -1.0 @@ -1226,6 +1364,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_cell_input: avg_max: 0.9999893591374617 mean: -0.017447813011902232 std: 0.8383612744760685 + b: 0.7895965214810386 shape: (128, 1024) output: min: -1.0 @@ -1234,6 +1373,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_cell_input: avg_max: 0.9557934822118774 mean: 0.0013386550747985504 std: 0.21654626606567212 + b: 0.1011717223897744 shape: (128, 1024) encoder.rnn_layers.3.cells.0.eltwiseadd_cell: inputs: @@ -1244,6 +1384,7 @@ encoder.rnn_layers.3.cells.0.eltwiseadd_cell: avg_max: 12.483070900776225 mean: -0.0005504995357517113 std: 1.3219009414904979 + b: 0.24167430174629564 shape: (128, 1024) 1: min: -1.0 @@ -1252,6 +1393,7 @@ encoder.rnn_layers.3.cells.0.eltwiseadd_cell: avg_max: 0.9557934822118774 mean: 0.0013386550747985504 std: 0.21654626606567212 + b: 0.1011717223897744 shape: (128, 1024) output: min: -73.15592956542969 @@ -1260,6 +1402,7 @@ encoder.rnn_layers.3.cells.0.eltwiseadd_cell: avg_max: 
12.986537455179008 mean: 0.0007881555251539649 std: 1.3934345367324847 + b: 0.33818189893126244 shape: (128, 1024) encoder.rnn_layers.3.cells.0.act_h: inputs: @@ -1270,6 +1413,7 @@ encoder.rnn_layers.3.cells.0.act_h: avg_max: 12.986537455179008 mean: 0.0007881555251539649 std: 1.3934345367324847 + b: 0.33818189893126244 shape: (128, 1024) output: min: -1.0 @@ -1278,6 +1422,7 @@ encoder.rnn_layers.3.cells.0.act_h: avg_max: 0.9954337507019633 mean: 0.005215724385428704 std: 0.3152480532527837 + b: 0.17376577470793259 shape: (128, 1024) encoder.rnn_layers.3.cells.0.eltwisemult_hidden: inputs: @@ -1288,6 +1433,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_hidden: avg_max: 0.9998594416467933 mean: 0.23564658800694233 std: 0.2900290182410238 + b: 0.23153820739729833 shape: (128, 1024) 1: min: -1.0 @@ -1296,6 +1442,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_hidden: avg_max: 0.9954337507019633 mean: 0.005215724385428704 std: 0.3152480532527837 + b: 0.17376577470793259 shape: (128, 1024) output: min: -0.9999996423721313 @@ -1304,6 +1451,7 @@ encoder.rnn_layers.3.cells.0.eltwisemult_hidden: avg_max: 0.981412514344157 mean: -8.919282888996098e-05 std: 0.16194114228528503 + b: 0.05719050604400012 shape: (128, 1024) encoder.rnn_layers.3.dropout: output: @@ -1313,6 +1461,7 @@ encoder.rnn_layers.3.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' encoder.dropout: inputs: @@ -1323,6 +1472,7 @@ encoder.dropout: avg_max: 1.2545068023933308 mean: -0.0010675477744067996 std: 0.19403692465489641 + b: 0.08413264895271927 shape: (128, 67, 2048) output: min: -1.999041199684143 @@ -1331,16 +1481,18 @@ encoder.dropout: avg_max: 1.2545068023933308 mean: -0.0010675477744067996 std: 0.19403692465489641 + b: 0.08413264895271927 shape: (128, 67, 2048) encoder.embedder: inputs: 0: - min: 0 - max: 32103 - avg_min: 3.2019918366436153 - avg_max: 24950.48025676814 - mean: 2937.1528870343564 - std: 6014.0608835026405 + min: 0.0 + max: 32103.0 + avg_min: 3.201991836643651 + avg_max: 24950.480256547948 + mean: 2937.152918147278 + std: 6014.049883398042 + b: 4064.187827708714 shape: (128, 67) output: min: -5.349642276763916 @@ -1349,6 +1501,7 @@ encoder.embedder: avg_max: 2.8868405888984556 mean: 0.0012734194185268903 std: 0.8584338496588495 + b: 0.6693653134823229 shape: (128, 67, 1024) encoder.eltwiseadd_residuals.0: inputs: @@ -1359,6 +1512,7 @@ encoder.eltwiseadd_residuals.0: avg_max: 0.9888223583499591 mean: -0.0022379238847255083 std: 0.13743977474031957 + b: 0.05472927090401451 shape: (128, 67, 1024) 1: min: -0.9999995231628418 @@ -1367,6 +1521,7 @@ encoder.eltwiseadd_residuals.0: avg_max: 0.995096003015836 mean: -0.0001647915998243358 std: 0.17545655249766362 + b: 0.082812680862844 shape: (128, 67, 1024) output: min: -1.999041199684143 @@ -1375,6 +1530,7 @@ encoder.eltwiseadd_residuals.0: avg_max: 1.7707290103038151 mean: -0.0024027154916742197 std: 0.2309446271021139 + b: 0.12214652976642053 shape: (128, 67, 1024) encoder.eltwiseadd_residuals.1: inputs: @@ -1385,6 +1541,7 @@ encoder.eltwiseadd_residuals.1: avg_max: 0.9982590352495511 mean: -4.938640427099017e-05 std: 0.16142380765711173 + b: 0.05695545502627889 shape: (128, 67, 1024) 1: min: -1.999041199684143 @@ -1393,6 +1550,7 @@ encoder.eltwiseadd_residuals.1: avg_max: 1.7707290103038151 mean: -0.0024027154916742197 std: 0.2309446271021139 + b: 0.12214652976642053 shape: (128, 67, 1024) output: min: -2.9931235313415527 @@ -1401,6 +1559,7 @@ encoder.eltwiseadd_residuals.1: avg_max: 2.5017193754514055 mean: -0.0024521019076928496 std: 0.2936707415080642 + b: 
0.1581717822700739 shape: (128, 67, 1024) decoder.att_rnn.rnn.cells.0.fc_gate_x: inputs: @@ -1411,6 +1570,7 @@ decoder.att_rnn.rnn.cells.0.fc_gate_x: avg_max: 2.8718470222494563 mean: 0.0012989030032717187 std: 0.8618320366411912 + b: 0.6747593729013805 shape: (1280, 1024) output: min: -32.593318939208984 @@ -1419,6 +1579,7 @@ decoder.att_rnn.rnn.cells.0.fc_gate_x: avg_max: 18.39185942424816 mean: -0.319708395848218 std: 5.248436831320663 + b: 4.172536984186494 shape: (1280, 4096) decoder.att_rnn.rnn.cells.0.fc_gate_h: inputs: @@ -1429,6 +1590,7 @@ decoder.att_rnn.rnn.cells.0.fc_gate_h: avg_max: 0.9817781538775797 mean: 0.007541471822687429 std: 0.3220154108800773 + b: 0.1654869589093926 shape: (1280, 1024) output: min: -35.26850891113281 @@ -1437,6 +1599,7 @@ decoder.att_rnn.rnn.cells.0.fc_gate_h: avg_max: 22.16190616622404 mean: -1.9407802034043842 std: 4.835737332097758 + b: 3.6179673783564827 shape: (1280, 4096) decoder.att_rnn.rnn.cells.0.eltwiseadd_gate: inputs: @@ -1447,6 +1610,7 @@ decoder.att_rnn.rnn.cells.0.eltwiseadd_gate: avg_max: 18.39185942424816 mean: -0.319708395848218 std: 5.248436831320663 + b: 4.172536984186494 shape: (1280, 4096) 1: min: -35.26850891113281 @@ -1455,6 +1619,7 @@ decoder.att_rnn.rnn.cells.0.eltwiseadd_gate: avg_max: 22.16190616622404 mean: -1.9407802034043842 std: 4.835737332097758 + b: 3.6179673783564827 shape: (1280, 4096) output: min: -45.18857192993164 @@ -1463,6 +1628,7 @@ decoder.att_rnn.rnn.cells.0.eltwiseadd_gate: avg_max: 27.033264856660004 mean: -2.260488603289206 std: 7.086690069945273 + b: 5.629252473959749 shape: (1280, 4096) decoder.att_rnn.rnn.cells.0.act_f: inputs: @@ -1473,6 +1639,7 @@ decoder.att_rnn.rnn.cells.0.act_f: avg_max: 24.47041631977212 mean: -5.428667449415389 std: 7.5857557123719195 + b: 5.942447403307717 shape: (1280, 1024) output: min: 5.720600170657743e-19 @@ -1481,6 +1648,7 @@ decoder.att_rnn.rnn.cells.0.act_f: avg_max: 0.9999987296174087 mean: 0.22353376293282823 std: 0.37551881742884263 + b: 0.3097487535536952 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.act_i: inputs: @@ -1491,6 +1659,7 @@ decoder.att_rnn.rnn.cells.0.act_i: avg_max: 17.757313247745003 mean: -2.6210182911726867 std: 6.095373006020184 + b: 4.84681096920807 shape: (1280, 1024) output: min: 3.5364804280239927e-19 @@ -1499,6 +1668,7 @@ decoder.att_rnn.rnn.cells.0.act_i: avg_max: 0.9999995308980515 mean: 0.330330826534649 std: 0.4106618580468271 + b: 0.373719283670522 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.act_o: inputs: @@ -1509,6 +1679,7 @@ decoder.att_rnn.rnn.cells.0.act_o: avg_max: 22.38339533752272 mean: -1.0162639051926945 std: 6.201834056672583 + b: 4.915137376678117 shape: (1280, 1024) output: min: 1.5979450257881012e-17 @@ -1517,6 +1688,7 @@ decoder.att_rnn.rnn.cells.0.act_o: avg_max: 0.9999999956133669 mean: 0.4198552611838568 std: 0.4295806405099903 + b: 0.4058276877309498 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.act_g: inputs: @@ -1527,6 +1699,7 @@ decoder.att_rnn.rnn.cells.0.act_g: avg_max: 25.075750009129578 mean: 0.023995243424084333 std: 7.12668924216872 + b: 5.679371305931813 shape: (1280, 1024) output: min: -1.0 @@ -1535,6 +1708,7 @@ decoder.att_rnn.rnn.cells.0.act_g: avg_max: 0.999999995680337 mean: -0.002128829708629359 std: 0.9478563806555594 + b: 0.9298617161056972 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.eltwisemult_cell_forget: inputs: @@ -1545,6 +1719,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_cell_forget: avg_max: 0.9999987296174087 mean: 0.22353376293282823 std: 0.37551881742884263 + b: 0.3097487535536952 
shape: (1280, 1024) 1: min: -45.964332580566406 @@ -1553,6 +1728,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_cell_forget: avg_max: 30.634137733129954 mean: 0.06729932622641525 std: 1.6449935154656605 + b: 0.557591549851251 shape: (1280, 1024) output: min: -45.9624137878418 @@ -1561,6 +1737,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_cell_forget: avg_max: 30.575408138686335 mean: 0.06604976536836245 std: 1.5455946584670446 + b: 0.2958711958458917 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.eltwisemult_cell_input: inputs: @@ -1571,6 +1748,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_cell_input: avg_max: 0.9999995308980515 mean: 0.330330826534649 std: 0.4106618580468271 + b: 0.373719283670522 shape: (1280, 1024) 1: min: -1.0 @@ -1579,6 +1757,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_cell_input: avg_max: 0.999999995680337 mean: -0.002128829708629359 std: 0.9478563806555594 + b: 0.9298617161056972 shape: (1280, 1024) output: min: -1.0 @@ -1587,6 +1766,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_cell_input: avg_max: 0.9999969378950895 mean: 0.0027081930132716804 std: 0.510135969194079 + b: 0.31562757913985934 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.eltwiseadd_cell: inputs: @@ -1597,6 +1777,7 @@ decoder.att_rnn.rnn.cells.0.eltwiseadd_cell: avg_max: 30.575408138686335 mean: 0.06604976536836245 std: 1.5455946584670446 + b: 0.2958711958458917 shape: (1280, 1024) 1: min: -1.0 @@ -1605,6 +1786,7 @@ decoder.att_rnn.rnn.cells.0.eltwiseadd_cell: avg_max: 0.9999969378950895 mean: 0.0027081930132716804 std: 0.510135969194079 + b: 0.31562757913985934 shape: (1280, 1024) output: min: -46.92375183105469 @@ -1613,6 +1795,7 @@ decoder.att_rnn.rnn.cells.0.eltwiseadd_cell: avg_max: 31.416855715031076 mean: 0.06875795865061662 std: 1.676732644640398 + b: 0.5653925294956461 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.act_h: inputs: @@ -1623,6 +1806,7 @@ decoder.att_rnn.rnn.cells.0.act_h: avg_max: 31.416855715031076 mean: 0.06875795865061662 std: 1.676732644640398 + b: 0.5653925294956461 shape: (1280, 1024) output: min: -1.0 @@ -1631,6 +1815,7 @@ decoder.att_rnn.rnn.cells.0.act_h: avg_max: 0.9959949824247473 mean: 0.0061799199947633445 std: 0.47477836896857967 + b: 0.3310784636421155 shape: (1280, 1024) decoder.att_rnn.rnn.cells.0.eltwisemult_hidden: inputs: @@ -1641,6 +1826,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_hidden: avg_max: 0.9999999956133669 mean: 0.4198552611838568 std: 0.4295806405099903 + b: 0.4058276877309498 shape: (1280, 1024) 1: min: -1.0 @@ -1649,6 +1835,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_hidden: avg_max: 0.9959949824247473 mean: 0.0061799199947633445 std: 0.47477836896857967 + b: 0.3310784636421155 shape: (1280, 1024) output: min: -1.0 @@ -1657,6 +1844,7 @@ decoder.att_rnn.rnn.cells.0.eltwisemult_hidden: avg_max: 0.9952872082423628 mean: 0.007966578123805529 std: 0.32187307320175484 + b: 0.16622043455835803 shape: (1280, 1024) decoder.att_rnn.rnn.dropout: output: @@ -1666,6 +1854,7 @@ decoder.att_rnn.rnn.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' decoder.att_rnn.attn.linear_q: inputs: @@ -1676,6 +1865,7 @@ decoder.att_rnn.attn.linear_q: avg_max: 0.9952872082423628 mean: 0.007966578123805529 std: 0.32187307320175484 + b: 0.16622043455835803 shape: (1280, 1, 1024) output: min: -28.324504852294922 @@ -1684,6 +1874,7 @@ decoder.att_rnn.attn.linear_q: avg_max: 16.370824370223502 mean: 0.07139282220419639 std: 5.614923564767698 + b: 4.3945239412650645 shape: (1280, 1, 1024) decoder.att_rnn.attn.linear_k: inputs: @@ -1694,6 +1885,7 @@ 
decoder.att_rnn.attn.linear_k: avg_max: 2.571398892563376 mean: -0.002407128676685295 std: 0.3183030856867703 + b: 0.17229868868428674 shape: (1280, 67, 1024) output: min: -27.933134078979492 @@ -1702,6 +1894,7 @@ decoder.att_rnn.attn.linear_k: avg_max: 21.645756855439604 mean: 0.11102438071386868 std: 4.714025089771699 + b: 3.3774262215314326 shape: (1280, 67, 1024) decoder.att_rnn.attn.dropout: inputs: @@ -1712,6 +1905,7 @@ decoder.att_rnn.attn.dropout: avg_max: 0.39005828052340624 mean: 0.013619308356258466 std: 0.05484342060778131 + b: 0.01973839232938762 shape: (1280, 1, 67) output: min: 0.0 @@ -1720,6 +1914,7 @@ decoder.att_rnn.attn.dropout: avg_max: 0.39005828052340624 mean: 0.013619308356258466 std: 0.05484342060778131 + b: 0.01973839232938762 shape: (1280, 1, 67) decoder.att_rnn.attn.eltwiseadd_qk: inputs: @@ -1730,6 +1925,7 @@ decoder.att_rnn.attn.eltwiseadd_qk: avg_max: 16.370824370223502 mean: 0.07139282138405703 std: 5.614948532126775 + b: 4.394523893313473 shape: (1280, 1, 67, 1024) 1: min: -27.933134078979492 @@ -1738,6 +1934,7 @@ decoder.att_rnn.attn.eltwiseadd_qk: avg_max: 21.645756855439604 mean: 0.11102438071386868 std: 4.714025089771699 + b: 3.3774262215314326 shape: (1280, 1, 67, 1024) output: min: -42.6893310546875 @@ -1746,6 +1943,7 @@ decoder.att_rnn.attn.eltwiseadd_qk: avg_max: 26.724065986376143 mean: 0.18241720259650032 std: 6.144074303575077 + b: 4.8617969220943795 shape: (1280, 1, 67, 1024) decoder.att_rnn.attn.eltwiseadd_norm_bias: inputs: @@ -1756,6 +1954,7 @@ decoder.att_rnn.attn.eltwiseadd_norm_bias: avg_max: 26.724065986376143 mean: 0.18241720259650032 std: 6.144074303575077 + b: 4.8617969220943795 shape: (1280, 1, 67, 1024) 1: min: -1.159269094467163 @@ -1764,6 +1963,7 @@ decoder.att_rnn.attn.eltwiseadd_norm_bias: avg_max: 1.030885934829712 mean: 0.01333538442850113 std: 0.3476333932113675 + b: 0.2746787667274475 shape: (1024) output: min: -42.8883171081543 @@ -1772,6 +1972,7 @@ decoder.att_rnn.attn.eltwiseadd_norm_bias: avg_max: 26.855665168333605 mean: 0.19575258549512087 std: 6.287847745163477 + b: 5.012542623348454 shape: (1280, 1, 67, 1024) decoder.att_rnn.attn.eltwisemul_norm_scaler: inputs: @@ -1782,6 +1983,7 @@ decoder.att_rnn.attn.eltwisemul_norm_scaler: avg_max: 0.09800389409065247 mean: 0.00032652184017933905 std: 0.031248209015367 + b: 0.027989715337753296 shape: (1024) 1: min: 1.2628638744354248 @@ -1790,6 +1992,7 @@ decoder.att_rnn.attn.eltwisemul_norm_scaler: avg_max: 1.2628638744354248 mean: 1.2628638744354248 std: .nan + b: 0.0 shape: (1) output: min: -0.2077881544828415 @@ -1798,6 +2001,7 @@ decoder.att_rnn.attn.eltwisemul_norm_scaler: avg_max: 0.12376558035612106 mean: 0.0004123526159673929 std: 0.03946245597745369 + b: 0.03534720093011856 shape: (1024) decoder.att_rnn.attn.tanh: inputs: @@ -1808,6 +2012,7 @@ decoder.att_rnn.attn.tanh: avg_max: 26.855665168333605 mean: 0.19575258549512087 std: 6.287847745163477 + b: 5.012542623348454 shape: (1280, 1, 67, 1024) output: min: -1.0 @@ -1816,6 +2021,7 @@ decoder.att_rnn.attn.tanh: avg_max: 0.999999995680337 mean: 0.03299323406470721 std: 0.9384829454472874 + b: 0.9170371813385669 shape: (1280, 1, 67, 1024) decoder.att_rnn.attn.matmul_score: inputs: @@ -1826,6 +2032,7 @@ decoder.att_rnn.attn.matmul_score: avg_max: 0.999999995680337 mean: 0.03299323406470721 std: 0.9384829454472874 + b: 0.9170371813385669 shape: (1280, 1, 67, 1024) 1: min: -0.2077881544828415 @@ -1834,6 +2041,7 @@ decoder.att_rnn.attn.matmul_score: avg_max: 0.12376558035612106 mean: 0.0004123526159673929 std: 
0.03946245597745369 + b: 0.03534720093011856 shape: (1024) output: min: -5.075064182281494 @@ -1842,6 +2050,7 @@ decoder.att_rnn.attn.matmul_score: avg_max: 12.197041146406967 mean: 7.974570217477471 std: 2.9091675582996563 + b: 2.208857783708679 shape: (1280, 1, 67) decoder.att_rnn.attn.softmax_att: inputs: @@ -1852,6 +2061,7 @@ decoder.att_rnn.attn.softmax_att: avg_max: 12.163362407550382 mean: -24693.233116322437 std: 31749.93291331495 + b: 30775.105422665023 shape: (1280, 1, 67) output: min: 0.0 @@ -1860,6 +2070,7 @@ decoder.att_rnn.attn.softmax_att: avg_max: 0.39005828052340624 mean: 0.013619308356258466 std: 0.05484342060778131 + b: 0.01973839232938762 shape: (1280, 1, 67) decoder.att_rnn.attn.context_matmul: inputs: @@ -1870,6 +2081,7 @@ decoder.att_rnn.attn.context_matmul: avg_max: 0.39005828052340624 mean: 0.013619308356258466 std: 0.05484342060778131 + b: 0.01973839232938762 shape: (1280, 1, 67) 1: min: -2.9931235313415527 @@ -1878,6 +2090,7 @@ decoder.att_rnn.attn.context_matmul: avg_max: 2.571398892563376 mean: -0.002407128676685295 std: 0.3183030856867703 + b: 0.17229868868428674 shape: (1280, 67, 1024) output: min: -2.748584032058716 @@ -1886,6 +2099,7 @@ decoder.att_rnn.attn.context_matmul: avg_max: 0.9287193868937127 mean: 0.0004324376445557447 std: 0.16600608798290178 + b: 0.09215169849313716 shape: (1280, 1, 1024) decoder.att_rnn.dropout: inputs: @@ -1896,6 +2110,7 @@ decoder.att_rnn.dropout: avg_max: 0.9952872082423628 mean: 0.007966578123805529 std: 0.32187307320175484 + b: 0.16622043455835803 shape: (1280, 1, 1024) output: min: -1.0 @@ -1904,6 +2119,7 @@ decoder.att_rnn.dropout: avg_max: 0.9952872082423628 mean: 0.007966578123805529 std: 0.32187307320175484 + b: 0.16622043455835803 shape: (1280, 1, 1024) decoder.rnn_layers.0.cells.0.fc_gate_x: inputs: @@ -1914,6 +2130,7 @@ decoder.rnn_layers.0.cells.0.fc_gate_x: avg_max: 1.1144092399752523 mean: 0.004199507878490362 std: 0.2561140031003674 + b: 0.12828497891262 shape: (1280, 2048) output: min: -31.11646270751953 @@ -1922,6 +2139,7 @@ decoder.rnn_layers.0.cells.0.fc_gate_x: avg_max: 10.85241473739067 mean: -1.1740948434960983 std: 3.0091560830798434 + b: 2.3389595296945487 shape: (1280, 4096) decoder.rnn_layers.0.cells.0.fc_gate_h: inputs: @@ -1932,6 +2150,7 @@ decoder.rnn_layers.0.cells.0.fc_gate_h: avg_max: 0.7587759274110363 mean: -0.0010008882399605407 std: 0.1321120655297924 + b: 0.05292223636950414 shape: (1280, 1024) output: min: -16.805160522460938 @@ -1940,6 +2159,7 @@ decoder.rnn_layers.0.cells.0.fc_gate_h: avg_max: 5.603913084911495 mean: -0.49282410457060527 std: 1.2749332466602463 + b: 0.9300034851840387 shape: (1280, 4096) decoder.rnn_layers.0.cells.0.eltwiseadd_gate: inputs: @@ -1950,6 +2170,7 @@ decoder.rnn_layers.0.cells.0.eltwiseadd_gate: avg_max: 10.85241473739067 mean: -1.1740948434960983 std: 3.0091560830798434 + b: 2.3389595296945487 shape: (1280, 4096) 1: min: -16.805160522460938 @@ -1958,6 +2179,7 @@ decoder.rnn_layers.0.cells.0.eltwiseadd_gate: avg_max: 5.603913084911495 mean: -0.49282410457060527 std: 1.2749332466602463 + b: 0.9300034851840387 shape: (1280, 4096) output: min: -33.57273864746094 @@ -1966,6 +2188,7 @@ decoder.rnn_layers.0.cells.0.eltwiseadd_gate: avg_max: 12.499767167380655 mean: -1.6669189450446138 std: 3.6505088399213936 + b: 2.8570796875471483 shape: (1280, 4096) decoder.rnn_layers.0.cells.0.act_f: inputs: @@ -1976,6 +2199,7 @@ decoder.rnn_layers.0.cells.0.act_f: avg_max: 11.013900484127936 mean: -2.6897614965278116 std: 4.286025676313951 + b: 3.335799002647403 shape: (1280, 
1024) output: min: 3.4783092882741465e-14 @@ -1984,6 +2208,7 @@ decoder.rnn_layers.0.cells.0.act_f: avg_max: 0.9996802927737837 mean: 0.2552954440455087 std: 0.35829928323219956 + b: 0.30173072499338094 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.act_i: inputs: @@ -1994,6 +2219,7 @@ decoder.rnn_layers.0.cells.0.act_i: avg_max: 9.380031768927397 mean: -2.4518663697698173 std: 3.0217416033646693 + b: 2.323332796873674 shape: (1280, 1024) output: min: 1.005313792824293e-13 @@ -2002,6 +2228,7 @@ decoder.rnn_layers.0.cells.0.act_i: avg_max: 0.9996398801214232 mean: 0.22650649123144959 std: 0.3026139789025641 + b: 0.2427336303287007 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.act_o: inputs: @@ -2012,6 +2239,7 @@ decoder.rnn_layers.0.cells.0.act_o: avg_max: 8.524241487095862 mean: -1.5604653776026836 std: 3.2415928306767676 + b: 2.538264309288409 shape: (1280, 1024) output: min: 1.3685174955063717e-12 @@ -2020,6 +2248,7 @@ decoder.rnn_layers.0.cells.0.act_o: avg_max: 0.9978680859455905 mean: 0.3295869104145614 std: 0.34560961994352346 + b: 0.30255046387234413 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.act_g: inputs: @@ -2030,6 +2259,7 @@ decoder.rnn_layers.0.cells.0.act_g: avg_max: 11.46917405824984 mean: 0.03441744941392035 std: 3.2752884250043954 + b: 2.518169113558331 shape: (1280, 1024) output: min: -1.0 @@ -2038,6 +2268,7 @@ decoder.rnn_layers.0.cells.0.act_g: avg_max: 0.9999999514456561 mean: 0.009212502904665593 std: 0.8613827573443271 + b: 0.8191404633977437 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: inputs: @@ -2048,6 +2279,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: avg_max: 0.9996802927737837 mean: 0.2552954440455087 std: 0.35829928323219956 + b: 0.30173072499338094 shape: (1280, 1024) 1: min: -50.77605438232422 @@ -2056,6 +2288,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: avg_max: 11.783769438574792 mean: -0.013233005671694572 std: 0.9262749365617192 + b: 0.3140615258407732 shape: (1280, 1024) output: min: -50.32355880737305 @@ -2064,6 +2297,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_cell_forget: avg_max: 11.075782726721808 mean: -0.010043481503626429 std: 0.8084092966416248 + b: 0.13408823390229707 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.eltwisemult_cell_input: inputs: @@ -2074,6 +2308,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_cell_input: avg_max: 0.9996398801214232 mean: 0.22650649123144959 std: 0.3026139789025641 + b: 0.2427336303287007 shape: (1280, 1024) 1: min: -1.0 @@ -2082,6 +2317,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_cell_input: avg_max: 0.9999999514456561 mean: 0.009212502904665593 std: 0.8613827573443271 + b: 0.8191404633977437 shape: (1280, 1024) output: min: -1.0 @@ -2090,6 +2326,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_cell_input: avg_max: 0.9981463519374972 mean: -0.0036360137033707263 std: 0.3448696757960219 + b: 0.19587422205825833 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.eltwiseadd_cell: inputs: @@ -2100,6 +2337,7 @@ decoder.rnn_layers.0.cells.0.eltwiseadd_cell: avg_max: 11.075782726721808 mean: -0.010043481503626429 std: 0.8084092966416248 + b: 0.13408823390229707 shape: (1280, 1024) 1: min: -1.0 @@ -2108,6 +2346,7 @@ decoder.rnn_layers.0.cells.0.eltwiseadd_cell: avg_max: 0.9981463519374972 mean: -0.0036360137033707263 std: 0.3448696757960219 + b: 0.19587422205825833 shape: (1280, 1024) output: min: -51.32215881347656 @@ -2116,6 +2355,7 @@ decoder.rnn_layers.0.cells.0.eltwiseadd_cell: avg_max: 11.741759218124862 mean: -0.013679495197402003 std: 
0.9252576031128981 + b: 0.31603610831197754 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.act_h: inputs: @@ -2126,6 +2366,7 @@ decoder.rnn_layers.0.cells.0.act_h: avg_max: 11.741759218124862 mean: -0.013679495197402003 std: 0.9252576031128981 + b: 0.31603610831197754 shape: (1280, 1024) output: min: -1.0 @@ -2134,6 +2375,7 @@ decoder.rnn_layers.0.cells.0.act_h: avg_max: 0.9928096467189585 mean: -0.005522266373139992 std: 0.3470194696164207 + b: 0.2231224861121579 shape: (1280, 1024) decoder.rnn_layers.0.cells.0.eltwisemult_hidden: inputs: @@ -2144,6 +2386,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_hidden: avg_max: 0.9978680859455905 mean: 0.3295869104145614 std: 0.34560961994352346 + b: 0.30255046387234413 shape: (1280, 1024) 1: min: -1.0 @@ -2152,6 +2395,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_hidden: avg_max: 0.9928096467189585 mean: -0.005522266373139992 std: 0.3470194696164207 + b: 0.2231224861121579 shape: (1280, 1024) output: min: -0.9999914765357971 @@ -2160,6 +2404,7 @@ decoder.rnn_layers.0.cells.0.eltwisemult_hidden: avg_max: 0.7594633606377601 mean: -0.0010860526439966734 std: 0.13179261237432402 + b: 0.05422022580535392 shape: (1280, 1024) decoder.rnn_layers.0.dropout: output: @@ -2169,6 +2414,7 @@ decoder.rnn_layers.0.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' decoder.rnn_layers.1.cells.0.fc_gate_x: inputs: @@ -2179,6 +2425,7 @@ decoder.rnn_layers.1.cells.0.fc_gate_x: avg_max: 0.996185907755004 mean: -0.00032680749604648707 std: 0.14988080842341334 + b: 0.07311449136082718 shape: (1280, 2048) output: min: -22.17432403564453 @@ -2187,6 +2434,7 @@ decoder.rnn_layers.1.cells.0.fc_gate_x: avg_max: 7.5104381103194156 mean: -0.8090552802835963 std: 2.096109569488443 + b: 1.5889325456338008 shape: (1280, 4096) decoder.rnn_layers.1.cells.0.fc_gate_h: inputs: @@ -2197,6 +2445,7 @@ decoder.rnn_layers.1.cells.0.fc_gate_h: avg_max: 0.7355864281399854 mean: 0.00030723568868500604 std: 0.13119853605881615 + b: 0.06826835323013587 shape: (1280, 1024) output: min: -10.816532135009766 @@ -2205,6 +2454,7 @@ decoder.rnn_layers.1.cells.0.fc_gate_h: avg_max: 4.048953114064886 mean: -0.47011378199029547 std: 0.9846700195969109 + b: 0.7533953878651847 shape: (1280, 4096) decoder.rnn_layers.1.cells.0.eltwiseadd_gate: inputs: @@ -2215,6 +2465,7 @@ decoder.rnn_layers.1.cells.0.eltwiseadd_gate: avg_max: 7.5104381103194156 mean: -0.8090552802835963 std: 2.096109569488443 + b: 1.5889325456338008 shape: (1280, 4096) 1: min: -10.816532135009766 @@ -2223,6 +2474,7 @@ decoder.rnn_layers.1.cells.0.eltwiseadd_gate: avg_max: 4.048953114064886 mean: -0.47011378199029547 std: 0.9846700195969109 + b: 0.7533953878651847 shape: (1280, 4096) output: min: -24.22309684753418 @@ -2231,6 +2483,7 @@ decoder.rnn_layers.1.cells.0.eltwiseadd_gate: avg_max: 8.619773781969311 mean: -1.2791690607754058 std: 2.5821871111713826 + b: 2.007004778170852 shape: (1280, 4096) decoder.rnn_layers.1.cells.0.act_f: inputs: @@ -2241,6 +2494,7 @@ decoder.rnn_layers.1.cells.0.act_f: avg_max: 6.821592486440458 mean: -2.6714931534247466 std: 2.5443121998813525 + b: 1.9007043993205144 shape: (1280, 1024) output: min: 3.020248634522105e-11 @@ -2249,6 +2503,7 @@ decoder.rnn_layers.1.cells.0.act_f: avg_max: 0.9935724002256816 mean: 0.1752624285941042 std: 0.25292423787376883 + b: 0.18595207231014632 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.act_i: inputs: @@ -2259,6 +2514,7 @@ decoder.rnn_layers.1.cells.0.act_i: avg_max: 6.599652247616419 mean: -1.3425940824358642 std: 2.214069210841872 + b: 1.694840380080627 shape: (1280, 
1024) output: min: 1.6614134512593637e-09 @@ -2267,6 +2523,7 @@ decoder.rnn_layers.1.cells.0.act_i: avg_max: 0.994309761283102 mean: 0.3069548572800805 std: 0.29030061738546137 + b: 0.24375973142097485 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.act_o: inputs: @@ -2277,6 +2534,7 @@ decoder.rnn_layers.1.cells.0.act_o: avg_max: 7.602936600299369 mean: -1.092906504534604 std: 2.4894375831577364 + b: 1.934165755215657 shape: (1280, 1024) output: min: 1.0446175036094019e-09 @@ -2285,6 +2543,7 @@ decoder.rnn_layers.1.cells.0.act_o: avg_max: 0.9977307225211285 mean: 0.3514365402500282 std: 0.32005181986355885 + b: 0.2781929248169561 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.act_g: inputs: @@ -2295,6 +2554,7 @@ decoder.rnn_layers.1.cells.0.act_g: avg_max: 7.799964112378239 mean: -0.009682508857590904 std: 2.3471144877832884 + b: 1.7941640214303936 shape: (1280, 1024) output: min: -1.0 @@ -2303,6 +2563,7 @@ decoder.rnn_layers.1.cells.0.act_g: avg_max: 0.9999866421637916 mean: -0.007454321779905927 std: 0.8034238800947899 + b: 0.7480113414231311 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: inputs: @@ -2313,6 +2574,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: avg_max: 0.9935724002256816 mean: 0.1752624285941042 std: 0.25292423787376883 + b: 0.18595207231014632 shape: (1280, 1024) 1: min: -50.93753433227539 @@ -2321,6 +2583,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: avg_max: 2.858599665124761 mean: -0.008311334580423152 std: 0.4794455076286656 + b: 0.28920617523822856 shape: (1280, 1024) output: min: -50.889991760253906 @@ -2329,6 +2592,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_cell_forget: avg_max: 2.3347478714216994 mean: -0.0015736418809647336 std: 0.25571878953430177 + b: 0.0655673016457859 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.eltwisemult_cell_input: inputs: @@ -2339,6 +2603,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_cell_input: avg_max: 0.994309761283102 mean: 0.3069548572800805 std: 0.29030061738546137 + b: 0.24375973142097485 shape: (1280, 1024) 1: min: -1.0 @@ -2347,6 +2612,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_cell_input: avg_max: 0.9999866421637916 mean: -0.007454321779905927 std: 0.8034238800947899 + b: 0.7480113414231311 shape: (1280, 1024) output: min: -1.0 @@ -2355,6 +2621,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_cell_input: avg_max: 0.9891012119108362 mean: -0.006563609112533941 std: 0.3551230793477708 + b: 0.23654065306267028 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.eltwiseadd_cell: inputs: @@ -2365,6 +2632,7 @@ decoder.rnn_layers.1.cells.0.eltwiseadd_cell: avg_max: 2.3347478714216994 mean: -0.0015736418809647336 std: 0.25571878953430177 + b: 0.0655673016457859 shape: (1280, 1024) 1: min: -1.0 @@ -2373,6 +2641,7 @@ decoder.rnn_layers.1.cells.0.eltwiseadd_cell: avg_max: 0.9891012119108362 mean: -0.006563609112533941 std: 0.3551230793477708 + b: 0.23654065306267028 shape: (1280, 1024) output: min: -51.780860900878906 @@ -2381,6 +2650,7 @@ decoder.rnn_layers.1.cells.0.eltwiseadd_cell: avg_max: 2.8984877801343285 mean: -0.008137250974908283 std: 0.4813941324820594 + b: 0.29027094568094497 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.act_h: inputs: @@ -2391,6 +2661,7 @@ decoder.rnn_layers.1.cells.0.act_h: avg_max: 2.8984877801343285 mean: -0.008137250974908283 std: 0.4813941324820594 + b: 0.29027094568094497 shape: (1280, 1024) output: min: -1.0 @@ -2399,6 +2670,7 @@ decoder.rnn_layers.1.cells.0.act_h: avg_max: 0.9670456421509231 mean: -0.0057490570247533766 std: 
0.345665687214933 + b: 0.2453265073677798 shape: (1280, 1024) decoder.rnn_layers.1.cells.0.eltwisemult_hidden: inputs: @@ -2409,6 +2681,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_hidden: avg_max: 0.9977307225211285 mean: 0.3514365402500282 std: 0.32005181986355885 + b: 0.2781929248169561 shape: (1280, 1024) 1: min: -1.0 @@ -2417,6 +2690,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_hidden: avg_max: 0.9670456421509231 mean: -0.0057490570247533766 std: 0.345665687214933 + b: 0.2453265073677798 shape: (1280, 1024) output: min: -0.9998709559440613 @@ -2425,6 +2699,7 @@ decoder.rnn_layers.1.cells.0.eltwisemult_hidden: avg_max: 0.743544528524527 mean: 0.00032440792050908126 std: 0.1314242965426167 + b: 0.069381494656875 shape: (1280, 1024) decoder.rnn_layers.1.dropout: output: @@ -2434,6 +2709,7 @@ decoder.rnn_layers.1.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' decoder.rnn_layers.2.cells.0.fc_gate_x: inputs: @@ -2444,6 +2720,7 @@ decoder.rnn_layers.2.cells.0.fc_gate_x: avg_max: 1.1189356212870456 mean: -0.00016460354038312733 std: 0.1760178086847606 + b: 0.09911453349494868 shape: (1280, 2048) output: min: -24.67524528503418 @@ -2452,6 +2729,7 @@ decoder.rnn_layers.2.cells.0.fc_gate_x: avg_max: 8.328183968147563 mean: -0.8204240057408129 std: 2.453284969094989 + b: 1.8982749567942672 shape: (1280, 4096) decoder.rnn_layers.2.cells.0.fc_gate_h: inputs: @@ -2462,6 +2740,7 @@ decoder.rnn_layers.2.cells.0.fc_gate_h: avg_max: 0.8058140833056374 mean: -0.0030742165989599258 std: 0.16821435780985747 + b: 0.08427371087201521 shape: (1280, 1024) output: min: -18.816926956176758 @@ -2470,6 +2749,7 @@ decoder.rnn_layers.2.cells.0.fc_gate_h: avg_max: 5.89386603040307 mean: -0.7366783961109576 std: 1.5059158176081058 + b: 1.1217902419607277 shape: (1280, 4096) decoder.rnn_layers.2.cells.0.eltwiseadd_gate: inputs: @@ -2480,6 +2760,7 @@ decoder.rnn_layers.2.cells.0.eltwiseadd_gate: avg_max: 8.328183968147563 mean: -0.8204240057408129 std: 2.453284969094989 + b: 1.8982749567942672 shape: (1280, 4096) 1: min: -18.816926956176758 @@ -2488,6 +2769,7 @@ decoder.rnn_layers.2.cells.0.eltwiseadd_gate: avg_max: 5.89386603040307 mean: -0.7366783961109576 std: 1.5059158176081058 + b: 1.1217902419607277 shape: (1280, 4096) output: min: -29.283180236816406 @@ -2496,6 +2778,7 @@ decoder.rnn_layers.2.cells.0.eltwiseadd_gate: avg_max: 10.381155111280712 mean: -1.557102406895562 std: 3.2988025400040257 + b: 2.5843769489379405 shape: (1280, 4096) decoder.rnn_layers.2.cells.0.act_f: inputs: @@ -2506,6 +2789,7 @@ decoder.rnn_layers.2.cells.0.act_f: avg_max: 7.526722565527717 mean: -3.6089404096094384 std: 3.0091061671078934 + b: 2.2975884913058753 shape: (1280, 1024) output: min: 1.9163568843773293e-13 @@ -2514,6 +2798,7 @@ decoder.rnn_layers.2.cells.0.act_f: avg_max: 0.9962542612231184 mean: 0.13819005828811198 std: 0.24204215179724617 + b: 0.16834504561645278 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.act_i: inputs: @@ -2524,6 +2809,7 @@ decoder.rnn_layers.2.cells.0.act_i: avg_max: 7.826490100581997 mean: -0.91120249788078 std: 2.6741725568824375 + b: 2.0841160098488456 shape: (1280, 1024) output: min: 3.0161959735375277e-11 @@ -2532,6 +2818,7 @@ decoder.rnn_layers.2.cells.0.act_i: avg_max: 0.9983030597815357 mean: 0.38171999553281244 std: 0.3367532650457234 + b: 0.29790418672092767 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.act_o: inputs: @@ -2542,6 +2829,7 @@ decoder.rnn_layers.2.cells.0.act_o: avg_max: 8.086509210607966 mean: -1.6018465338798051 std: 2.822778696064494 + b: 2.2109367526649093 shape: 
(1280, 1024) output: min: 7.879931950005581e-12 @@ -2550,6 +2838,7 @@ decoder.rnn_layers.2.cells.0.act_o: avg_max: 0.9988285692890035 mean: 0.30748807730969463 std: 0.3263006812590941 + b: 0.280720940274134 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.act_g: inputs: @@ -2560,6 +2849,7 @@ decoder.rnn_layers.2.cells.0.act_g: avg_max: 10.15937283199823 mean: -0.10642016830597752 std: 3.5525409631159355 + b: 2.81734556696388 shape: (1280, 1024) output: min: -1.0 @@ -2568,6 +2858,7 @@ decoder.rnn_layers.2.cells.0.act_g: avg_max: 0.999999153747988 mean: -0.0269237342450161 std: 0.879016886211502 + b: 0.8402174213294229 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: inputs: @@ -2578,6 +2869,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: avg_max: 0.9962542612231184 mean: 0.13819005828811198 std: 0.24204215179724617 + b: 0.16834504561645278 shape: (1280, 1024) 1: min: -50.77392578125 @@ -2586,6 +2878,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: avg_max: 3.778732706154328 mean: -0.010375739289892385 std: 0.6429531313365376 + b: 0.39789132222146045 shape: (1280, 1024) output: min: -49.94257354736328 @@ -2594,6 +2887,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_cell_forget: avg_max: 3.1268611868780627 mean: -0.00047371524729856 std: 0.358745687372424 + b: 0.07846207401772116 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.eltwisemult_cell_input: inputs: @@ -2604,6 +2898,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_cell_input: avg_max: 0.9983030597815357 mean: 0.38171999553281244 std: 0.3367532650457234 + b: 0.29790418672092767 shape: (1280, 1024) 1: min: -1.0 @@ -2612,6 +2907,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_cell_input: avg_max: 0.999999153747988 mean: -0.0269237342450161 std: 0.879016886211502 + b: 0.8402174213294229 shape: (1280, 1024) output: min: -1.0 @@ -2620,6 +2916,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_cell_input: avg_max: 0.9960211656066814 mean: -0.01004450515169632 std: 0.4632697182775279 + b: 0.3311674565244253 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.eltwiseadd_cell: inputs: @@ -2630,6 +2927,7 @@ decoder.rnn_layers.2.cells.0.eltwiseadd_cell: avg_max: 3.1268611868780627 mean: -0.00047371524729856 std: 0.358745687372424 + b: 0.07846207401772116 shape: (1280, 1024) 1: min: -1.0 @@ -2638,6 +2936,7 @@ decoder.rnn_layers.2.cells.0.eltwiseadd_cell: avg_max: 0.9960211656066814 mean: -0.01004450515169632 std: 0.4632697182775279 + b: 0.3311674565244253 shape: (1280, 1024) output: min: -50.93840026855469 @@ -2646,6 +2945,7 @@ decoder.rnn_layers.2.cells.0.eltwiseadd_cell: avg_max: 3.8555672365293057 mean: -0.010518220426035512 std: 0.6487821329571182 + b: 0.4016652262947534 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.act_h: inputs: @@ -2656,6 +2956,7 @@ decoder.rnn_layers.2.cells.0.act_h: avg_max: 3.8555672365293057 mean: -0.010518220426035512 std: 0.6487821329571182 + b: 0.4016652262947534 shape: (1280, 1024) output: min: -1.0 @@ -2664,6 +2965,7 @@ decoder.rnn_layers.2.cells.0.act_h: avg_max: 0.9829632449016149 mean: -0.008678492148203425 std: 0.4269098495511477 + b: 0.32061298486222056 shape: (1280, 1024) decoder.rnn_layers.2.cells.0.eltwisemult_hidden: inputs: @@ -2674,6 +2976,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_hidden: avg_max: 0.9988285692890035 mean: 0.30748807730969463 std: 0.3263006812590941 + b: 0.280720940274134 shape: (1280, 1024) 1: min: -1.0 @@ -2682,6 +2985,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_hidden: avg_max: 0.9829632449016149 mean: -0.008678492148203425 std: 
0.4269098495511477 + b: 0.32061298486222056 shape: (1280, 1024) output: min: -0.9999999403953552 @@ -2690,6 +2994,7 @@ decoder.rnn_layers.2.cells.0.eltwisemult_hidden: avg_max: 0.8186234360665419 mean: -0.0031441434696200784 std: 0.1694763855118817 + b: 0.08618559386330027 shape: (1280, 1024) decoder.rnn_layers.2.dropout: output: @@ -2699,6 +3004,7 @@ decoder.rnn_layers.2.dropout: avg_max: 0 mean: 0 std: 0 + b: 0 shape: '' decoder.classifier.classifier: inputs: @@ -2709,6 +3015,7 @@ decoder.classifier.classifier: avg_max: 1.2828886617770356 mean: -0.003905788197230935 std: 0.2593071699498008 + b: 0.1599949207617327 shape: (1280, 1, 1024) output: min: -19.020658493041992 @@ -2717,6 +3024,7 @@ decoder.classifier.classifier: avg_max: 8.032180467616302 mean: -3.873352058817827 std: 1.696852165054472 + b: 1.3471277234594476 shape: (1280, 1, 32317) decoder.dropout: inputs: @@ -2727,6 +3035,7 @@ decoder.dropout: avg_max: 0.9094174789020626 mean: 0.002039626916454153 std: 0.22761719307769354 + b: 0.10798135079575982 shape: (1280, 1, 1024) output: min: -1.9940483570098877 @@ -2735,6 +3044,7 @@ decoder.dropout: avg_max: 0.9094174789020626 mean: 0.002039626916454153 std: 0.22761719307769354 + b: 0.10798135079575982 shape: (1280, 1, 1024) decoder.eltwiseadd_residuals.0: inputs: @@ -2745,6 +3055,7 @@ decoder.eltwiseadd_residuals.0: avg_max: 0.743544528524527 mean: 0.00032440792050908126 std: 0.1314242965426167 + b: 0.069381494656875 shape: (1280, 1, 1024) 1: min: -0.9999914765357971 @@ -2753,6 +3064,7 @@ decoder.eltwiseadd_residuals.0: avg_max: 0.7594633606377601 mean: -0.0010860526439966734 std: 0.13179261237432402 + b: 0.05422022580535392 shape: (1280, 1, 1024) output: min: -1.9940483570098877 @@ -2761,6 +3073,7 @@ decoder.eltwiseadd_residuals.0: avg_max: 0.9735018678260661 mean: -0.0007616447304464125 std: 0.1854876875589218 + b: 0.10607970475815659 shape: (1280, 1, 1024) decoder.eltwiseadd_residuals.1: inputs: @@ -2771,6 +3084,7 @@ decoder.eltwiseadd_residuals.1: avg_max: 0.8186234360665419 mean: -0.0031441434696200784 std: 0.1694763855118817 + b: 0.08618559386330027 shape: (1280, 1, 1024) 1: min: -1.9940483570098877 @@ -2779,6 +3093,7 @@ decoder.eltwiseadd_residuals.1: avg_max: 0.9735018678260661 mean: -0.0007616447304464125 std: 0.1854876875589218 + b: 0.10607970475815659 shape: (1280, 1, 1024) output: min: -2.9619157314300537 @@ -2787,6 +3102,7 @@ decoder.eltwiseadd_residuals.1: avg_max: 1.2828886617770356 mean: -0.003905788197230935 std: 0.2593071699498008 + b: 0.1599949207617327 shape: (1280, 1, 1024) decoder.attention_concats.0: inputs: @@ -2797,6 +3113,7 @@ decoder.attention_concats.0: avg_max: 0.9952872082423628 mean: 0.007966578123805529 std: 0.32187307320175484 + b: 0.16622043455835803 shape: (1280, 1, 1024) 1: min: -2.748584032058716 @@ -2805,6 +3122,7 @@ decoder.attention_concats.0: avg_max: 0.9287193868937127 mean: 0.0004324376445557447 std: 0.16600608798290178 + b: 0.09215169849313716 shape: (1280, 1, 1024) output: min: -2.748584032058716 @@ -2813,6 +3131,7 @@ decoder.attention_concats.0: avg_max: 1.1144092399752523 mean: 0.004199507878490362 std: 0.2561140031003674 + b: 0.12828497891262 shape: (1280, 1, 2048) decoder.attention_concats.1: inputs: @@ -2823,6 +3142,7 @@ decoder.attention_concats.1: avg_max: 0.7594633606377601 mean: -0.0010860526439966734 std: 0.13179261237432402 + b: 0.05422022580535392 shape: (1280, 1, 1024) 1: min: -2.748584032058716 @@ -2831,6 +3151,7 @@ decoder.attention_concats.1: avg_max: 0.9287193868937127 mean: 0.0004324376445557447 std: 0.16600608798290178 + b: 
0.09215169849313716 shape: (1280, 1, 1024) output: min: -2.748584032058716 @@ -2839,6 +3160,7 @@ decoder.attention_concats.1: avg_max: 0.996185907755004 mean: -0.00032680749604648707 std: 0.14988080842341334 + b: 0.07311449136082718 shape: (1280, 1, 2048) decoder.attention_concats.2: inputs: @@ -2849,6 +3171,7 @@ decoder.attention_concats.2: avg_max: 0.9735018678260661 mean: -0.0007616447304464125 std: 0.1854876875589218 + b: 0.10607970475815659 shape: (1280, 1, 1024) 1: min: -2.748584032058716 @@ -2857,6 +3180,7 @@ decoder.attention_concats.2: avg_max: 0.9287193868937127 mean: 0.0004324376445557447 std: 0.16600608798290178 + b: 0.09215169849313716 shape: (1280, 1, 1024) output: min: -2.748584032058716 @@ -2865,4 +3189,5 @@ decoder.attention_concats.2: avg_max: 1.1189356212870456 mean: -0.00016460354038312733 std: 0.1760178086847606 + b: 0.09911453349494868 shape: (1280, 1, 2048) diff --git a/examples/GNMT/quantize_gnmt.ipynb b/examples/GNMT/quantize_gnmt.ipynb index a2904d8..315ee1e 100644 --- a/examples/GNMT/quantize_gnmt.ipynb +++ b/examples/GNMT/quantize_gnmt.ipynb @@ -125,7 +125,11 @@ "from itertools import takewhile\n", "from tqdm import tqdm\n", "import logging\n", - "logging.disable(logging.INFO) # Disables mlperf output" + "logging.disable(logging.INFO) # Disables mlperf output\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(action='default', module='distiller.quantization')\n", + "warnings.filterwarnings(action='default', module='distiller.quantization.range_linear')" ] }, { @@ -293,13 +297,19 @@ "metadata": {}, "outputs": [], "source": [ - "def evaluate(model, test_path):\n", + "def evaluate(model, test_path, num_batches=None):\n", " test_file = open(test_path, 'w', encoding='UTF-8')\n", " model.eval()\n", " translator = get_translator(model)\n", " stats = {}\n", - " iterations = enumerate(tqdm(get_loader()))\n", - " for i, (src, tgt, indices) in iterations:\n", + " loader = get_loader()\n", + " total_batches = len(loader)\n", + " if num_batches is None:\n", + " num_batches = total_batches\n", + " num_batches = min(num_batches, total_batches)\n", + " loader = iter(loader)\n", + " for i in tqdm(range(num_batches)):\n", + " src, tgt, indices = next(loader)\n", " src, src_length = src\n", " if translator.batch_first:\n", " batch_size = src.size(0)\n", @@ -341,7 +351,11 @@ " test_file.write('\\n')\n", " total_tokens = stats['total_dec_len'] + stats['total_enc_len']\n", " test_file.close()\n", + " if num_batches < total_batches:\n", + " print(\"Can't calculate BLEU when evaluating partial dataset\")\n", + " return\n", " # run moses detokenizer\n", + " print(\"Calculating BLEU score...\")\n", " detok_path = os.path.join(dataset_dir, config.DETOKENIZER)\n", " detok_test_path = test_path + '.detok'\n", "\n", @@ -368,13 +382,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 24/24 [00:54<00:00, 2.03s/it]\n" + "100%|██████████| 24/24 [00:48<00:00, 1.77s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Calculating BLEU score...\n", "BLEU on test dataset: 22.16\n" ] } @@ -409,13 +424,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 24/24 [02:21<00:00, 5.05s/it]\n" + "100%|██████████| 24/24 [01:54<00:00, 4.02s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Calculating BLEU score...\n", "BLEU on test dataset: 22.16\n" ] } @@ -449,30 +465,60 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 24/24 [58:48<00:00, 130.39s/it]\n" + "100%|██████████| 24/24 
[46:47<00:00, 102.09s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculating BLEU score...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/24 [00:00<?, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BLEU on test dataset: 22.16\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 24/24 [12:23<00:00, 26.94s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Calculating BLEU score...\n", "BLEU on test dataset: 22.16\n" ] } ], "source": [ "import os\n", - "from distiller.data_loggers import QuantCalibrationStatsCollector, collector_context\n", + "from distiller.data_loggers import collect_quant_stats\n", "\n", - "stats_file = './model_stats.yaml'\n", + "stats_file = './acts_quantization_stats.yaml'\n", "\n", "if not os.path.isfile(stats_file): # Collect stats.\n", " model_copy = deepcopy(model)\n", " distiller.utils.assign_layer_fq_names(model_copy)\n", - " collector = QuantCalibrationStatsCollector(model_copy)\n", - " with collector_context(collector):\n", - " val_loss = evaluate(model_copy, output + '.temp')\n", - " collector.save(stats_file)\n", + " \n", + " def eval_for_stats(model):\n", + " evaluate(model, output + '.temp', num_batches=None)\n", + " collect_quant_stats(model_copy, eval_for_stats, save_dir='.')\n", " del model_copy\n", " torch.cuda.empty_cache()" ] @@ -502,14 +548,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Replacing 'Matmul' modules using 'replace_non_param_layer' function\n", - "Replacing 'EltwiseMult' modules using 'replace_non_param_layer' function\n", - "Replacing 'EltwiseAdd' modules using 'replace_non_param_layer' function\n", - "Replacing 'BatchMatmul' modules using 'replace_non_param_layer' function\n", + "Replacing 'Conv2d' modules using 'replace_param_layer' function\n", + "Replacing 'Conv3d' modules using 'replace_param_layer' function\n", "Replacing 'Linear' modules using 'replace_param_layer' function\n", - "Replacing 'Embedding' modules using 'replace_embedding' function\n", "Replacing 'Concat' modules using 'replace_non_param_layer' function\n", - "Replacing 'Conv2d' modules using 'replace_param_layer' function\n" + "Replacing 'EltwiseAdd' modules using 'replace_non_param_layer' function\n", + "Replacing 'EltwiseMult' modules using 'replace_non_param_layer' function\n", + "Replacing 'Matmul' modules using 'replace_non_param_layer' function\n", + "Replacing 'BatchMatmul' modules using 'replace_non_param_layer' function\n", + "Replacing 'Embedding' modules using 'replace_embedding' function\n" ] } ], @@ -520,7 +567,8 @@ " model_activation_stats=stats_file)\n", "# We take a look at the replacement factory:\n", "for t, rf in quantizer.replacement_factory.items():\n", - " print(\"Replacing '{}' modules using '{}' function\".format(t.__name__, rf.__name__))" + " if rf is not None:\n", + " print(\"Replacing '{}' modules using '{}' function\".format(t.__name__, rf.__name__))" ] }, { @@ -547,11 +595,11 @@ "output_type": "stream", "text": [ "WARNING: Logging before flag parsing goes to stderr.\n", - "W0807 15:01:57.225715 140098610173696 range_linear.py:1063] /home/cvds_lab/lev_z/distiller/distiller/quantization/range_linear.py:1063: UserWarning: Model contains a bidirectional DistillerLSTM module. 
Automatic BN folding and statistics optimization based on tracing is not yet supported for models containing such modules.\n", + "W1028 15:38:47.074526 140475898566400 range_linear.py:1275] /data2/users/gjacob/work/distiller/distiller/quantization/range_linear.py:1275: UserWarning: Model contains a bidirectional DistillerLSTM module. Automatic BN folding and statistics optimization based on tracing is not yet supported for models containing such modules.\n", "Will perform specific optimization for the DistillerLSTM modules, but any other potential opportunities for optimization in the model will be ignored.\n", " 'opportunities for optimization in the model will be ignored.', UserWarning)\n", "\n", - "W0807 15:01:57.301426 140098610173696 quantizer.py:270] /home/cvds_lab/lev_z/distiller/distiller/quantization/quantizer.py:270: UserWarning: Module 'decoder.embedder' references to same module as 'encoder.embedder'. Replacing with reference the same wrapper.\n", + "W1028 15:38:47.385626 140475898566400 quantizer.py:287] /data2/users/gjacob/work/distiller/distiller/quantization/quantizer.py:287: UserWarning: Module 'decoder.embedder' references to same module as 'encoder.embedder'. Replacing with reference the same wrapper.\n", " UserWarning)\n", "\n" ] @@ -573,19 +621,23 @@ " )\n", " (eltwiseadd_residuals): ModuleList(\n", " (0): RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.00788879394531, in_0_zero_point=0.0\n", - " in_1_scale=127.00006103515625, in_1_zero_point=0.0\n", - " out_scale=63.530452728271484, out_zero_point=0.0\n", + " output_scale=63.530453, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", " )\n", " (1): RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.00004577636719, in_0_zero_point=0.0\n", - " in_1_scale=63.530452728271484, in_1_zero_point=0.0\n", - " out_scale=42.43059158325195, out_zero_point=0.0\n", + " output_scale=42.430592, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", " )\n", " )\n", @@ -595,62 +647,96 @@ " (rnn): DistillerLSTM(1024, 1024, num_layers=1, dropout=0.00, bidirectional=False)\n", " (attn): BahdanauAttention(\n", " (linear_q): RangeLinearQuantParamLayerWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; 
quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=53.0649, w_zero_point=0.0000\n", - " in_scale=127.0000, in_zero_point=0.0000\n", - " out_scale=4.2200, out_zero_point=0.0000\n", + " output_scale=4.219996, output_zero_point=0.000000\n", + " weights_scale=53.064903, weights_zero_point=0.000000\n", " (wrapped_module): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (linear_k): RangeLinearQuantParamLayerWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=40.9795, w_zero_point=0.0000\n", - " in_scale=42.4306, in_zero_point=0.0000\n", - " out_scale=4.5466, out_zero_point=0.0000\n", + " output_scale=4.546572, output_zero_point=0.000000\n", + " weights_scale=40.979481, weights_zero_point=0.000000\n", " (wrapped_module): Linear(in_features=1024, out_features=1024, bias=False)\n", " )\n", " (dropout): Dropout(p=0)\n", " (eltwiseadd_qk): RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=4.219996452331543, in_0_zero_point=0.0\n", - " in_1_scale=4.546571731567383, in_1_zero_point=0.0\n", - " out_scale=2.974982261657715, out_zero_point=0.0\n", + " output_scale=2.974982, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", " )\n", " (eltwiseadd_norm_bias): RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " 
requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=2.974982261657715, in_0_zero_point=0.0\n", - " in_1_scale=109.55178833007812, in_1_zero_point=0.0\n", - " out_scale=2.961179256439209, out_zero_point=0.0\n", + " output_scale=2.961179, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", " )\n", " (eltwisemul_norm_scaler): RangeLinearQuantEltwiseMultWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=771.8616943359375, in_0_zero_point=0.0\n", - " in_1_scale=100.56507873535156, in_1_zero_point=0.0\n", - " out_scale=611.1994018554688, out_zero_point=0.0\n", + " output_scale=611.199402, output_zero_point=0.000000\n", " (wrapped_module): EltwiseMult()\n", " )\n", - " (tanh): Tanh()\n", + " (tanh): RangeLinearFakeQuantWrapper(\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=False\n", + " scale_approx_mult_bits=None\n", + " preset_activation_stats=True\n", + " output_scale=127.000000, output_zero_point=0.000000\n", + " (wrapped_module): Tanh()\n", + " )\n", " (matmul_score): RangeLinearQuantMatmulWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.0000, in_0_zero_point=0.0000\n", - " in_1_scale=611.1994, in_1_zero_point=0.0000\n", - " out_scale=6.3274, out_zero_point=0.0000\n", + " output_scale=6.327397, output_zero_point=0.000000\n", " (wrapped_module): Matmul()\n", " )\n", - " (softmax_att): Softmax()\n", + " (softmax_att): RangeLinearFakeQuantWrapper(\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=False\n", + " scale_approx_mult_bits=None\n", + " preset_activation_stats=True\n", + " output_scale=128.141968, output_zero_point=0.000000\n", + " (wrapped_module): Softmax()\n", + " )\n", " (context_matmul): RangeLinearQuantMatmulWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, 
num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=128.1420, in_0_zero_point=0.0000\n", - " in_1_scale=42.4306, in_1_zero_point=0.0000\n", - " out_scale=46.2056, out_zero_point=0.0000\n", + " output_scale=46.205608, output_zero_point=0.000000\n", " (wrapped_module): BatchMatmul()\n", " )\n", " )\n", @@ -666,56 +752,70 @@ " )\n", " (classifier): Classifier(\n", " (classifier): RangeLinearQuantParamLayerWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=40.8311, w_zero_point=0.0000\n", - " in_scale=42.8144, in_zero_point=0.0000\n", - " out_scale=6.3242, out_zero_point=0.0000\n", + " output_scale=6.324166, output_zero_point=0.000000\n", + " weights_scale=40.831116, weights_zero_point=0.000000\n", " (wrapped_module): Linear(in_features=1024, out_features=32317, bias=True)\n", " )\n", " )\n", " (dropout): Dropout(p=0.2)\n", " (eltwiseadd_residuals): ModuleList(\n", " (0): RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.01639556884766, in_0_zero_point=0.0\n", - " in_1_scale=127.00018310546875, in_1_zero_point=0.0\n", - " out_scale=63.68953323364258, out_zero_point=0.0\n", + " output_scale=63.689533, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", " )\n", " (1): RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " 
inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.00001525878906, in_0_zero_point=0.0\n", - " in_1_scale=63.68953323364258, in_1_zero_point=0.0\n", - " out_scale=42.81440353393555, out_zero_point=0.0\n", + " output_scale=42.814404, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", " )\n", " )\n", " (attention_concats): ModuleList(\n", " (0): RangeLinearQuantConcatWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.0, in_0_zero_point=0.0\n", - " in_1_scale=46.20560836791992, in_1_zero_point=0.0\n", - " out_scale=46.20560836791992, out_zero_point=0.0\n", + " output_scale=46.205608, output_zero_point=0.000000\n", " (wrapped_module): Concat()\n", " )\n", " (1): RangeLinearQuantConcatWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=127.00018310546875, in_0_zero_point=0.0\n", - " in_1_scale=46.20560836791992, in_1_zero_point=0.0\n", - " out_scale=46.20560836791992, out_zero_point=0.0\n", + " output_scale=46.205608, output_zero_point=0.000000\n", " (wrapped_module): Concat()\n", " )\n", " (2): RangeLinearQuantConcatWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=63.68953323364258, in_0_zero_point=0.0\n", - " in_1_scale=46.20560836791992, in_1_zero_point=0.0\n", - " out_scale=46.20560836791992, out_zero_point=0.0\n", + " output_scale=46.205608, output_zero_point=0.000000\n", " (wrapped_module): Concat()\n", " )\n", " )\n", @@ -760,14 +860,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 24/24 [13:49<00:00, 31.91s/it]\n" + "100%|██████████| 24/24 [15:15<00:00, 33.37s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "BLEU on test dataset: 18.05\n" + "Calculating BLEU score...\n", + "BLEU on test dataset: 18.04\n" ] } ], @@ -804,21 +905,15 @@ "name": "stderr", 
"output_type": "stream", "text": [ - "W0807 15:15:53.630417 140098610173696 range_linear.py:1063] /home/cvds_lab/lev_z/distiller/distiller/quantization/range_linear.py:1063: UserWarning: Model contains a bidirectional DistillerLSTM module. Automatic BN folding and statistics optimization based on tracing is not yet supported for models containing such modules.\n", - "Will perform specific optimization for the DistillerLSTM modules, but any other potential opportunities for optimization in the model will be ignored.\n", - " 'opportunities for optimization in the model will be ignored.', UserWarning)\n", - "\n", - "W0807 15:15:53.755373 140098610173696 quantizer.py:270] /home/cvds_lab/lev_z/distiller/distiller/quantization/quantizer.py:270: UserWarning: Module 'decoder.embedder' references to same module as 'encoder.embedder'. Replacing with reference the same wrapper.\n", - " UserWarning)\n", - "\n", - "100%|██████████| 24/24 [14:25<00:00, 28.42s/it]\n" + "100%|██████████| 24/24 [17:19<00:00, 33.57s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "BLEU on test dataset: 18.52\n" + "Calculating BLEU score...\n", + "BLEU on test dataset: 18.4\n" ] } ], @@ -876,21 +971,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0807 15:30:25.722338 140098610173696 range_linear.py:1063] /home/cvds_lab/lev_z/distiller/distiller/quantization/range_linear.py:1063: UserWarning: Model contains a bidirectional DistillerLSTM module. Automatic BN folding and statistics optimization based on tracing is not yet supported for models containing such modules.\n", - "Will perform specific optimization for the DistillerLSTM modules, but any other potential opportunities for optimization in the model will be ignored.\n", - " 'opportunities for optimization in the model will be ignored.', UserWarning)\n", - "\n", - "W0807 15:30:26.860130 140098610173696 quantizer.py:270] /home/cvds_lab/lev_z/distiller/distiller/quantization/quantizer.py:270: UserWarning: Module 'decoder.embedder' references to same module as 'encoder.embedder'. Replacing with reference the same wrapper.\n", - " UserWarning)\n", - "\n", - "100%|██████████| 24/24 [13:58<00:00, 29.49s/it]\n" + "100%|██████████| 24/24 [15:55<00:00, 34.44s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "BLEU on test dataset: 9.63\n" + "Calculating BLEU score...\n", + "BLEU on test dataset: 9.44\n" ] } ], @@ -921,21 +1010,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0807 15:44:32.041999 140098610173696 range_linear.py:1063] /home/cvds_lab/lev_z/distiller/distiller/quantization/range_linear.py:1063: UserWarning: Model contains a bidirectional DistillerLSTM module. Automatic BN folding and statistics optimization based on tracing is not yet supported for models containing such modules.\n", - "Will perform specific optimization for the DistillerLSTM modules, but any other potential opportunities for optimization in the model will be ignored.\n", - " 'opportunities for optimization in the model will be ignored.', UserWarning)\n", - "\n", - "W0807 15:44:33.205797 140098610173696 quantizer.py:270] /home/cvds_lab/lev_z/distiller/distiller/quantization/quantizer.py:270: UserWarning: Module 'decoder.embedder' references to same module as 'encoder.embedder'. 
Replacing with reference the same wrapper.\n", " UserWarning)\n", "\n", - "100%|██████████| 24/24 [13:54<00:00, 32.72s/it]\n" + "100%|██████████| 24/24 [15:22<00:00, 33.23s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "BLEU on test dataset: 16.94\n" + "Calculating BLEU score...\n", + "BLEU on test dataset: 16.71\n" ] } ], @@ -973,21 +1056,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "W0807 15:58:34.769241 140098610173696 range_linear.py:1063] /home/cvds_lab/lev_z/distiller/distiller/quantization/range_linear.py:1063: UserWarning: Model contains a bidirectional DistillerLSTM module. Automatic BN folding and statistics optimization based on tracing is not yet supported for models containing such modules.\n", - "Will perform specific optimization for the DistillerLSTM modules, but any other potential opportunities for optimization in the model will be ignored.\n", - " 'opportunities for optimization in the model will be ignored.', UserWarning)\n", - "\n", - "W0807 15:58:35.894991 140098610173696 quantizer.py:270] /home/cvds_lab/lev_z/distiller/distiller/quantization/quantizer.py:270: UserWarning: Module 'decoder.embedder' references to same module as 'encoder.embedder'. Replacing with reference the same wrapper.\n", - " UserWarning)\n", - "\n", - "100%|██████████| 24/24 [13:14<00:00, 28.11s/it]\n" + "100%|██████████| 24/24 [16:43<00:00, 39.45s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "BLEU on test dataset: 21.49\n" + "Calculating BLEU score...\n", + "BLEU on test dataset: 21.48\n" ] } ], diff --git a/examples/word_language_model/manual_lstm_pretrained_stats.yaml b/examples/word_language_model/acts_quantization_stats.yaml similarity index 100% rename from examples/word_language_model/manual_lstm_pretrained_stats.yaml rename to examples/word_language_model/acts_quantization_stats.yaml diff --git a/examples/word_language_model/quantize_lstm.ipynb b/examples/word_language_model/quantize_lstm.ipynb index 3014ba5..2abc1b9 100644 --- a/examples/word_language_model/quantize_lstm.ipynb +++ b/examples/word_language_model/quantize_lstm.ipynb @@ -16,10 +16,7 @@ "The pretrained model was generated with: \n", "```time python3 main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --tied --wd=1e-6``` \n", - "The reason we replace the LSTM that is because the inner operations in the pytorch implementation are not accessible to us, but we still want to quantize these operations. <br />\n", - "Afterwards we can try different techniques to quantize the whole model. \n", - "\n", - "_NOTE_: We use `tqdm` to plot progress bars, since it's not in `requirements.txt` you should install it using \n", - "`pip install tqdm`." + "We replace the LSTM because the inner operations of the PyTorch implementation are not accessible to us, and we still want to quantize those operations. <br />\n", + "Afterwards we can try different techniques to quantize the whole model. 
" ] }, { @@ -36,7 +33,11 @@ "from distiller.modules import DistillerLSTM as LSTM\n", "from tqdm import tqdm # for pretty progress bar\n", "import numpy as np\n", - "from copy import deepcopy" + "from copy import deepcopy\n", + "\n", + "import warnings\n", + "warnings.filterwarnings(action='default', module='distiller.quantization')\n", + "warnings.filterwarnings(action='default', module='distiller.quantization.range_linear')" ] }, { @@ -201,7 +202,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Max error in y: 0.000007\n" + "Max error in y: 0.000006\n" ] } ], @@ -298,16 +299,18 @@ "outputs": [], "source": [ "import os\n", - "from distiller.data_loggers import QuantCalibrationStatsCollector, collector_context\n", + "from distiller.data_loggers import collect_quant_stats\n", "\n", "man_model = torch.load('./manual.checkpoint.pth.tar')\n", "distiller.utils.assign_layer_fq_names(man_model)\n", "collector = QuantCalibrationStatsCollector(man_model)\n", "\n", - "if not os.path.isfile('manual_lstm_pretrained_stats.yaml'):\n", - " with collector_context(collector) as collector:\n", - " val_loss = evaluate(man_model, val_data)\n", - " collector.save('manual_lstm_pretrained_stats.yaml')" + "stats_file = './acts_quantization_stats.yaml'\n", + "\n", + "if not os.path.isfile(stats_file):\n", + " def eval_for_stats(model):\n", + " evaluate(man_model, val_data)\n", + " collect_quant_stats(man_model, eval_for_stats, save_dir='.')" ] }, { @@ -329,14 +332,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [00:21<00:00, 29.07it/s, val_loss=4.47, ppl=87.7]" + "100%|██████████| 622/622 [00:21<00:00, 29.07it/s, val_loss=4.47, ppl=87.3]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss: 4.47\t|\t ppl: 87.19\n" + "val_loss: 4.46\t|\t ppl: 86.79\n" ] }, { @@ -369,12 +372,23 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Logging before flag parsing goes to stderr.\n", + "W1030 16:04:58.930041 140055302981376 tensor.py:435] /data2/users/gjacob/work/venvs/distiller/lib/python3.5/site-packages/torch/tensor.py:435: RuntimeWarning: Iterating over a tensor might cause the trace to be incorrect. 
Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).\n", + " 'incorrect results).', category=RuntimeWarning)\n", + "\n" + ] + } + ], "source": [ "# Define the quantizer\n", "quantizer = PostTrainLinearQuantizer(\n", " deepcopy(man_model),\n", - " model_activation_stats='./manual_lstm_pretrained_stats.yaml')\n", + " model_activation_stats=stats_file)\n", "\n", "# Quantizer magic\n", "stats_before_prepare = deepcopy(quantizer.model_activation_stats)\n", @@ -462,11 +476,15 @@ " )\n", " (rnn): DistillerLSTM(1500, 1500, num_layers=2, dropout=0.65, bidirectional=False)\n", " (decoder): RangeLinearQuantParamLayerWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=107.5208, w_zero_point=0.0000\n", - " in_scale=127.0004, in_zero_point=0.0000\n", - " out_scale=3.6561, out_zero_point=0.0000\n", + " output_scale=3.656110, output_zero_point=0.000000\n", + " weights_scale=123.293175, weights_zero_point=0.000000\n", " (wrapped_module): Linear(in_features=1500, out_features=33278, bias=True)\n", " )\n", ")" @@ -498,19 +516,25 @@ "output_type": "stream", "text": [ "RangeLinearQuantParamLayerWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=False, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=165.1717, w_zero_point=0.0000\n", - " in_scale=126.2964, in_zero_point=0.0000\n", - " out_scale=17.7113, out_zero_point=0.0000\n", + " output_scale=17.711266, output_zero_point=0.000000\n", + " weights_scale=163.081299, weights_zero_point=0.000000\n", " (wrapped_module): Linear(in_features=1500, out_features=6000, bias=True)\n", ")\n", "RangeLinearQuantEltwiseAddWrapper(\n", - " mode=SYMMETRIC, num_bits_acts=8, num_bits_accum=32, clip_acts=NONE, scale_approx_mult_bits=None\n", + " output_quant_settings=(num_bits=8 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " 
inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " in_0_scale=17.711265563964844, in_0_zero_point=0.0\n", - " in_1_scale=10.280694961547852, in_1_zero_point=0.0\n", - " out_scale=21.166667938232422, out_zero_point=0.0\n", + " output_scale=21.166668, output_zero_point=0.000000\n", " (wrapped_module): EltwiseAdd()\n", ")\n" ] @@ -541,14 +565,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [01:58<00:00, 5.57it/s, val_loss=4.64, ppl=104] " + "100%|██████████| 622/622 [03:11<00:00, 3.46it/s, val_loss=4.65, ppl=105] " ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss: 4.64\t|\t ppl: 103.22\n" + "val_loss: 4.65\t|\t ppl: 104.20\n" ] }, { @@ -594,11 +618,15 @@ " )\n", " (rnn): DistillerLSTM(1500, 1500, num_layers=2, dropout=0.65, bidirectional=False)\n", " (decoder): RangeLinearQuantParamLayerWrapper(\n", - " mode=ASYMMETRIC_SIGNED, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=NONE, per_channel_wts=True, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_SIGNED ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=True)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_SIGNED ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=PerCh, w_zero_point=PerCh\n", - " in_scale=127.5069, in_zero_point=1.0000\n", - " out_scale=5.0241, out_zero_point=48.0000\n", + " output_scale=5.024112, output_zero_point=48.000000\n", + " weights_scale=PerCh, weights_zero_point=PerCh\n", " (wrapped_module): Linear(in_features=1500, out_features=33278, bias=True)\n", " )\n", ")" @@ -612,7 +640,7 @@ "source": [ "quantizer = PostTrainLinearQuantizer(\n", " deepcopy(man_model),\n", - " model_activation_stats='./manual_lstm_pretrained_stats.yaml',\n", + " model_activation_stats=stats_file,\n", " mode=LinearQuantMode.ASYMMETRIC_SIGNED,\n", " per_channel_wts=True\n", ")\n", @@ -629,14 +657,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [02:08<00:00, 5.08it/s, val_loss=4.61, ppl=101] " + "100%|██████████| 622/622 [03:09<00:00, 3.54it/s, val_loss=4.62, ppl=101] " ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss: 4.61\t|\t ppl: 100.35\n" + "val_loss: 4.61\t|\t ppl: 100.45\n" ] }, { @@ -677,14 +705,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [00:23<00:00, 26.53it/s, val_loss=4.47, ppl=87.7]" + "100%|██████████| 622/622 [00:22<00:00, 27.95it/s, val_loss=4.47, ppl=87.3]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss: 4.467975\t|\t ppl: 87.18\n" + "val_loss: 4.463559\t|\t ppl: 86.80\n" ] }, { @@ -715,6 +743,18 @@ "execution_count": 19, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W1030 16:35:44.686105 140055302981376 range_linear.py:1126] /data2/users/gjacob/work/distiller/distiller/quantization/range_linear.py:1126: DeprecationWarning: Argument 'fp16' is deprecated. 
Please use 'fpq_module'(=16/32/64) argument.\n", + " DeprecationWarning)\n", + "\n", + "W1030 16:35:44.896657 140055302981376 range_linear.py:1093] /data2/users/gjacob/work/distiller/distiller/quantization/range_linear.py:1093: DeprecationWarning: Argument 'fp16' is deprecated. Please use 'fpq_module'(=16/32/64) argument.\n", + " DeprecationWarning)\n", + "\n" + ] + }, { "data": { "text/plain": [ @@ -723,7 +763,7 @@ " (wrapped_module): Embedding(33278, 1500)\n", " )\n", " (rnn): DistillerLSTM(1500, 1500, num_layers=2, dropout=0.65, bidirectional=False)\n", - " (decoder): FP16Wrapper(\n", + " (decoder): FPWrapper(\n", " (wrapped_module): Linear(in_features=1500, out_features=33278, bias=True)\n", " )\n", ")" @@ -746,7 +786,7 @@ "overrides = distiller.utils.yaml_ordered_load(overrides_yaml)\n", "quantizer = PostTrainLinearQuantizer(\n", " deepcopy(man_model),\n", - " model_activation_stats='./manual_lstm_pretrained_stats.yaml',\n", + " model_activation_stats=stats_file,\n", " mode=LinearQuantMode.ASYMMETRIC_SIGNED,\n", " overrides=overrides,\n", " per_channel_wts=True\n", @@ -764,14 +804,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [01:04<00:00, 9.68it/s, val_loss=4.47, ppl=87.7] " + "100%|██████████| 622/622 [01:45<00:00, 6.27it/s, val_loss=4.47, ppl=87.3] " ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss:4.468113\t|\t ppl: 87.19\n" + "val_loss:4.463292\t|\t ppl: 86.77\n" ] }, { @@ -813,14 +853,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [02:09<00:00, 5.22it/s, val_loss=4.5, ppl=90.3] " + "100%|██████████| 622/622 [03:04<00:00, 3.61it/s, val_loss=4.49, ppl=89.5] " ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss:4.497443\t|\t ppl: 89.79\n" + "val_loss:4.488176\t|\t ppl: 88.96\n" ] }, { @@ -841,7 +881,7 @@ "overrides = distiller.utils.yaml_ordered_load(overrides_yaml)\n", "quantizer = PostTrainLinearQuantizer(\n", " deepcopy(man_model),\n", - " model_activation_stats='./manual_lstm_pretrained_stats.yaml',\n", + " model_activation_stats=stats_file,\n", " mode=LinearQuantMode.ASYMMETRIC_SIGNED,\n", " overrides=overrides,\n", " per_channel_wts=True,\n", @@ -868,14 +908,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 622/622 [02:12<00:00, 5.10it/s, val_loss=4.51, ppl=90.5] " + "100%|██████████| 622/622 [03:05<00:00, 3.54it/s, val_loss=4.49, ppl=89.4] " ] }, { "name": "stdout", "output_type": "stream", "text": [ - "val_loss:4.500023\t|\t ppl: 90.02\n" + "val_loss:4.486969\t|\t ppl: 88.85\n" ] }, { @@ -889,7 +929,7 @@ "source": [ "quantizer = PostTrainLinearQuantizer(\n", " deepcopy(man_model),\n", - " model_activation_stats='./manual_lstm_pretrained_stats.yaml',\n", + " model_activation_stats=stats_file,\n", " mode=LinearQuantMode.ASYMMETRIC_SIGNED,\n", " per_channel_wts=True,\n", " clip_acts=\"AVG\"\n", @@ -913,11 +953,15 @@ " )\n", " (rnn): DistillerLSTM(1500, 1500, num_layers=2, dropout=0.65, bidirectional=False)\n", " (decoder): RangeLinearQuantParamLayerWrapper(\n", - " mode=ASYMMETRIC_SIGNED, num_bits_acts=8, num_bits_params=8, num_bits_accum=32, clip_acts=AVG, per_channel_wts=True, scale_approx_mult_bits=None\n", + " weights_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_SIGNED ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=True)\n", + " output_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_SIGNED ; clip_mode=AVG ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " 
accum_quant_settings=(num_bits=32 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)\n", + " requires_quantized_inputs=True\n", + " inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None\n", + " scale_approx_mult_bits=None\n", " preset_activation_stats=True\n", - " w_scale=PerCh, w_zero_point=PerCh\n", - " in_scale=129.4670, in_zero_point=1.0000\n", - " out_scale=9.9393, out_zero_point=56.0000\n", + " output_scale=9.939270, output_zero_point=56.000000\n", + " weights_scale=PerCh, weights_zero_point=PerCh\n", " (wrapped_module): Linear(in_features=1500, out_features=33278, bias=True)\n", " )\n", ")" -- GitLab
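A note on the `b` entries added throughout the activation-stats YAML above: alongside the existing `min`/`max`/`avg_min`/`avg_max`/`mean`/`std` records, the collector now stores `b`, the mean absolute deviation of the tensor from its mean. This is the maximum-likelihood estimate of a Laplace distribution's scale parameter, the quantity distribution-aware clipping schemes work from, and since it depends on the mean it can only be computed after the mean is known, which is presumably what the second collection pass provides. A minimal sketch of the idea (the class name and the running-average update are illustrative assumptions, not the collector's actual code):

```python
import torch

class LaplaceScaleTracker:
    """Accumulates b = E[|x - mean(x)|], the Laplace scale (mean absolute
    deviation), as a running average over calibration batches."""
    def __init__(self):
        self.b = 0.0
        self.num_batches = 0

    def update(self, x: torch.Tensor):
        # Mean absolute deviation of this batch around the batch mean.
        batch_b = (x - x.mean()).abs().mean().item()
        # Fold into a running average across batches.
        self.num_batches += 1
        self.b += (batch_b - self.b) / self.num_batches

tracker = LaplaceScaleTracker()
for _ in range(10):
    tracker.update(torch.randn(1280, 1024))
# For a standard normal, E|x| = sqrt(2/pi), roughly 0.798:
print(tracker.b)
```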
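On the stats-collection change itself: replacing the manual `QuantCalibrationStatsCollector` plus `collector_context` pattern with `collect_quant_stats` is what enables the two-phase flow. The helper wraps the model, runs the supplied evaluation function once to gather the range stats and, apparently, a second time for the mean-centered stats such as `b` (which would also explain the doubled progress bars in the GNMT stats cell output), then saves the result; the renamed `acts_quantization_stats.yaml` files match the name it writes under `save_dir`. A self-contained sketch of the calling convention, with a toy model and evaluation loop standing in for the notebooks' GNMT/LSTM models:

```python
from copy import deepcopy

import torch
import torch.nn as nn
import distiller
from distiller.data_loggers import collect_quant_stats

# Toy stand-ins; in the notebooks these are the real model and evaluate() loop.
fp32_model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))

def eval_for_stats(model):
    # Invoked by collect_quant_stats with the instrumented model.
    with torch.no_grad():
        for _ in range(16):
            model(torch.randn(4, 8))

model_copy = deepcopy(fp32_model)                  # keep the FP32 model intact
distiller.utils.assign_layer_fq_names(model_copy)  # stats are keyed by layer name
collect_quant_stats(model_copy, eval_for_stats, save_dir='.')
# Writes ./acts_quantization_stats.yaml, later passed to
# PostTrainLinearQuantizer(..., model_activation_stats=...)
```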
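Finally, the `DeprecationWarning`s captured in the last notebook's output are actionable: the cell whose overrides wrap the decoder in `FPWrapper` still requests half precision through the deprecated `fp16` flag. Going by the warning text, the replacement is an `fpq_module` argument taking the float width (16/32/64). A hypothetical rewrite of that override; the original overrides YAML is not shown in this patch, so the layer name and YAML layout below are assumptions, and the snippet reuses names already defined in the notebook (`man_model`, `stats_file`, `PostTrainLinearQuantizer`, `LinearQuantMode`, `distiller`, `deepcopy`):

```python
# Hypothetical equivalent of the notebook's fp16 override, using the
# non-deprecated 'fpq_module' argument named in the warning message.
overrides_yaml = """
decoder:
    fpq_module: 16    # float16, replacing the deprecated 'fp16: true'
"""
overrides = distiller.utils.yaml_ordered_load(overrides_yaml)
quantizer = PostTrainLinearQuantizer(
    deepcopy(man_model),
    model_activation_stats=stats_file,
    mode=LinearQuantMode.ASYMMETRIC_SIGNED,
    overrides=overrides,
    per_channel_wts=True)
```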