microsoft / nni

An open source AutoML toolkit for automating the machine learning lifecycle, including feature engineering, neural architecture search, model compression and hyper-parameter tuning.
https://nni.readthedocs.io
MIT License

Using Naive Quantizer for Speed_up. #4404

Open YangNuoCheng opened 2 years ago

YangNuoCheng commented 2 years ago

Short summary of the question / idea: Can I use NaiveQuantizer for acceleration? The NaiveQuantizer class doesn't override "export_model", so I can't connect it directly to the speedup module. In addition, I cannot reproduce the acceleration reported in the NNI documentation below; are there other examples I can use? When I use QAT_Quantizer with TensorRT, the "all in 8 bit" latency is similar to the "mixed precision (average bit 20.4)" latency. Why? Does my GPU architecture not support acceleration?

How can I get the following acceleration ratio (0.000753688 -> 0.000229869)?

| quantization strategy | latency (s) | accuracy |
| --- | --- | --- |
| all in 32 bit | 0.001199961 | 96% |
| mixed precision (average bit 20.4) | 0.000753688 | 96% |
| all in 8 bit | 0.000229869 | 93.7% |

nni Environment:

Other Advice

Need to update document ( yes / no ) : yes

Other questions

When I use QAT in "mixed_precision_speedup_mnist.py" with the two configure_lists below:

----------------------1-------------------------

```python
configure_list = [{
            'quant_types': ['input', 'weight'],
            'quant_bits': {'input':8, 'weight':8},
            'op_names': ['conv1']
        }, {
            'quant_types': ['output'],
            'quant_bits': {'output':8},
            'op_names': ['relu1']
        }, {
            'quant_types': ['input', 'weight'],
            'quant_bits': {'input':8, 'weight':8},
            'op_names': ['conv2']
        }, {
            'quant_types': ['output'],
            'quant_bits': {'output':8},
            'op_names': ['relu2']
        }
    ]
```

----------------------2-------------------------

```python
configure_list = [{
            'quant_types': ['input', 'weight'],
            'quant_bits': {'input':16, 'weight':16},
            'op_names': ['conv1']
        }, {
            'quant_types': ['output'],
            'quant_bits': {'output':16},
            'op_names': ['relu1']
        }, {
            'quant_types': ['input', 'weight'],
            'quant_bits': {'input':16, 'weight':16},
            'op_names': ['conv2']
        }, {
            'quant_types': ['output'],
            'quant_bits': {'output':16},
            'op_names': ['relu2']
        }
    ]
```

I got the same speedup result. Did I misunderstand the use of QAT? What configure_list shall I use for "mixed precision (average bit 20.4)" and "all in 8 bit"? Here is the code, based on "mixed_precision_speedup_mnist.py":


```python
def quantization_aware_training_example(train_loader, test_loader, device):
    model = NaiveModel()

    configure_list = [{
            'quant_types': ['input', 'weight'],
            'quant_bits': {'input':8, 'weight':8},
            'op_names': ['conv1']
        }, {
            'quant_types': ['output'],
            'quant_bits': {'output':8},
            'op_names': ['relu1']
        }, {
            'quant_types': ['input', 'weight'],
            'quant_bits': {'input':8, 'weight':8},
            'op_names': ['conv2']
        }, {
            'quant_types': ['output'],
            'quant_bits': {'output':8},
            'op_names': ['relu2']
        }
    ]

    # configure_list = [{
    #         'quant_types': ['input', 'weight'],
    #         'quant_bits': {'input':16, 'weight':16},
    #         'op_names': ['conv1']
    #     }, {
    #         'quant_types': ['output'],
    #         'quant_bits': {'output':16},
    #         'op_names': ['relu1']
    #     }, {
    #         'quant_types': ['input', 'weight'],
    #         'quant_bits': {'input':16, 'weight':16},
    #         'op_names': ['conv2']
    #     }, {
    #         'quant_types': ['output'],
    #         'quant_bits': {'output':16},
    #         'op_names': ['relu2']
    #     }
    # ]
    set_quant_scheme_dtype('weight', 'per_tensor_symmetric', 'int')

    # finetune the model by using QAT
    # enable batchnorm folding mode
    dummy_input = torch.randn(1, 1, 28, 28)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
    quantizer.compress()

    model.to(device)
    for epoch in range(1):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    calibration_config = quantizer.export_model(model_path, calibration_path)

    test(model, device, test_loader)

    # print("calibration_config: ", calibration_config)

    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)

    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)
```

<details>
<summary>Here are details of the output</summary>
NaiveModel(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (relu1): ReLU6()
  (relu2): ReLU6()
  (relu3): ReLU6()
  (max_pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (max_pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
# Epoch 0 #
 0%  Loss 2.323434591293335
11%  Loss 1.4983417987823486
21%  Loss 0.5049654841423035
32%  Loss 0.2734118402004242
43%  Loss 0.17638994753360748
53%  Loss 0.30009275674819946
64%  Loss 0.33725330233573914
75%  Loss 0.06641637533903122
85%  Loss 0.05598992854356766
96%  Loss 0.22033458948135376
Loss: 0.1314087776184082  Accuracy: 95.24%)

# Epoch 1 #
 0%  Loss 0.19872994720935822
11%  Loss 0.09655192494392395
21%  Loss 0.24496421217918396
32%  Loss 0.05480068176984787
43%  Loss 0.15562187135219574
53%  Loss 0.08811508119106293
64%  Loss 0.13239851593971252
75%  Loss 0.09597941488027573
85%  Loss 0.046043965965509415
96%  Loss 0.03178367018699646
Loss: 0.08881056289672852  Accuracy: 96.39%)

[2021-12-22 18:20:38] INFO (nni.compression.pytorch.compressor/MainThread) Model state_dict saved to mnist_model.pth
[2021-12-22 18:20:38] INFO (nni.compression.pytorch.compressor/MainThread) Mask dict saved to mnist_calibration.pth
calibration_config 
{'conv1': {'weight_bits': 4, 'weight_scale': tensor([0.0422], device='cuda:0'), 'weight_zero_point': tensor([6.], device='cuda:0'), 'input_bits': 4, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075, 'output_bits': 4, 'tracked_min_output': -4.980009078979492, 'tracked_max_output': 8.239447593688965}, 'conv2': {'weight_bits': 4, 'weight_scale': tensor([0.0162], device='cuda:0'), 'weight_zero_point': tensor([7.], device='cuda:0'), 'input_bits': 4, 'tracked_min_input': 0.0, 'tracked_max_input': 5.99997615814209, 'output_bits': 4, 'tracked_min_output': -9.3397798538208, 'tracked_max_output': 10.353096008300781}, 'fc1': {'weight_bits': 4, 'weight_scale': tensor([0.0082], device='cuda:0'), 'weight_zero_point': tensor([7.], device='cuda:0'), 'input_bits': 4, 'tracked_min_input': 0.0, 'tracked_max_input': 5.99997615814209, 'output_bits': 4, 'tracked_min_output': -6.169345378875732, 'tracked_max_output': 9.513076782226562}, 'fc2': {'weight_bits': 4, 'weight_scale': tensor([0.0211], device='cuda:0'), 'weight_zero_point': tensor([8.], device='cuda:0'), 'input_bits': 4, 'tracked_min_input': 0.0, 'tracked_max_input': 5.99997615814209, 'output_bits': 4, 'tracked_min_output': -21.839794158935547, 'tracked_max_output': 12.17343807220459}, 'relu1': {'output_bits': 4, 'tracked_min_output': 0.0, 'tracked_max_output': 5.99997615814209}, 'relu2': {'output_bits': 4, 'tracked_min_output': 0.0, 'tracked_max_output': 5.99997615814209}, 'relu3': {'output_bits': 4, 'tracked_min_output': 0.0, 'tracked_max_output': 5.99997615814209}}
Loss: 0.07442319641113282  Accuracy: 97.69%)

[12/22/2021-18:20:40] [TRT] [W] onnx2trt_utils.cpp:366: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[12/22/2021-18:20:40] [TRT] [W] DynamicRange(min: -0.424213, max: 2.82149). Dynamic range should be symmetric for better accuracy.
[12/22/2021-18:20:40] [TRT] [W] DynamicRange(min: -4.98001, max: 8.23945). Dynamic range should be symmetric for better accuracy.
[12/22/2021-18:20:40] [TRT] [W] DynamicRange(min: -9.33978, max: 10.3531). Dynamic range should be symmetric for better accuracy.
[12/22/2021-18:20:40] [TRT] [W] DynamicRange(min: -6.16935, max: 9.51308). Dynamic range should be symmetric for better accuracy.
[12/22/2021-18:20:40] [TRT] [W] DynamicRange(min: -21.8398, max: 12.1734). Dynamic range should be symmetric for better accuracy.
[12/22/2021-18:20:40] [TRT] [W] Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
[12/22/2021-18:20:40] [TRT] [W] Missing scale and zero-point for tensor 27, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-18:20:40] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 14) [Shuffle]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-18:20:40] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 15) [Softmax]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-18:20:40] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 16) [Unary]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-18:20:40] [TRT] [W] Missing scale and zero-point for tensor output1, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-18:20:41] [TRT] [W] Rejecting some int8 implementation of layer Reshape_19 + (Unnamed Layer* 7) [Shuffle] due to missing int8 scales for tensor 27 at input index 0
[12/22/2021-18:20:44] [TRT] [W] No implementation of layer Reshape_19 + (Unnamed Layer* 7) [Shuffle] obeys the requested constraints. I.e. no conforming implementation was found for requested layer computation precision and output precision. Using fastest implementation instead.
[12/22/2021-18:20:45] [TRT] [W] No implementation of layer Gemm_28 obeys the requested constraints. I.e. no conforming implementation was found for requested layer computation precision and output precision. Using fastest implementation instead.
[12/22/2021-18:20:45] [TRT] [W] No implementation of layer (Unnamed Layer* 13) [Shuffle] + (Unnamed Layer* 14) [Shuffle] obeys the requested constraints. I.e. no conforming implementation was found for requested layer computation precision and output precision. Using fastest implementation instead.
<class 'tensorrt.tensorrt.ICudaEngine'>
Loss: 0.0742526138305664  Accuracy: 97.68%
Inference elapsed_time (whole dataset): 0.032988786697387695s

</details>
J-shang commented 2 years ago

NaiveQuantizer doesn't support speedup; it only simulates quantization so you can see the effect of directly quantizing the weights.

FYI, I think your GPU supports int8: https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix

@linbinskn, please help to explain the mixed precision.
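
To make the contrast concrete, here is a minimal sketch. The import paths follow the NNI 2.x layout used elsewhere in this thread; `NaiveModel` and `configure_list` are the ones from "mixed_precision_speedup_mnist.py" above, and the NaiveQuantizer config schema shown is an assumption, so adjust for your NNI version:

```python
import torch
from nni.algorithms.compression.pytorch.quantization import NaiveQuantizer, QAT_Quantizer

model = NaiveModel()  # example model from mixed_precision_speedup_mnist.py

# NaiveQuantizer: simulation only. It quantizes weights in place so you can
# measure the accuracy impact, but it has no export_model(), so it produces
# no calibration config to feed into ModelSpeedupTensorRT.
# (Config schema assumed to mirror the quantizer configs above.)
naive_config = [{'quant_types': ['weight'], 'quant_bits': 8,
                 'op_names': ['conv1', 'conv2', 'fc1', 'fc2']}]
NaiveQuantizer(model, naive_config).compress()

# QAT_Quantizer: trains with simulated quantization and can export the
# calibration config (scales, zero points, tracked ranges) that the
# TensorRT speedup engine requires.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
quantizer = QAT_Quantizer(model, configure_list, optimizer,
                          dummy_input=torch.randn(1, 1, 28, 28))
quantizer.compress()
# ... finetune ...
calibration_config = quantizer.export_model("mnist_model.pth", "mnist_calibration.pth")
```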

YangNuoCheng commented 2 years ago

> NaiveQuantizer doesn't support speedup; it only simulates quantization so you can see the effect of directly quantizing the weights.
>
> FYI, I think your GPU supports int8: https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix
>
> @linbinskn, please help to explain the mixed precision.

Thank you for your reply! My concern is whether the different config_lists really quantize the model to different precisions, because the accuracy, loss, and latency I observed were very similar in both cases. Did I misuse the config_list of QAT?

Why is there no significant acceleration?

linbinskn commented 2 years ago

Your GPU supports low precision very well: the Titan RTX is built on the Turing architecture, which uses Tensor Cores for low-precision speedup. Your config_list is right in terms of format and should work. The result in the documentation is model accuracy after post-training quantization, which should be lower than quantization-aware training; and MNIST is a very simple task, so QAT should reach accuracy similar to the pretrained model. There are several possible reasons why both configs give the same speedup. The most likely one is that only parts of the network are quantized while the others keep their original precision. Maybe you can try the following config and see whether the result changes:

```python
configure_list = [{
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv1', 'conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1', 'relu2']
    }, {
        'quant_types': ['output', 'weight', 'input'],
        'quant_bits': {'output': 8, 'weight': 8, 'input': 8},
        'op_names': ['fc1', 'fc2'],
    }]
```
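
For the "mixed precision (average bit 20.4)" row, the general idea would be to assign different bit widths to different layers within one configure_list. A hypothetical sketch reusing the NaiveModel layer names from this thread (the exact per-layer recipe behind the documented 20.4 average is not given here):

```python
# Hypothetical mixed-precision configure_list: 8-bit convolutions and ReLUs,
# 16-bit fully connected layers; any op not listed stays at full precision.
configure_list = [{
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv1', 'conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1', 'relu2']
    }, {
        'quant_types': ['output', 'weight', 'input'],
        'quant_bits': {'output': 16, 'weight': 16, 'input': 16},
        'op_names': ['fc1', 'fc2'],
    }]
```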

YangNuoCheng commented 2 years ago

Thanks for your reply!

You solved a lot of my problems! But I still got the following results:

| all in 8 bit | all in 16 bit |
| --- | --- |
| 0.034036874771118164s | 0.03283333778381348s |
| 0.03186678886413574s | 0.032286882400512695s |
| 0.03266406059265137s | 0.03229546546936035s |
| 0.032309532165527344s | 0.03174257278442383s |
| 0.032884836196899414s | 0.03217005729675293s |

"All in 8 bit" is still similar to "all in 16 bit"; are there other reasons affecting the speedup result? Using the new config even takes more time than before (in my first attempt it cost 0.026126s, now 0.032s!). How did the NNI documentation get its numbers? Was the documented "all in 8 bit" result produced with QAT?
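
One thing worth ruling out at these timescales is measurement noise: GPU work is launched asynchronously, so timings taken without warm-up iterations and explicit synchronization can swamp any quantization gain. A minimal sketch of a more robust harness (`run_once` is a stand-in for whatever single inference call is being timed, e.g. a wrapper around the engine used in `test_trt`; it is not part of the NNI API):

```python
import time
import torch

def time_inference(run_once, warmup=10, iters=50):
    """Median latency of run_once() with warm-up and GPU synchronization."""
    for _ in range(warmup):           # warm-up: let the engine/caches settle
        run_once()
    torch.cuda.synchronize()          # drain pending GPU work before timing
    times = []
    for _ in range(iters):
        start = time.time()
        run_once()
        torch.cuda.synchronize()      # wait for the GPU before stopping the clock
        times.append(time.time() - start)
    return sorted(times)[len(times) // 2]   # median is robust to outliers
```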

Here are some output details:

NaiveModel(
  (conv1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(20, 50, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=800, out_features=500, bias=True)
  (fc2): Linear(in_features=500, out_features=10, bias=True)
  (relu1): ReLU6()
  (relu2): ReLU6()
  (relu3): ReLU6()
  (max_pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (max_pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)

# Epoch 0 #
 0%  Loss 2.320648670196533
11%  Loss 1.5353659391403198
21%  Loss 0.5524840950965881
32%  Loss 0.2670537531375885
43%  Loss 0.15946835279464722
53%  Loss 0.23263391852378845
64%  Loss 0.30145013332366943
75%  Loss 0.06855656951665878
85%  Loss 0.04781820625066757
96%  Loss 0.21670007705688477
Loss: 0.09943255615234375  Accuracy: 96.9%)

[2021-12-22 21:55:41] INFO (nni.compression.pytorch.compressor/MainThread) Model state_dict saved to mnist_model.pth
[2021-12-22 21:55:41] INFO (nni.compression.pytorch.compressor/MainThread) Mask dict saved to mnist_calibration.pth
calibration_config
{'conv1': {'weight_bits': 16, 'weight_scale': tensor([9.5225e-06], device='cuda:0'), 'weight_zero_point': tensor([26380.], device='cuda:0'), 'input_bits': 16, 'tracked_min_input': -0.4242129623889923, 'tracked_max_input': 2.821486711502075}, 'conv2': {'weight_bits': 16, 'weight_scale': tensor([3.4201e-06], device='cuda:0'), 'weight_zero_point': tensor([30375.], device='cuda:0'), 'input_bits': 16, 'tracked_min_input': 0.0, 'tracked_max_input': 5.998416423797607}, 'fc1': {'weight_bits': 16, 'weight_scale': tensor([1.6138e-06], device='cuda:0'), 'weight_zero_point': tensor([32241.], device='cuda:0'), 'input_bits': 16, 'tracked_min_input': 0.0, 'tracked_max_input': 5.987966537475586, 'output_bits': 16, 'tracked_min_output': -5.204525947570801, 'tracked_max_output': 7.360141277313232}, 'fc2': {'weight_bits': 16, 'weight_scale': tensor([4.0696e-06], device='cuda:0'), 'weight_zero_point': tensor([32722.], device='cuda:0'), 'input_bits': 16, 'tracked_min_input': 0.0, 'tracked_max_input': 5.899948596954346, 'output_bits': 16, 'tracked_min_output': -18.205446243286133, 'tracked_max_output': 8.411518096923828}, 'relu1': {'output_bits': 16, 'tracked_min_output': 0.0, 'tracked_max_output': 5.9998369216918945}, 'relu2': {'output_bits': 16, 'tracked_min_output': 0.0, 'tracked_max_output': 5.998589992523193}, 'relu3': {'output_bits': 16, 'tracked_min_output': 0.0, 'tracked_max_output': 5.983097553253174}}
Loss: 0.09942604522705079  Accuracy: 96.9%)

[12/22/2021-21:55:43] [TRT] [W] onnx2trt_utils.cpp:366: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[12/22/2021-21:55:43] [TRT] [W] DynamicRange(min: -0.424213, max: 2.82149). Dynamic range should be symmetric for better accuracy.
[12/22/2021-21:55:43] [TRT] [W] DynamicRange(min: -5.20453, max: 7.36014). Dynamic range should be symmetric for better accuracy.
[12/22/2021-21:55:43] [TRT] [W] DynamicRange(min: -18.2054, max: 8.41152). Dynamic range should be symmetric for better accuracy.
[12/22/2021-21:55:43] [TRT] [W] Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor 12, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor 21, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor 27, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 14) [Shuffle]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 15) [Softmax]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor (Unnamed Layer* 16) [Unary]_output, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
[12/22/2021-21:55:43] [TRT] [W] Missing scale and zero-point for tensor output1, expect fall back to non-int8 implementation for any layer consuming or producing given tensor
<class 'tensorrt.tensorrt.ICudaEngine'>
Loss: 0.09943007965087891  Accuracy: 96.91%
Inference elapsed_time (whole dataset): 0.03217005729675293s

Here are some details of quantization_aware_training_example:

```python
def quantization_aware_training_example(train_loader, test_loader, device):
    model = NaiveModel()
    bits = 16
    configure_list = [
        { 'quant_types': ['weight', 'input'], 
         'quant_bits': {'weight': bits, 'input': bits}, 
         'op_names': ['conv1', 'conv2'] 
        }, 
        { 'quant_types': ['output'], 
         'quant_bits': {'output': bits }, 
         'op_names': ['relu1', 'relu2','relu3'] 
        }, 
        { 'quant_types': ['output', 'weight', 'input'], 
         'quant_bits': {'output': bits, 'weight': bits, 'input': bits},
         'op_names': ['fc1', 'fc2'], }]
    print(model)
    dummy_input = torch.randn(1, 1, 28, 28)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
    # quantizer = QAT_Quantizer(model, configure_list, optimizer)
    quantizer.compress()
    model.to(device)
    for epoch in range(1):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    calibration_config = quantizer.export_model(model_path, calibration_path)

    test(model, device, test_loader)

    # print("calibration_config: ", calibration_config)

    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)

    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)
```