Mandylove1993 / CUDA-FastBEV

TensorRT deploy and PTQ/QAT tools development for FastBEV, total time only need 6.9ms!!!

Post training Quantization does not work on custom model #36

Open BaophanN opened 2 weeks ago

BaophanN commented 2 weeks ago

Dear author, thanks for the amazing work. I adapted the code in the quantize folder to quantize my custom model (a BEVFormer encoder). I still use ResNet and FPN as the backbone and neck. However, when I run it, the weights are still in fp32. Can you give me some suggestions? Here is the driver code:

from ptq_bev import *  # expected to provide quantize, quantize_net, test_model, ...
import argparse
import copy
import os
import sys
import time

import torch
import torch.quantization

from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn, get_model_complexity_info
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint
from mmdet.apis import set_random_seed
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_model

sys.path.insert(0, 'workspace/source/Mapless')

def main(): 
    quantize.initialize()
    parser = argparse.ArgumentParser() 
    parser.add_argument("--calibrate_batch", type=int, default=200, help="calibrate batch")
    parser.add_argument("--deterministic", type=bool, default=True, help="deterministic")
    parser.add_argument('--test_int8', type=bool, default=False, help='test int8 or not')
    parser.add_argument('--test_fp32', type=bool, default=False, help='test fp32 or not')

    parser.add_argument("--seed", type=int, default=0, help="seed")
    parser.add_argument('--show', action='store_true', help='show results')
    parser.add_argument('--show-dir', default='work_dirs/lanesegnet_quantize/show', help='directory where results will be saved')
    parser.add_argument('--out', help='output result file in pickle format')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument(
        '--eval',
        type=str,
        default='bbox',
        help='evaluation metrics, which depend on the dataset, e.g., "mAP",'
        ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC')
    parser.add_argument(
        '--eval-options',
        nargs='+',
        action=DictAction,
        help='custom options for evaluation, the key-value pair in xxx=yyy '
        'format will be kwargs for dataset.evaluate() function')
    parser.add_argument(
        '--format-only',
        action='store_true',
        help='Format the output results without performing evaluation. It is '
        'useful when you want to format the results to a specific format and '
        'submit them to the test server')
    parser.add_argument('--samples', type=int, default=2000, help='samples to benchmark')
    parser.add_argument(
        '--out-dir', default='work_dirs/lanesegnet_quantize', help='directory where results will be saved')
    parser.add_argument(
        '--log-interval', type=int, default=50, help='interval of logging')
    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn; this will slightly increase '
        'the inference speed')
    args = parser.parse_args()

    # Load the configuration
    save_path = 'work_dirs/lanesegnet_quantize/quantized_backbone.pth'
    os.makedirs(os.path.dirname(save_path), exist_ok=True) 
    config_path = 'plugin/LaneSegNet/configs/lanesegnet_r18_1x1_1e_olv2_subset_A.py'
    checkpoint_path = 'work_dirs/lanesegnet_debug/epoch_24.pth'
    cfg = Config.fromfile(config_path)
    cuda_device = torch.device("cuda:0")
    cpu_device = torch.device("cpu:0")

    if args.seed is not None:
        set_random_seed(args.seed, deterministic=args.deterministic)
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # Create the model
    # Load checkpoint
    dataset = build_dataset(cfg.data.test)
    distributed = False 
    samples_per_gpu = 1 # enough

    data_loader = build_dataloader(
        dataset,
        samples_per_gpu=samples_per_gpu,
        workers_per_gpu=cfg.data.workers_per_gpu,
        dist=distributed,
        shuffle=False,
    )
    model_fp32 = build_model(cfg.model)

    checkpoint = load_checkpoint(model_fp32, checkpoint_path, map_location='cpu')
    # print(model_fp32.bev_constructor)
    # print_nn_layers_type(model_fp32)

    model_int8 = copy.deepcopy(model_fp32) 
    # model.train()
    # fused_model.train()

    if args.test_fp32:
        print("######## fp32 #########")
        model_fp32.to(cuda_device)
        model_fp32 = fuse_conv_bn(model_fp32)
        model_fp32 = MMDataParallel(model_fp32, device_ids=[0])
        model_fp32.eval()
        # test_model(cfg, args, model_fp32,checkpoint, data_loader, dataset)

    model_int8.to(cpu_device)
    model_int8 = quantize_net(model_int8) # how to make it work 
    model_int8 = fuse_conv_bn(model_int8)
    # print(model_int8.bev_constructor)
    model_int8 = MMDataParallel(model_int8, device_ids=[0])
    # print_nn_layers_type(model_int8)
    model_int8.eval()

    # calibrate
    print("start calibrate")
    quantize.set_quantizer_fast(model_int8) # Ins
    quantize.calibrate_model(model_int8, data_loader, 0, None, args.calibrate_batch)
    # NOTE: torch.quantization.convert() belongs to PyTorch's eager-mode quantization
    # workflow (qconfig + prepare + observers); since no qconfig was attached here it swaps
    # nothing, so it is effectively a no-op on a pytorch-quantization (QuantLinear) model.
    model_int8 = torch.quantization.convert(model_int8, inplace=True)

    torch.save(model_int8, save_path) 
    if args.test_int8: 
        print("######## int8 #########")
        model_int8.to(cuda_device)
        test_model(cfg, args, model_int8, checkpoint, data_loader, dataset)

if __name__ == '__main__':
    main()
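
For context, my understanding is that the quantize helpers imported via ptq_bev wrap the generic pytorch-quantization PTQ recipe, roughly like the sketch below (illustrative only, not the repo's exact code; collect_stats and compute_amax are just names used here):

# Sketch of the generic pytorch-quantization PTQ flow that quantize.initialize(),
# quantize.set_quantizer_fast() and quantize.calibrate_model() presumably wrap.
# Illustrative only; the helper names are not from the repo.
import torch
from pytorch_quantization import calib
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules

quant_modules.initialize()  # monkey-patch nn.Linear/nn.Conv2d -> QuantLinear/QuantConv2d

def collect_stats(model, data_loader, num_batches):
    # Feed calibration data with fake quantization disabled and calibrators enabled.
    for module in model.modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            model(return_loss=False, **data)  # mmdet3d-style test forward (schematic)
            if i >= num_batches:
                break
    # Switch back: disable calibrators, enable fake quantization.
    for module in model.modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()

def compute_amax(model, **kwargs):
    # Turn the collected statistics into per-quantizer amax values.
    for module in model.modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    module.load_calib_amax(**kwargs)  # e.g. method="percentile", percentile=99.99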

After quantizing, I printed out part of the model structure:

     (2): BEVFormerLayer(
        (attentions): ModuleList(
          (0): TemporalSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (sampling_offsets): QuantLinear(
              in_features=512, out_features=128, bias=True
              (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=dynamic calibrator=HistogramCalibrator scale=1.0 quant)
              (_weight_quantizer): TensorQuantizer(8bit fake axis=0 amax=dynamic calibrator=MaxCalibrator scale=1.0 quant)
            )
            (attention_weights): QuantLinear(
              in_features=512, out_features=64, bias=True
              (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=dynamic calibrator=HistogramCalibrator scale=1.0 quant)
              (_weight_quantizer): TensorQuantizer(8bit fake axis=0 amax=dynamic calibrator=MaxCalibrator scale=1.0 quant)
            )
            (value_proj): QuantLinear(
              in_features=256, out_features=256, bias=True
              (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=dynamic calibrator=HistogramCalibrator scale=1.0 quant)
              (_weight_quantizer): TensorQuantizer(8bit fake axis=0 amax=dynamic calibrator=MaxCalibrator scale=1.0 quant)
            )
            (output_proj): QuantLinear(
              in_features=256, out_features=256, bias=True
              (_input_quantizer): TensorQuantizer(8bit fake per-tensor amax=dynamic calibrator=HistogramCalibrator scale=1.0 quant)
              (_weight_quantizer): TensorQuantizer(8bit fake axis=0 amax=dynamic calibrator=MaxCalibrator scale=1.0 quant)
            )
          )

I think I got something wrong, because these are fake TensorQuantizer modules of the kind used for quantization-aware training. That is why, when I save my quantized checkpoint, the weights are still in fp32.
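
For reference, this seems consistent with how QuantLinear is implemented in pytorch-quantization: as far as I can tell, its forward pass only fake-quantizes on the fly and the stored weight stays fp32, roughly:

# Conceptual, simplified sketch of pytorch_quantization's QuantLinear.forward():
# quantize-dequantize ("fake quantize") the input and the fp32 weight, then run a
# normal fp32 linear. self.weight itself is never stored as int8.
import torch.nn.functional as F

def quant_linear_forward(self, input):
    quant_input = self._input_quantizer(input)            # fake-quantize activations
    quant_weight = self._weight_quantizer(self.weight)    # fake-quantize the fp32 weight
    return F.linear(quant_input, quant_weight, bias=self.bias)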

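A dump like the one below can be produced by iterating the saved checkpoint's state_dict and printing each tensor's dtype, roughly (simplified sketch, not my exact helper):

# Simplified sketch: load the saved (fake-quantized) model and print the dtype of
# every tensor in its state_dict.
import torch

ckpt = torch.load('work_dirs/lanesegnet_quantize/quantized_backbone.pth', map_location='cpu')
state_dict = ckpt.state_dict() if hasattr(ckpt, 'state_dict') else ckpt
for key, value in state_dict.items():
    if isinstance(value, torch.Tensor):
        print(f"Parameter Key: '{key}' - Type: {type(value)} - Tensor Type: {value.dtype}")
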
Parameter Key: 'module.bev_constructor.can_bus_mlp.2._input_quantizer._amax' - Type: <class 'torch.Tensor'> - Tensor Type: torch.float32
Parameter Key: 'module.bev_constructor.can_bus_mlp.2._weight_quantizer._amax' - Type: <class 'torch.Tensor'> - Tensor Type: torch.float32
Parameter Key: 'module.bev_constructor.can_bus_mlp.norm.weight' - Type: <class 'torch.Tensor'> - Tensor Type: torch.float32
Parameter Key: 'module.bev_constructor.can_bus_mlp.norm.bias' - Type: <class 'torch.Tensor'> - Tensor Type: torch.float32