NVIDIA / TensorRT

NVIDIA® TensorRT™ is an SDK for high-performance deep learning inference on NVIDIA GPUs. This repository contains the open source components of TensorRT.
https://developer.nvidia.com/tensorrt
Apache License 2.0

Bad accuracy after converting transformer to TensorRT, particularly nn.MultiheadAttention #3619

Closed ninono12345 closed 8 months ago

ninono12345 commented 8 months ago

Description

Hello, I've been working on a project to convert a tracking algorithm to TensorRT. By converting one part of the code at a time to TensorRT and running inference, then another part, and so on, I've tracked the accuracy issues down to this code:

```python
import atexit

import tensorrt as trt
import torch
import torch.nn as nn
from polygraphy.backend.trt import EngineFromNetwork, NetworkFromOnnxPath, TrtRunner

# Flags selecting which submodules run through their TensorRT engines.
superfinal_polygraphy = True
pol1 = True  # transformer engine
pol2 = True  # box-encoding engine
pol3 = True  # positional-encoding engine


def MLP(channels, do_bn=True):
    n = len(channels)
    layers = []
    for i in range(1, n):
        layers.append(
            nn.Conv1d(channels[i - 1], channels[i], kernel_size=1, bias=True))
        if i < (n - 1):
            if do_bn:
                layers.append(nn.BatchNorm1d(channels[i]))
            layers.append(nn.ReLU())
    return nn.Sequential(*layers)


class FilterPredictor(nn.Module):

    def __init__(self, transformer, feature_sz, use_test_frame_encoding=True):
        print("feature_sz filter")
        print(feature_sz)
        super().__init__()
        self.transformer = transformer
        self.feature_sz = feature_sz
        self.use_test_frame_encoding = use_test_frame_encoding

        self.box_encoding = MLP([4, self.transformer.d_model // 4, self.transformer.d_model, self.transformer.d_model])

        self.query_embed_fg = nn.Embedding(1, self.transformer.d_model)

        if self.use_test_frame_encoding:
            self.query_embed_test = nn.Embedding(1, self.transformer.d_model)

        self.query_embed_fg_decoder = self.query_embed_fg

        # PositionEmbeddingSine is defined elsewhere in the tracking codebase.
        self.pos_encoding = PositionEmbeddingSine(num_pos_feats=self.transformer.d_model // 2, sine_type='lin_sine',
                                                  avoid_aliazing=True, max_spatial_resolution=feature_sz)

        if superfinal_polygraphy:
            # Deserialize the pre-built TensorRT engines for each submodule.
            with open("transformer_op17.engine", "rb") as f:  # pol1
                engine_data1 = f.read()
            runtime1 = trt.Runtime(trt.Logger(trt.Logger.WARNING))
            engine1 = runtime1.deserialize_cuda_engine(engine_data1)
            with open("box_encoding.engine", "rb") as f:  # pol2
                engine_data2 = f.read()
            runtime2 = trt.Runtime(trt.Logger(trt.Logger.WARNING))
            engine2 = runtime2.deserialize_cuda_engine(engine_data2)
            with open("PositionEmbeddingSine.engine", "rb") as f:  # pol3
                engine_data3 = f.read()
            runtime3 = trt.Runtime(trt.Logger(trt.Logger.WARNING))
            engine3 = runtime3.deserialize_cuda_engine(engine_data3)

            self.trt_engine1 = TrtRunner(engine1)
            self.trt_engine2 = TrtRunner(engine2)
            self.trt_engine3 = TrtRunner(engine3)
            self.trt_engine1.activate()
            self.trt_engine2.activate()
            self.trt_engine3.activate()

            def exit_handler():
                self.trt_engine1.deactivate()
                self.trt_engine2.deactivate()
                self.trt_engine3.deactivate()

            atexit.register(exit_handler)
            print("box_encoding loaded")

    def forward(self, train_feat, test_feat, train_label, train_ltrb_target):
        return self.predict_cls_bbreg_filters_parallel(train_feat, test_feat, train_label, 1, train_ltrb_target)

    def get_positional_encoding(self, feat):
        nframes, nseq, _, h, w = feat.shape

        mask = torch.zeros((nframes * nseq, h, w), dtype=torch.bool, device=feat.device)
        if not pol3:
            pos = self.pos_encoding(mask)
        if pol3:
            out = self.trt_engine3.infer({'mask': mask})
            pos = out['pos'].cuda()

        return pos.reshape(nframes, nseq, -1, h, w)

    def predict_cls_bbreg_filters_parallel(self, train_feat, test_feat, train_label, num_gth_frames, train_ltrb_target):
        train_feat = train_feat.unsqueeze(1)
        test_feat = test_feat.unsqueeze(1)
        train_ltrb_target = train_ltrb_target.unsqueeze(1)

        h, w = test_feat.shape[-2:]
        H, W = train_feat.shape[-2:]

        train_feat_stack = torch.cat([train_feat, train_feat], dim=1)
        test_feat_stack = torch.cat([test_feat, test_feat], dim=1)
        train_label_stack = torch.cat([train_label, train_label], dim=1)
        train_ltrb_target_stack = torch.cat([train_ltrb_target, train_ltrb_target], dim=1)

        test_pos = self.get_positional_encoding(test_feat)  # Nf_te, Ns, C, H, W
        train_pos = self.get_positional_encoding(train_feat)  # Nf_tr, Ns, C, H, W

        test_feat_seq = test_feat_stack.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)  # Nf_te*H*W, Ns, C
        train_feat_seq = train_feat_stack.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)  # Nf_tr*H*W, Ns, C
        train_label_seq = train_label_stack.permute(1, 0, 2, 3).flatten(1).permute(1, 0).unsqueeze(2)  # Nf_tr*H*W, Ns, 1
        train_ltrb_target_seq_T = train_ltrb_target_stack.permute(1, 2, 0, 3, 4).flatten(2)  # Ns, 4, Nf_tr*H*W

        test_pos = test_pos.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)
        train_pos = train_pos.permute(1, 2, 0, 3, 4).flatten(2).permute(2, 0, 1)

        fg_token = self.query_embed_fg.weight.reshape(1, 1, -1)
        train_label_enc = fg_token * train_label_seq
        if not pol2:
            train_ltrb_target_enc = self.box_encoding(train_ltrb_target_seq_T)
        if pol2:
            out = self.trt_engine2.infer({'train_ltrb_target_seq_T': train_ltrb_target_seq_T})
            train_ltrb_target_enc = out['train_ltrb_target_enc'].cuda()

        train_ltrb_target_enc = train_ltrb_target_enc.permute(2, 0, 1)  # Nf_tr*H*W, Ns, C

        if self.use_test_frame_encoding:
            test_token = self.query_embed_test.weight.reshape(1, 1, -1)
            test_label_enc = torch.ones_like(test_feat_seq) * test_token  # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            feat = torch.cat([train_feat_seq + train_label_enc + train_ltrb_target_enc, test_feat_seq + test_label_enc], dim=0)
        else:
            feat = torch.cat([train_feat_seq + train_label_enc + train_ltrb_target_enc, test_feat_seq], dim=0)

        pos = torch.cat([train_pos, test_pos], dim=0)

        src_key_padding_mask = torch.zeros(feat.shape[1], feat.shape[0]).bool()
        src_key_padding_mask[1, num_gth_frames * H * W:-h * w] = 1.
        src_key_padding_mask = src_key_padding_mask.bool().to(feat.device)

        if not pol1:
            output_embed, enc_mem = self.transformer(feat, mask=src_key_padding_mask,
                                                     query_embed=self.query_embed_fg_decoder.weight,
                                                     pos_embed=pos)
        if pol1:
            out = self.trt_engine1.infer({'feat': feat,
                                          'mask': src_key_padding_mask,
                                          'query_embed': self.query_embed_fg_decoder.weight,
                                          'pos_embed': pos})
            output_embed = out['output_embed'].cuda()
            enc_mem = out['enc_mem'].cuda()

        enc_opt = enc_mem[-h * w:].transpose(0, 1).permute(0, 2, 1)
        enc_opt = enc_opt.reshape(test_feat_stack.shape)
        dec_opt = output_embed.squeeze(0).transpose(1, 2)
        dec_opt = dec_opt.reshape(test_feat_stack.shape[1], -1, 1, 1)

        cls_enc_opt = enc_opt[:, 0].unsqueeze(1)
        bbreg_enc_opt = enc_opt[:, 1].unsqueeze(1)
        cls_dec_opt = dec_opt[0].unsqueeze(0)
        bbreg_dec_opt = dec_opt[1].unsqueeze(0)

        return cls_dec_opt, bbreg_dec_opt, cls_enc_opt, bbreg_enc_opt
```

As you can see, I converted all the submodules that predict_cls_bbreg_filters_parallel uses into TensorRT. When the flags at the beginning of the code are set to True, all submodules run in TensorRT, and EVERYTHING IS OK: the accuracy is as it should be.

BUT if I convert the entire FilterPredictor module to TensorRT, I get terrible accuracy issues!

I debugged the model with Polygraphy, ran debug reduce in bisect mode, and got the ONNX model reduced to the part where accuracy fails:

[image: bisect-reduced ONNX graph]

I further reduced the model with linear mode and got this: [image: linear-mode reduced ONNX graph]

Polygraphy reduced the model specifically to a part inside the transformer: torch.nn.MultiheadAttention (multihead_attn in the ONNX graph) is where the accuracy fails. This module is defined within PyTorch itself, not some random custom model made by somebody, so it is weird that the accuracy would fail here.

CAN ANYBODY EXPLAIN WHAT IS HAPPENING?...

If I convert ONLY the SUBMODULES to TensorRT, the ACCURACY IS OK, but if I convert the ENTIRE model, the accuracy is BAD...

Perhaps it is the tensor manipulations within predict_cls_bbreg_filters_parallel that cause the accuracy errors?
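For reference, the mismatch can be reproduced by running the same exported model through both TensorRT and ONNX-Runtime with Polygraphy's Python API. A minimal sketch (`filter_predictor.onnx` is an illustrative path standing in for the linked model):

```python
from polygraphy.backend.onnxrt import OnnxrtRunner, SessionFromOnnx
from polygraphy.backend.trt import EngineFromNetwork, NetworkFromOnnxPath, TrtRunner
from polygraphy.comparator import Comparator

model_path = "filter_predictor.onnx"  # illustrative path to the exported model

# Run the same ONNX model through TensorRT and ONNX-Runtime, then compare outputs.
runners = [
    TrtRunner(EngineFromNetwork(NetworkFromOnnxPath(model_path))),
    OnnxrtRunner(SessionFromOnnx(model_path)),
]
results = Comparator.run(runners)
assert bool(Comparator.compare_accuracy(results)), "TensorRT and ONNX-Runtime outputs diverge"
```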

Thank you for your help

Environment

TensorRT Version: 8.6.1

NVIDIA GPU: GTX 1660 Ti

NVIDIA Driver Version: 546.01

CUDA Version: 12.1 update 1

CUDNN Version: 8.9.7

Operating System: Windows 10

Python Version (if applicable): 3.10.13

PyTorch Version (if applicable): 2.1.2+cu121

Baremetal or Container (if so, version): Baremetal, running directly on Windows 10

Relevant Files

Link to ONNX model: https://drive.google.com/file/d/1-7r0AE_33kJK0KfzkZH-5_l8B8RL-uem/view?usp=sharing
Link to Polygraphy surgeon sanitized ONNX model: https://drive.google.com/file/d/1KbRR8dMXrLt_-durnIuS6vuByZHIdFAb/view?usp=sharing
Link to Polygraphy debug reduced model: https://drive.google.com/file/d/16h5zNU6lqBKUYb85Y2bE5-gMQXhNsJ07/view?usp=sharing

tp-nan commented 8 months ago

Based on my experience, you may need to set all Softmax layers to FP32. Python:

https://github.com/NVIDIA/TensorRT/blob/3d97932454aa38d140c28f044d02b635b5c6ed80/samples/python/efficientdet/build_engine.py#L189

C++:

https://github.com/torchpipe/torchpipe/blob/072a93df4216e27cdf874746e243afaafef45fa5/torchpipe/csrc/backend/src_cuda/tensorrt_utils.cpp#L146

https://torchpipe.github.io/docs/backend-reference/torch?_highlight=tensorrtt#tensorrttensor
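Roughly, the idea looks like this in Python (a minimal sketch, assuming you already have a parsed network and a builder config; flag names as of TensorRT 8.6):

```python
import tensorrt as trt

def force_softmax_fp32(network: trt.INetworkDefinition, config: trt.IBuilderConfig) -> None:
    """Pin every Softmax layer to FP32 within an otherwise-FP16 build."""
    config.set_flag(trt.BuilderFlag.FP16)  # let the rest of the network run in FP16
    config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS)  # honor per-layer precisions
    for i in range(network.num_layers):
        layer = network.get_layer(i)
        if layer.type == trt.LayerType.SOFTMAX:
            layer.precision = trt.float32
            layer.set_output_type(0, trt.float32)
```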

tp-nan commented 8 months ago

Was transformer_op17.engine built in FP16 or FP32?

ninono12345 commented 8 months ago

@tp-nan for now I always build FP32, because FP16 returns NaN

ninono12345 commented 8 months ago

I ask because all the models that run inside predict_cls_bbreg_filters_parallel seem to give good accuracy individually; maybe there is some specific function that TensorRT doesn't support very well?

ninono12345 commented 8 months ago

UPDATE: EVERYTHING, ALL ACCURACY ISSUES, GOT FIXED WHEN I FIRST TRANSFERRED TENSORS TO CPU BEFORE RUNNING INFERENCE ON THEM WITH POLYGRAPHY
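For anyone hitting the same thing, the change amounts to this (a sketch against the trt_engine1 call from the code above; passing CUDA tensors directly is what produced the bad results):

```python
# Move inputs to CPU before handing them to Polygraphy's TrtRunner,
# then move the outputs back onto the GPU, same as before.
out = self.trt_engine1.infer({'feat': feat.cpu(),
                              'mask': src_key_padding_mask.cpu(),
                              'query_embed': self.query_embed_fg_decoder.weight.detach().cpu(),
                              'pos_embed': pos.cpu()})
output_embed = out['output_embed'].cuda()
enc_mem = out['enc_mem'].cuda()
```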

tp-nan commented 8 months ago

Just an afterthought: onnxsim can be used to simplify the model. For FP16, you may need to make the Softmax layers run in FP32.
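A sketch of the onnxsim step (the path is illustrative; simplify returns the simplified model plus an equivalence-check flag):

```python
import onnx
from onnxsim import simplify

# Simplify the exported graph before building the TensorRT engine.
model = onnx.load("filter_predictor.onnx")  # illustrative path
model_sim, ok = simplify(model)
assert ok, "onnxsim's equivalence check failed"
onnx.save(model_sim, "filter_predictor_sim.onnx")
```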