Media-Smart / vedastr

A scene text recognition toolbox based on PyTorch
Apache License 2.0

WARNING: The shape inference of prim::Constant type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function. #88

Open choozhenbo opened 2 years ago

choozhenbo commented 2 years ago

In `./anaconda3/envs/vedastr/lib/python3.6/site-packages/torch/nn/functional.py`, I modified `adaptive_avg_pool2d` to:

```python
def adaptive_avg_pool2d(input: Tensor, output_size: BroadcastingList2[int]) -> Tensor:
    r"""
    Applies a 2D adaptive average pooling over an input signal composed of
    several input planes.

    See :class:`~torch.nn.AdaptiveAvgPool2d` for details and output shape.

    Args:
        output_size: the target output size (single integer or
            double-integer tuple)
    """
    if has_torch_function_unary(input):
        return handle_torch_function(adaptive_avg_pool2d, (input,), input, output_size)
    batch, channel, height, width = input.size()        # modified
    output_shape = [int(x) for x in input.size()[2:]]   # modified
    print(output_shape)                                 # modified
    print("hha", batch, channel, height, width)         # modified
    if torch.is_tensor(height):                         # modified
        height = input.size()[2].item()                 # modified
        width = input.size()[3].item()                  # modified
    #     print('SINI', height, width)                  # modified
    _output_size = _list_with_default(output_size, input.size())
    return torch._C._nn.adaptive_avg_pool2d(input, output_shape)  # modified
```
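
For context, here is a minimal repro sketch (mine, not from the original post) of why casting traced sizes with `int(...)` freezes them: `torch.onnx.export` traces the model, and under `torch.jit.trace` an `int(x.size(2))` becomes a Python constant that is baked into the graph, which is exactly what the `prim::Constant` warning is pointing at.

```python
# Sketch under the assumption that the patched pooling is traced the same
# way torch.onnx.export traces it; pool_like_patch mirrors the edit above.
import torch
import torch.nn.functional as F

def pool_like_patch(x):
    # int() on a traced size emits a TracerWarning and bakes a constant
    return F.adaptive_avg_pool2d(x, [int(s) for s in x.size()[2:]])

traced = torch.jit.trace(pool_like_patch, torch.randn(1, 1, 32, 100))
print(traced(torch.randn(1, 1, 32, 100)).shape)  # torch.Size([1, 1, 32, 100])
print(traced(torch.randn(1, 1, 16, 50)).shape)   # still torch.Size([1, 1, 32, 100]):
                                                 # the (32, 100) from trace time persists
```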

Since I am converting the resnet_bilstm_ctc model to ONNX, I assumed the warning could either be ignored or be fixed by exporting with opset_version=11. However, in my case opset_version=11 does not fix it: when I run the exported model, I get a wrong output dimension of [2, 256], while the correct output dimension is [26, 37]. Can anybody help me solve this? My configuration file for the model is below.

1. deploy

```python
size = (32, 100)
mean, std = 0.5, 0.5

sensitive = False
character = '0123456789abcdefghijklmnopqrstuvwxyz'
batch_max_length = 25

F = 20
hidden_dim = 256
norm_cfg = dict(type='BN')
num_class = len(character) + 1
num_steps = batch_max_length + 1

deploy = dict(
    transform=[
        dict(type='Sensitive', sensitive=sensitive, need_character=character),
        dict(type='ToGray'),
        dict(type='Resize', size=size),
        dict(type='Normalize', mean=mean, std=std),
        dict(type='ToTensor'),
    ],
    converter=dict(
        type='CTCConverter',
        character=character,
        batch_max_length=batch_max_length,
    ),
    model=dict(
        type='GModel',
        need_text=False,
        body=dict(
            type='GBody',
            pipelines=[
                dict(
                    type='FeatureExtractorComponent',
                    from_layer='input',
                    to_layer='cnn_feat',
                    arch=dict(
                        encoder=dict(
                            backbone=dict(
                                type='GResNet',
                                layers=[
                                    ('conv', dict(type='ConvModule', in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1, norm_cfg=norm_cfg)),
                                    ('conv', dict(type='ConvModule', in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1, norm_cfg=norm_cfg)),
                                    ('pool', dict(type='MaxPool2d', kernel_size=2, stride=2, padding=0)),
                                    ('block', dict(block_name='BasicBlock', planes=128, blocks=1, stride=1)),
                                    ('conv', dict(type='ConvModule', in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1, norm_cfg=norm_cfg)),
                                    ('pool', dict(type='MaxPool2d', kernel_size=2, stride=2, padding=0)),
                                    ('block', dict(block_name='BasicBlock', planes=256, blocks=2, stride=1)),
                                    ('conv', dict(type='ConvModule', in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1, norm_cfg=norm_cfg)),
                                    ('pool', dict(type='MaxPool2d', kernel_size=2, stride=(2, 1), padding=(0, 1))),
                                    ('block', dict(block_name='BasicBlock', planes=512, blocks=5, stride=1)),
                                    ('conv', dict(type='ConvModule', in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1, norm_cfg=norm_cfg)),
                                    ('block', dict(block_name='BasicBlock', planes=512, blocks=3, stride=1)),
                                    ('conv', dict(type='ConvModule', in_channels=512, out_channels=512, kernel_size=2, stride=(2, 1), padding=(0, 1), norm_cfg=norm_cfg)),
                                    ('conv', dict(type='ConvModule', in_channels=512, out_channels=512, kernel_size=2, stride=1, padding=0, norm_cfg=norm_cfg)),
                                ],
                            ),
                        ),
                        collect=dict(type='CollectBlock', from_layer='c4'),
                    ),
                ),
                dict(
                    type='SequenceEncoderComponent',
                    from_layer='cnn_feat',
                    to_layer='rnn_feat',
                    arch=dict(
                        type='RNN',
                        input_pool=dict(type='AdaptiveAvgPool2d', output_size=(1, None)),
                        layers=[
                            ('rnn', dict(type='LSTM', input_size=512, hidden_size=256, bidirectional=True, batch_first=True)),
                            ('fc', dict(type='Linear', in_features=512, out_features=256)),
                            ('rnn', dict(type='LSTM', input_size=256, hidden_size=256, bidirectional=True, batch_first=True)),
                            ('fc', dict(type='Linear', in_features=512, out_features=256)),
                        ],
                    ),
                ),
            ],
        ),
        head=dict(
            type='CTCHead',
            from_layer='rnn_feat',
            num_class=num_class,
            in_channels=256,
            pool=dict(
                type='AdaptiveAvgPool2d',
                output_size=(1, None),
            ),
        ),
    ),
)
```
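
A hedged end-to-end sketch of what I would check (the model variable, file name, and tensor names are illustrative, not from vedastr's tooling): export with opset_version=11 and explicit dynamic_axes, then compare ONNX Runtime output against the PyTorch model. With num_steps = 26 and num_class = 37 from the config above, the per-image logits should come out as (26, 37); note that `AdaptiveAvgPool2d(output_size=(1, None))` takes its output width from the input at runtime, which may be the shape-dependent op behind the `prim::Constant` warning.

```python
# Sketch under assumptions: `model` is the built deploy model in eval mode,
# "resnet_bilstm_ctc.onnx" is an illustrative output file name.
import numpy as np
import onnxruntime as ort
import torch

dummy = torch.randn(1, 1, 32, 100)  # (N, C, H, W) after ToGray/Resize/ToTensor
torch.onnx.export(
    model, dummy, "resnet_bilstm_ctc.onnx",
    opset_version=11,
    input_names=["input"],
    output_names=["logits"],
    dynamic_axes={"input": {0: "batch"}, "logits": {0: "batch"}},
)

sess = ort.InferenceSession("resnet_bilstm_ctc.onnx")
onnx_out = sess.run(None, {"input": dummy.numpy().astype(np.float32)})[0]
with torch.no_grad():
    torch_out = model(dummy).cpu().numpy()

# Both should be (1, 26, 37) = (batch, num_steps, num_class); a (2, 256)-shaped
# output would mean the exported graph diverged from the PyTorch model.
print(onnx_out.shape, torch_out.shape)
print(np.abs(onnx_out - torch_out).max())  # ~1e-5 if the export is faithful
```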