MegEngine / Models

采用MegEngine实现的各种主流深度学习模型
Other
302 stars 101 forks source link

大佬求帮助,在训练atss检测的时候,按照官方dump成mge出错,.tm模型不能可视化 #126

Open corleonechensiyu opened 1 year ago

corleonechensiyu commented 1 year ago

环境

1.系统环境:Ubuntu 22.04.2 2.MegEngine版本:1.13.0 3.python版本:3.10.12 4.模型名称:atss_res18_coco_3x_800size

复现步骤

1.训练

python  official/vision/detection/tools/train.py    -f official/vision/detection/configs/atss_res18_coco_3x_800size.py -n 1 -d data/coco/

2.convert

python convert.py -f official/vision/detection/configs/atss_res18_coco_3x_800size.py -w log-of-atss_res18_coco_3x_800size/epoch_9.pkl -i official/assets/cat.jpg

3.

请提供关键的代码片段便于追查问题

**##转换代码**
import numpy as np
import megengine.functional as F
import megengine.hub
from megengine import jit, tensor
import megengine as mge
import megengine.distributed as dist
from megengine.autodiff import GradManager
from megengine.data import DataLoader, Infinite, RandomSampler
from megengine.data import transform as T
from megengine.optimizer import SGD

from official.vision.detection.tools.data_mapper import data_mapper
from official.vision.detection.tools.utils import DetEvaluator, import_from_file
import megengine.traced_module as tm
import argparse
import bisect
import copy
import os
import time
import cv2

def make_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-w", "--weight_file", default=None, type=str, help="weights file",
    )
    parser.add_argument("-i", "--image", type=str)
    return parser

if __name__ == "__main__":
    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)

    ori_img = cv2.imread(args.image)
    image, im_info = DetEvaluator.process_inputs(
        ori_img.copy(), model.cfg.test_image_short_size, model.cfg.test_image_max_size,
    )
    state_dict = mge.load(args.weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)
    model.eval()

    traced_resnet = tm.trace_module(model, mge.tensor(image),im_info=mge.tensor(im_info))
    # 可以在这里进行基于 trace_module 的图手术,以及模型转换
    traced_resnet.eval()
    mge.save(traced_resnet,"test.tm")
    @jit.trace(symbolic=True, capture_as_const=True)
    def infer_func(data, im_info, model):
        pred = model(data,im_info)
        return pred

    output = infer_func(mge.tensor(image),im_info=mge.tensor(im_info), model=traced_resnet)
    infer_func.dump("log-of-atss_res18_coco_3x_800size/test.mge", arg_names=["data"])

请提供完整的日志及报错信息

25 17:37:50[mgb] WRN [dnn]
            Cudnn8 will jit ptx code with cache. You can set
            CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow).
            For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`
25 17:37:53[mgb] ERR error while applying optimization pass PassConvertToCompatible: bad input shape for polyadic operator: {256}, {1,256,136,100}

backtrace:
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb13MegBrainErrorC1ERKSs+0x4a) [0x7fdfdad5590a]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2db7867) [0x7fdfdadb7867]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN6megdnn12ErrorHandler15on_megdnn_errorERKSs+0x14) [0x7fdfde9d86a4]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a078d8) [0x7fdfdea078d8]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a07a91) [0x7fdfdea07a91]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb3opr8Elemwise20get_output_var_shapeEN6megdnn5param8Elemwise4ModeERKNS2_11SmallVectorINS2_11TensorShapeELj4EEE+0x37) [0x7fdfdaf150e7]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZNK3mgb3opr8Elemwise20get_output_var_shapeERKN6megdnn11SmallVectorINS2_11TensorShapeELj4EEERS5_+0x29) [0x7fdfdaf1e0c9]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg5mixin24OutshapePureByInshapeOpr10infer_descEmRN6megdnn11TensorShapeERKNS0_12static_infer6InpValE+0x19d) [0x7fdfdade9e0d]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg12static_infer22StaticInferManagerImpl13TagShapeTrait8do_inferERKNS1_6InpValE+0x57) [0x7fdfdae0b437]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2e0b292) [0x7fdfdae0b292]

Traceback (most recent call last):
  File "/home/csy/megvii/Models/convert.py", line 63, in <module>
    infer_func.dump("log-of-atss_res18_coco_3x_800size/test.mge", arg_names=["data"])
  File "/home/csy/.local/lib/python3.10/site-packages/megengine/jit/tracing.py", line 1183, in dump
    dump_content, dump_info = G.dump_graph(
  File "/home/csy/.local/lib/python3.10/site-packages/megengine/core/tensor/megbrain_graph.py", line 456, in dump_graph
    dump_content = _imperative_rt.dump_graph(
RuntimeError: bad input shape for polyadic operator: {256}, {1,256,136,100}

backtrace:
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb13MegBrainErrorC1ERKSs+0x4a) [0x7fdfdad5590a]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2db7867) [0x7fdfdadb7867]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN6megdnn12ErrorHandler15on_megdnn_errorERKSs+0x14) [0x7fdfde9d86a4]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a078d8) [0x7fdfdea078d8]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a07a91) [0x7fdfdea07a91]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb3opr8Elemwise20get_output_var_shapeEN6megdnn5param8Elemwise4ModeERKNS2_11SmallVectorINS2_11TensorShapeELj4EEE+0x37) [0x7fdfdaf150e7]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZNK3mgb3opr8Elemwise20get_output_var_shapeERKN6megdnn11SmallVectorINS2_11TensorShapeELj4EEERS5_+0x29) [0x7fdfdaf1e0c9]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg5mixin24OutshapePureByInshapeOpr10infer_descEmRN6megdnn11TensorShapeERKNS0_12static_infer6InpValE+0x19d) [0x7fdfdade9e0d]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg12static_infer22StaticInferManagerImpl13TagShapeTrait8do_inferERKNS1_6InpValE+0x57) [0x7fdfdae0b437]
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2e0b292) [0x7fdfdae0b292]
corleonechensiyu commented 1 year ago

73FA1F41-2486-4db3-BBC3-8AC824AE2EFD

corleonechensiyu commented 1 year ago

@FateScript 大佬帮忙看看