Open corleonechensiyu opened 1 year ago
1.系统环境:Ubuntu 22.04.2 2.MegEngine版本:1.13.0 3.python版本:3.10.12 4.模型名称:atss_res18_coco_3x_800size
1.训练
python official/vision/detection/tools/train.py -f official/vision/detection/configs/atss_res18_coco_3x_800size.py -n 1 -d data/coco/
2.convert
python convert.py -f official/vision/detection/configs/atss_res18_coco_3x_800size.py -w log-of-atss_res18_coco_3x_800size/epoch_9.pkl -i official/assets/cat.jpg
3.转换脚本(convert.py)及运行后的完整报错日志如下:
# Conversion script (convert.py): load a trained detection model, trace it with
# traced_module, then trace it with jit and dump the result to a .mge file.
import numpy as np
import megengine.functional as F
import megengine.hub
from megengine import jit, tensor
import megengine as mge
import megengine.distributed as dist
from megengine.autodiff import GradManager
from megengine.data import DataLoader, Infinite, RandomSampler
from megengine.data import transform as T
from megengine.optimizer import SGD
from official.vision.detection.tools.data_mapper import data_mapper
from official.vision.detection.tools.utils import DetEvaluator, import_from_file
import megengine.traced_module as tm
import argparse
import bisect
import copy
import os
import time
import cv2


def make_parser():
    """Build the command-line parser for the conversion script.

    Options:
        -f / --file:        network description file (default ``net.py``).
        -w / --weight_file: weights file to load (default ``None``).
        -i / --image:       sample image used as the tracing input.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-w", "--weight_file", default=None, type=str, help="weights file",
    )
    parser.add_argument("-i", "--image", type=str)
    return parser


def main():
    """Trace the model described by ``--file`` and dump it for inference.

    Writes ``test.tm`` (the traced module) to the current directory and
    ``log-of-atss_res18_coco_3x_800size/test.mge`` (the dumped graph).

    Raises:
        SystemExit: if ``--weight_file`` or ``--image`` is missing (reported
            through ``parser.error``).
        RuntimeError: if the image cannot be read by ``cv2.imread``.
    """
    parser = make_parser()
    args = parser.parse_args()

    # Fail fast with a clear message instead of crashing later inside
    # mge.load(None) / cv2.imread(None).
    if args.weight_file is None:
        parser.error("a weights file is required (-w/--weight_file)")
    if args.image is None:
        parser.error("an input image is required (-i/--image)")

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)

    ori_img = cv2.imread(args.image)
    # cv2.imread signals failure by returning None rather than raising.
    if ori_img is None:
        raise RuntimeError("cv2.imread could not read image: {!r}".format(args.image))

    image, im_info = DetEvaluator.process_inputs(
        ori_img.copy(),
        model.cfg.test_image_short_size,
        model.cfg.test_image_max_size,
    )

    state_dict = mge.load(args.weight_file)
    # Checkpoints may wrap the weights under a "state_dict" key.
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)
    model.eval()

    traced_resnet = tm.trace_module(
        model, mge.tensor(image), im_info=mge.tensor(im_info)
    )
    # Graph surgery based on trace_module, as well as model conversion,
    # can be done here.
    traced_resnet.eval()
    mge.save(traced_resnet, "test.tm")

    @jit.trace(symbolic=True, capture_as_const=True)
    def infer_func(data, im_info, model):
        """Run one forward pass of ``model`` so that jit can record it."""
        pred = model(data, im_info)
        return pred

    # The call is needed to record the trace; its result is not used.
    infer_func(mge.tensor(image), im_info=mge.tensor(im_info), model=traced_resnet)

    # NOTE(review): the reported "bad input shape for polyadic operator"
    # error is raised from this dump call (optimization pass
    # PassConvertToCompatible, per the log); root cause not established here.
    infer_func.dump("log-of-atss_res18_coco_3x_800size/test.mge", arg_names=["data"])


if __name__ == "__main__":
    main()
25 17:37:50[mgb] WRN [dnn] Cudnn8 will jit ptx code with cache. You can set CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow). For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache` 25 17:37:53[mgb] ERR error while applying optimization pass PassConvertToCompatible: bad input shape for polyadic operator: {256}, {1,256,136,100} backtrace: /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb13MegBrainErrorC1ERKSs+0x4a) [0x7fdfdad5590a] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2db7867) [0x7fdfdadb7867] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN6megdnn12ErrorHandler15on_megdnn_errorERKSs+0x14) [0x7fdfde9d86a4] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a078d8) [0x7fdfdea078d8] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a07a91) [0x7fdfdea07a91] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb3opr8Elemwise20get_output_var_shapeEN6megdnn5param8Elemwise4ModeERKNS2_11SmallVectorINS2_11TensorShapeELj4EEE+0x37) [0x7fdfdaf150e7] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZNK3mgb3opr8Elemwise20get_output_var_shapeERKN6megdnn11SmallVectorINS2_11TensorShapeELj4EEERS5_+0x29) [0x7fdfdaf1e0c9] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg5mixin24OutshapePureByInshapeOpr10infer_descEmRN6megdnn11TensorShapeERKNS0_12static_infer6InpValE+0x19d) [0x7fdfdade9e0d] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg12static_infer22StaticInferManagerImpl13TagShapeTrait8do_inferERKNS1_6InpValE+0x57) [0x7fdfdae0b437] 
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2e0b292) [0x7fdfdae0b292] Traceback (most recent call last): File "/home/csy/megvii/Models/convert.py", line 63, in <module> infer_func.dump("log-of-atss_res18_coco_3x_800size/test.mge", arg_names=["data"]) File "/home/csy/.local/lib/python3.10/site-packages/megengine/jit/tracing.py", line 1183, in dump dump_content, dump_info = G.dump_graph( File "/home/csy/.local/lib/python3.10/site-packages/megengine/core/tensor/megbrain_graph.py", line 456, in dump_graph dump_content = _imperative_rt.dump_graph( RuntimeError: bad input shape for polyadic operator: {256}, {1,256,136,100} backtrace: /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb13MegBrainErrorC1ERKSs+0x4a) [0x7fdfdad5590a] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2db7867) [0x7fdfdadb7867] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN6megdnn12ErrorHandler15on_megdnn_errorERKSs+0x14) [0x7fdfde9d86a4] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a078d8) [0x7fdfdea078d8] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x6a07a91) [0x7fdfdea07a91] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb3opr8Elemwise20get_output_var_shapeEN6megdnn5param8Elemwise4ModeERKNS2_11SmallVectorINS2_11TensorShapeELj4EEE+0x37) [0x7fdfdaf150e7] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZNK3mgb3opr8Elemwise20get_output_var_shapeERKN6megdnn11SmallVectorINS2_11TensorShapeELj4EEERS5_+0x29) [0x7fdfdaf1e0c9] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg5mixin24OutshapePureByInshapeOpr10infer_descEmRN6megdnn11TensorShapeERKNS0_12static_infer6InpValE+0x19d) [0x7fdfdade9e0d] 
/home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(_ZN3mgb2cg12static_infer22StaticInferManagerImpl13TagShapeTrait8do_inferERKNS1_6InpValE+0x57) [0x7fdfdae0b437] /home/csy/.local/lib/python3.10/site-packages/megengine/core/lib/libmegengine_shared.so(+0x2e0b292) [0x7fdfdae0b292]
@FateScript 大佬帮忙看看
环境
1.系统环境:Ubuntu 22.04.2 2.MegEngine版本:1.13.0 3.python版本:3.10.12 4.模型名称:atss_res18_coco_3x_800size
复现步骤
1.训练
2.convert
3.运行 convert.py,在 `infer_func.dump(...)` 处报错(见上方日志)
请提供关键的代码片段便于追查问题
请提供完整的日志及报错信息