PaddlePaddle / PaddleX

PaddlePaddle End-to-End Development Toolkit(飞桨低代码开发工具)
Apache License 2.0
4.6k stars 905 forks source link

paddlex BML训练报错:(InvalidArgument) yolo_box(): argument 'X' (position 0) must be Tensor, but got Tensor (at /paddle/paddle/fluid/pybind/op_function_common.cc:818) #1744

Open Grubby-Wang opened 6 months ago

Grubby-Wang commented 6 months ago

Checklist:

  1. 查找历史相关issue寻求解答
  2. 翻阅FAQ常见问题汇总和答疑
  3. 确认bug是否在新版本里还未修复
  4. 翻阅PaddleX API文档说明

描述问题

训练过程正常,一个epoch结束后,刚开始验证就报错:

    `ValueError                                Traceback (most recent call last)
    /tmp/ipykernel_1258/573955976.py in <module>
         68     save_interval_epochs=5,
         69     lr_decay_epochs=[216, 243],
    ---> 70     save_dir='output/yolov3_darknet53')

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/detector.py in train(self, num_epochs, train_dataset, train_batch_size, eval_dataset, optimizer, save_interval_epochs, log_interval_steps, save_dir, pretrain_weights, learning_rate, warmup_steps, warmup_start_lr, lr_decay_epochs, lr_decay_gamma, metric, use_ema, early_stop, early_stop_patience, use_vdl, resume_checkpoint)
        332             early_stop=early_stop,
        333             early_stop_patience=early_stop_patience,
    --> 334             use_vdl=use_vdl)
        335 
        336     def quant_aware_train(self,

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/base.py in train_loop(self, num_epochs, train_dataset, train_batch_size, eval_dataset, save_interval_epochs, log_interval_steps, save_dir, ema, early_stop, early_stop_patience, use_vdl)
        395                         eval_dataset,
        396                         batch_size=eval_batch_size,
    --> 397                         return_details=True)
        398                     # 保存最优模型
        399                     if local_rank == 0:

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/detector.py in evaluate(self, eval_dataset, batch_size, metric, return_details)
        497             with paddle.no_grad():
        498                 for step, data in enumerate(self.eval_data_loader):
    --> 499                     outputs = self.run(self.net, data, 'eval')
        500                     eval_metric.update(data, outputs)
        501                 eval_metric.accumulate()

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/detector.py in run(self, net, inputs, mode)
        103 
        104     def run(self, net, inputs, mode):
    --> 105         net_out = net(inputs)
        106         if mode in ['train', 'eval']:
        107             outputs = net_out

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py in __call__(self, *inputs, **kwargs)
        946             and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()):
        947             self._build_once(*inputs, **kwargs)
    --> 948             return self.forward(*inputs, **kwargs)
        949         else:
        950             return self._dygraph_call_func(*inputs, **kwargs)

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/ppdet/modeling/architectures/meta_arch.py in forward(self, inputs)
         69             for inp in inputs_list:
         70                 self.inputs = inp
    ---> 71                 outs.append(self.get_pred())
         72 
         73             # multi-scale test

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/ppdet/modeling/architectures/yolo.py in get_pred(self)
        122 
        123     def get_pred(self):
    --> 124         return self._forward()

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/ppdet/modeling/architectures/yolo.py in _forward(self)
        113                     bbox, bbox_num = self.post_process(
        114                         yolo_head_outs, self.yolo_head.mask_anchors,
    --> 115                         self.inputs['im_shape'], self.inputs['scale_factor'])
        116                 output = {'bbox': bbox, 'bbox_num': bbox_num}
        117 

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py in __call__(self, *inputs, **kwargs)
        946             and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()):
        947             self._build_once(*inputs, **kwargs)
    --> 948             return self.forward(*inputs, **kwargs)
        949         else:
        950             return self._dygraph_call_func(*inputs, **kwargs)

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/ppdet/modeling/post_process.py in forward(self, head_out, rois, im_shape, scale_factor)
         65         """
         66         if self.nms is not None:
    ---> 67             bboxes, score = self.decode(head_out, rois, im_shape, scale_factor)
         68             bbox_pred, bbox_num, _ = self.nms(bboxes, score, self.num_classes)
         69         else:

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/ppdet/modeling/layers.py in __call__(self, yolo_head_out, anchors, im_shape, scale_factor, var_weight)
        546                                          self.num_classes, self.conf_thresh,
        547                                          self.downsample_ratio // 2**i,
    --> 548                                          self.clip_bbox, self.scale_x_y)
        549             boxes_list.append(boxes)
        550             scores_list.append(paddle.transpose(scores, perm=[0, 2, 1]))

    /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/ppdet/modeling/ops.py in yolo_box(x, origin_shape, anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, name)
        701                  conf_thresh, 'downsample_ratio', downsample_ratio,
        702                  'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y)
    --> 703         boxes, scores = core.ops.yolo_box(x, origin_shape, *attrs)
        704         return boxes, scores
        705     else:

    ValueError: (InvalidArgument) yolo_box(): argument 'X' (position 0) must be Tensor, but got Tensor (at /paddle/paddle/fluid/pybind/op_function_common.cc:818)`

复现

1. 找到类似的问题,反馈把paddle版本降低到2.3.2即可,但是降到这个版本后,又会报以下错误:

  `AttributeError                            Traceback (most recent call last)
  /tmp/ipykernel_263/2175501212.py in <module>
        1 import numpy as np
  ----> 2 import paddlex as pdx
        3 from paddlex import transforms as T
        4 
        5 # 定义训练和验证时的transforms

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/__init__.py in <module>
       18 init_parallel_env()
       19 
  ---> 20 from . import cv
       21 from . import seg
       22 from . import cls
  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/__init__.py in <module>
       13 # limitations under the License.
  ---> 15 from . import models
       16 from . import transforms
       17 from . import datasets

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/__init__.py in <module>
       13 # limitations under the License.
       14 
  ---> 15 from .segmenter import *
       16 from .classifier import *
       17 from .detector import *

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/segmenter.py in <module>
       26 from paddlex.utils import get_single_card_bs, DisablePrint
       27 import paddlex.utils.logging as logging
  ---> 28 from .base import BaseModel
       29 from .utils import seg_metrics as metrics
       30 from paddlex.utils.checkpoint import seg_pretrain_weights_dict

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddlex/cv/models/base.py in <module>
       23 import paddle
       24 from paddle.io import DataLoader, DistributedBatchSampler
  ---> 25 from paddleslim import QAT
       26 from paddleslim.analysis import flops
       27 from paddleslim import L1NormFilterPruner, FPGMFilterPruner

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/__init__.py in <module>
       15 from __future__ import absolute_import
       16 from paddleslim import models
  ---> 17 from paddleslim import prune
       18 from paddleslim import nas
       19 from paddleslim import analysis

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/prune/__init__.py in <module>
       14 
       15 from __future__ import absolute_import
  ---> 16 from .pruner import *
       17 from ..prune import pruner
       18 from .auto_pruner import *

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/prune/pruner.py in <module>
       18 import numpy as np
       19 from functools import reduce
  ---> 20 from ..core import VarWrapper, OpWrapper, GraphWrapper
       21 from .collections import StaticPruningCollections
       22 from .criterion import CRITERION

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/core/__init__.py in <module>
       17 from ..core import registry
       18 from .registry import *
  ---> 19 from ..core import dygraph
       20 from .dygraph import *
       21 

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/core/dygraph.py in <module>
        7 from paddle.fluid.dygraph.layers import Layer
        8 from paddle.fluid.framework import Block, ParamBase, Program, Variable
  ----> 9 from ..common import get_logger
       10 
       11 __all__ = ["dygraph2program"]

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/common/__init__.py in <module>
       19 from .lock import lock, unlock
       20 from .cached_reader import cached_reader
  ---> 21 from .server import Server
       22 from .client import Client
       23 from .meter import AvgrageMeter

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/common/server.py in <module>
       26 import threading
       27 from .log_helper import get_logger
  ---> 28 from .rl_controller.utils import add_grad, ConnectMessage
       29 
       30 _logger = get_logger(__name__, level=logging.INFO)

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddleslim/common/rl_controller/__init__.py in <module>
       17 _logger = get_logger(__name__, level=logging.INFO)
       18 try:
  ---> 19     import parl
       20     from .ddpg import *
       21 except ImportError as e:

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/parl/__init__.py in <module>
       45         from parl.core.torch import *
       46 
  ---> 47 from parl.remote import remote_class, connect
       48 from parl import algorithms

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/parl/remote/__init__.py in <module>
       17 from parl.remote.client import *
       18 from parl.remote.exceptions import *
  ---> 19 from parl.remote.remote_decorator import *

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/parl/remote/remote_decorator.py in <module>
       17 
       18 from parl.utils import logger
  ---> 19 from parl.remote.remote_wrapper import RemoteWrapper
       20 from parl.remote.proxy_wrapper import proxy_wrapper_func
       21 from parl.remote.future_mode import proxy_wrapper_nowait_func

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/parl/remote/remote_wrapper.py in <module>
       19 
       20 from parl.utils import logger, to_str, to_byte
  ---> 21 from parl.remote.communication import loads_argument, loads_return,\
       22     dumps_argument, dumps_return
       23 from parl.remote.client import get_global_client

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/parl/remote/communication.py in <module>
       36         return val
  ---> 38     context = pyarrow.default_serialization_context()
       40     # support deserialize in another environment

  /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/pyarrow/__init__.py in __getattr__(name)
      315 
      316     raise AttributeError(
  --> 317         "module 'pyarrow' has no attribute '{0}'".format(name)
      318     )
      319 

  AttributeError: module 'pyarrow' has no attribute 'default_serialization_context'`
  2. 再次升级paddle到2.5.0以上,在pip install paddlex过程中,又卡着动不了: image

  3. 实在让人崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!崩溃!!!

环境

  1. PaddlePaddle 2.4.0, PaddleX 2.1.0

  2. BML云训练

  3. Python: 3.7.4

  4. cuDNN Version: 8.2.

dyning commented 5 months ago

欢迎尝试使用新版本:https://aistudio.baidu.com/intro/paddlex