PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (PaddlePaddle core framework: high-performance single-machine and distributed training and cross-platform deployment for deep learning & machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

FatalError: `Process abort signal` is detected by the operating system. #67506

Open · SunNoJJ opened this issue 2 months ago

SunNoJJ commented 2 months ago

Describe the Bug

Model code

"""
@项目名称:FPN.py
@作   者:陆地起飞全靠浪
@创建日期:2022-07-05-16:27
"""
import paddle
from paddle import nn
import paddle.nn.functional as F
from Networks.ConvolutionalBlockAttentionModule import CBAM

class ConvBnReLU(nn.Layer):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super(ConvBnReLU, self).__init__()
        self.conv = nn.Conv2D(in_channels, out_channels, kernel_size, stride, padding, bias_attr=False)
        self.bn = nn.BatchNorm2D(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return F.relu(x)

class FeaturePyramidNetwork(nn.Layer):
    def __init__(self, num_classes, base_out_channels=16):
        super(FeaturePyramidNetwork, self).__init__()
        self.num_classes = num_classes
        self.conv0 = ConvBnReLU(2, base_out_channels, 3, 1, 1)
        self.conv1 = ConvBnReLU(base_out_channels, base_out_channels, 3, 1, 1)

        self.conv2 = ConvBnReLU(base_out_channels, base_out_channels * 2, 5, 2, 2)
        self.conv3 = ConvBnReLU(base_out_channels * 2, base_out_channels * 2, 3, 1, 1)
        self.conv4 = ConvBnReLU(base_out_channels * 2, base_out_channels * 2, 3, 1, 1)

        self.conv5 = ConvBnReLU(base_out_channels * 2, base_out_channels * 4, 5, 2, 2)
        self.conv6 = ConvBnReLU(base_out_channels * 4, base_out_channels * 4, 3, 1, 1)
        self.conv7 = ConvBnReLU(base_out_channels * 4, base_out_channels * 4, 3, 1, 1)

        self.conv8 = ConvBnReLU(base_out_channels * 4, base_out_channels * 8, 5, 2, 2)
        self.conv9 = ConvBnReLU(base_out_channels * 8, base_out_channels * 8, 3, 1, 1)
        self.conv10 = ConvBnReLU(base_out_channels * 8, base_out_channels * 8, 3, 1, 1)

        self.inner1 = nn.Conv2D(base_out_channels * 4, base_out_channels * 8, 1)
        self.inner2 = nn.Conv2D(base_out_channels * 2, base_out_channels * 8, 1)
        self.inner3 = nn.Conv2D(base_out_channels * 1, base_out_channels * 8, 1)

        self.out1 = nn.Conv2D(base_out_channels * 8, base_out_channels * 8, 1, bias_attr=False)
        self.out2 = nn.Conv2D(base_out_channels * 8, base_out_channels * 4, 1, bias_attr=False)
        self.out3 = nn.Conv2D(base_out_channels * 8, base_out_channels * 2, 1, bias_attr=False)
        self.out4 = nn.Conv2D(base_out_channels * 8, base_out_channels, 1, bias_attr=False)

        self.conv2d = nn.Conv2D(in_channels=3, out_channels=3, kernel_size=[1, 9])
        self.flatten = nn.Flatten(1, -1)
        # in_features below are hard-coded to match the flattened stage outputs
        self.linear_stage1 = nn.Linear(327680, self.num_classes * 10)
        self.linear_stage2 = nn.Linear(655360, self.num_classes * 10)
        self.linear_stage3 = nn.Linear(1310720, self.num_classes * 10)
        self.linear_stage4 = nn.Linear(2621440, self.num_classes * 10)
        self.linear_concat = nn.Linear(self.num_classes * 10 * 4, self.num_classes * 10 * 4)
        self.linear_out = nn.Linear(self.num_classes * 10 * 4, self.num_classes)
        self.softmax = nn.Softmax()  # maps each row's elements into [0, 1] so they sum to 1
        # CBAM(in_channels=24, kernel_size=3)
        self.cbam16 = CBAM(base_out_channels)
        self.cbam32 = CBAM(base_out_channels * 2)
        self.cbam64 = CBAM(base_out_channels * 4)
        self.cbam128 = CBAM(base_out_channels * 8)
        paddle.seed(1000)
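        # NOTE: sparse=True makes this embedding produce sparse gradient updates;
        # the AdamW traceback below suggests (but does not confirm) that this is
        # where the reported crash surfaces.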
        self.emb = nn.Embedding(500, 512, sparse=True)

    def forward(self, x):  # x: (B, 2, 320) integer indices
        x = self.emb(x)
        conv1 = self.conv1(self.conv0(x))
        conv1 = self.cbam16(conv1)

        conv4 = self.conv4(self.conv3(self.conv2(conv1)))
        conv4 = self.cbam32(conv4)

        conv7 = self.conv7(self.conv6(self.conv5(conv4)))
        conv7 = self.cbam64(conv7)

        intra_feat = self.conv10(self.conv9(self.conv8(conv7)))
        intra_feat = self.cbam128(intra_feat)
        stage1 = self.out1(intra_feat)

        intra_feat = F.interpolate(intra_feat, scale_factor=2, mode="nearest") + self.inner1(conv7)
        intra_feat = self.cbam128(intra_feat)
        stage2 = self.out2(intra_feat)

        intra_feat = F.interpolate(intra_feat, scale_factor=2, mode="nearest") + self.inner2(conv4)
        intra_feat = self.cbam128(intra_feat)
        stage3 = self.out3(intra_feat)

        intra_feat = F.interpolate(intra_feat, scale_factor=2, mode="nearest") + self.inner3(conv1)
        intra_feat = self.cbam128(intra_feat)
        stage4 = self.out4(intra_feat)

        stage1 = self.flatten(stage1)
        stage2 = self.flatten(stage2)
        stage3 = self.flatten(stage3)
        stage4 = self.flatten(stage4)

        stage1 = self.linear_stage1(stage1)
        stage2 = self.linear_stage2(stage2)
        stage3 = self.linear_stage3(stage3)
        stage4 = self.linear_stage4(stage4)

        concat = self.linear_concat(paddle.concat([stage1, stage2, stage3, stage4], axis=1))
        for i in range(7):
            concat = self.linear_concat(concat)
            if i % 3 == 0:
                concat = F.relu(concat)
        output = self.linear_out(concat)
        return self.softmax(output)

Calling code

import paddle
from Networks.FPN320 import FeaturePyramidNetwork as FPN

beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")

model = FPN(num_classes=2)
model.train()
opt = paddle.optimizer.AdamW(learning_rate=0.1,
                             parameters=model.parameters(),
                             beta1=beta1,
                             beta2=beta2,
                             weight_decay=0.01
                             )
img_c, img_h = [2, 320]
batch = 2
x = paddle.ones((batch, img_c, img_h)).astype('int32')

out = model(x)  # forward pass (batch=2)

loss = paddle.mean(out)
loss.backward()
opt.step()
opt.clear_grad()

Error message

W0816 14:40:19.312968 203446 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 8.6, Driver API Version: 12.4, Runtime API Version: 11.7
W0816 14:40:19.316589 203446 gpu_resources.cc:164] device: 0, cuDNN Version: 8.9.
/sdb2/Software/andconda3Install/envs/base39/lib/python3.9/site-packages/paddle/nn/layer/norm.py:824: UserWarning: When training, we now always track global mean and variance.
  warnings.warn(
terminate called after throwing an instance of 'common::PD_Exception'
  what():  Unimplemented error. Invalid dimension to be accessed. Now only supports access to dimension 0 to 9, but received dimension is 771776311.
  [../paddle/common/ddim.h:114]

--------------------------------------
C++ Traceback (most recent call last):
--------------------------------------
0   paddle::pybind::eager_api_adamw_(_object*, _object*, _object*)
1   adamw__ad_func(paddle::Tensor&, paddle::Tensor const&, paddle::Tensor const&, paddle::Tensor&, paddle::Tensor&, paddle::Tensor&, paddle::Tensor&, paddle::optional<paddle::Tensor>&, paddle::optional<paddle::Tensor> const&, paddle::experimental::ScalarBase<paddle::Tensor>, paddle::experimental::ScalarBase<paddle::Tensor>, paddle::experimental::ScalarBase<paddle::Tensor>, float, float, bool, bool, long, bool, bool)
2   paddle::experimental::adamw_(paddle::Tensor&, paddle::Tensor const&, paddle::Tensor const&, paddle::Tensor&, paddle::Tensor&, paddle::Tensor&, paddle::Tensor&, paddle::optional<paddle::Tensor>&, paddle::optional<paddle::Tensor> const&, paddle::experimental::ScalarBase<paddle::Tensor> const&, paddle::experimental::ScalarBase<paddle::Tensor> const&, paddle::experimental::ScalarBase<paddle::Tensor> const&, float, float, bool, bool, long, bool, bool)
3   paddle::experimental::PrepareData(paddle::Tensor const&, phi::TensorArgDef const&, paddle::experimental::TransformFlag const&, bool)
4   phi::DenseTensorMeta::is_contiguous() const
5   phi::DenseTensorMeta::calc_strides(common::DDim const&)

----------------------
Error Message Summary:
----------------------
FatalError: `Process abort signal` is detected by the operating system.
  [TimeInfo: *** Aborted at 1723790420 (unix time) try "date -d @1723790420" if you are using GNU date ***]
  [SignalInfo: *** SIGABRT (@0x3e800031ab6) received by PID 203446 (TID 0x73c2ee532440) from PID 203446 ***]

Process finished with exit code 134 (interrupted by signal 6:SIGABRT)

Paddle package versions

paddle2onnx                  1.2.4
paddlefsl                    1.1.0
paddlehub                    2.4.0
paddlenlp                    2.8.0
paddleocr                    2.7.3
paddlepaddle-gpu             2.6.1.post117

Where the error originates

paddle.optimizer.AdamW
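
Based on the C++ traceback (adamw_ → PrepareData → calc_strides), here is a stripped-down sketch that may isolate the failing combination; the assumption that the sparse Embedding gradient is the trigger is mine and unconfirmed:

import paddle
from paddle import nn

# Assumption: the sparse embedding gradient reaching AdamW is what trips
# calc_strides; everything else from the model above has been removed.
emb = nn.Embedding(500, 512, sparse=True)
opt = paddle.optimizer.AdamW(learning_rate=0.1, parameters=emb.parameters())

x = paddle.randint(0, 500, shape=[2, 8])  # integer indices, as in the repro
loss = emb(x).mean()
loss.backward()
opt.step()  # the reported FatalError would surface inside the adamw_ call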

System info (fails on both the local machine and the server)

6.5.0-28-generic
Linux ltl-LEGION-REN7000K-26IAB 6.5.0-28-generic #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  4 14:39:20 UTC 2 x86_64 x86_64 x86_64 GNU/Linux
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 22.04.1 LTS
Release:    22.04
Codename:   jammy
Linux version 6.5.0-28-generic (buildd@lcy02-amd64-098) (x86_64-linux-gnu-gcc-12 (Ubuntu 12.3.0-1ubuntu1~22.04) 12.3.0, GNU ld (GNU Binutils for Ubuntu) 2.38) #29~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Apr  4 14:39:20 UTC 2

Additional Supplementary Information

No response

ronny1996 commented 2 months ago

Hi, this looks like a bug. You can try export FLAGS_use_stride_kernel=0, or upgrade your Paddle version.
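
For completeness, a minimal sketch of setting that flag from Python instead of the shell; the assumption is that paddle reads FLAGS_* environment variables when it is first imported, so this must run before the import:

import os
os.environ["FLAGS_use_stride_kernel"] = "0"  # must be set before paddle loads

import paddle  # paddle picks up FLAGS_* from the environment at startup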

SunNoJJ commented 2 months ago

paddlepaddle-gpu 2.6.1.post117 is already the latest release, and export FLAGS_use_stride_kernel=0 did not help.

ronny1996 commented 2 months ago

Hi, you can set export GLOG_v=10 to see which tensor's rank exceeds 9.

Or try upgrading to the 3.0 beta to see whether that resolves it:

python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
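
A sketch of how the verbose log could be captured for inspection (repro.py is a placeholder name for the reproduction script above; glog output goes to stderr):

import os, subprocess

# Run the repro with verbose glog logging and keep stderr for inspection.
env = dict(os.environ, GLOG_v="10")
with open("glog.txt", "w") as log:
    subprocess.run(["python", "repro.py"], env=env, stderr=log)
# Search glog.txt for the dims logged around the adamw_ call to find the
# tensor that reports the out-of-range dimension.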