Open swandxmr opened 7 months ago
进行单机双卡训练时,训练会在不确定的时间点异常中止;排查后发现训练过程中存在内存泄漏。
使用代码:
import paddle
import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist
import paddlex as pdx
from paddlex import transforms as T


def train_1(model, train_dataset, eval_dataset):
    """Per-process training entry point passed to ``dist.spawn``.

    Initializes the parallel environment in the current process and then
    runs ``model.train`` with the hyper-parameters of this reproduction.

    Args:
        model: Detection model exposing a PaddleX-style ``train`` method
            (the script passes a ``pdx.det.PPYOLOv2`` instance).
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for evaluation during training.
    """
    # print(paddle.distributed.is_initialized(),'------------')
    # 1. Initialize the parallel training environment.
    # The init_parallel_env() call in paddlex/__init__.py was commented out
    # by the reporter, so it is invoked explicitly here instead.
    dist.init_parallel_env()
    # model = pdx.det.PPYOLOv2(num_classes=num_classes, backbone='ResNet50_vd_dcn')
    # NOTE(review): every lr_decay_epochs value (105, 135, 150) is larger than
    # num_epochs=100, so presumably no decay step is reached — confirm intent.
    model.train(
        num_epochs=100,
        train_dataset=train_dataset,
        train_batch_size=4,
        eval_dataset=eval_dataset,
        pretrain_weights='IMAGENET',
        learning_rate=0.005 / 12,
        warmup_steps=1000,
        warmup_start_lr=0.0,
        lr_decay_epochs=[105, 135, 150],
        save_interval_epochs=5,
        save_dir='output/ppyolov2_r50vd_dcn')


if __name__ == '__main__':
    # NOTE(review): the original paste had its whitespace collapsed; the
    # indentation below is reconstructed, and everything down to dist.spawn
    # is assumed to live inside this main guard.
    dataset = 'https://bj.bcebos.com/paddlex/datasets/insect_det.tar.gz'
    pdx.utils.download_and_decompress(dataset, path='./')
    print(dist.get_rank(),'----------------get_rank')

    train_transforms = T.Compose([
        T.MixupImage(mixup_epoch=-1),
        T.RandomDistort(),
        T.RandomExpand(im_padding_value=[123.675, 116.28, 103.53]),
        T.RandomCrop(),
        T.RandomHorizontalFlip(),
        T.BatchRandomResize(
            target_sizes=[
                320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672,
                704, 736, 768
            ],
            interp='RANDOM'),
        T.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    eval_transforms = T.Compose([
        T.Resize(
            target_size=640, interp='CUBIC'),
        T.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Define the datasets used for training and evaluation.
    # API reference: https://github.com/PaddlePaddle/PaddleX/blob/release/2.0.0/docs/apis/datasets.md
    train_dataset = pdx.datasets.VOCDetection(
        data_dir='insect_det',
        file_list='insect_det/train_list.txt',
        label_list='insect_det/labels.txt',
        transforms=train_transforms,
        shuffle=True)

    eval_dataset = pdx.datasets.VOCDetection(
        data_dir='insect_det',
        file_list='insect_det/val_list.txt',
        label_list='insect_det/labels.txt',
        transforms=eval_transforms,
        shuffle=False)

    num_classes = len(train_dataset.labels)

    import os
    # NOTE(review): this is set after paddle/paddlex were imported and used;
    # whether the framework still honors it at this point should be verified.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    # The model and datasets are built in the parent process and handed to
    # each spawned worker through ``args``.
    model = pdx.det.PPYOLOv2(num_classes=num_classes, backbone='ResNet50_vd_dcn')
    dist.spawn(train_1, gpus='0,1', args=(model, train_dataset, eval_dataset))
同时注释了 `paddlex/__init__.py` 中的 `init_parallel_env()`。
PaddlePaddle :v2.5.1
PaddleX:2.1.0
操作系统:ubuntu20.04
请问您使用的Python版本是? python3.10
请问您使用的CUDA/cuDNN的版本号是? cuda:11.2.2
欢迎尝试使用PaddleX新版本,看看您的问题是否可以解决?https://aistudio.baidu.com/intro/paddlex
进行单机双卡训练时,训练会在不确定的时间点异常中止;排查后发现训练过程中存在内存泄漏。
使用代码:
同时注释了 `paddlex/__init__.py` 中的 `init_parallel_env()`。
环境
PaddlePaddle :v2.5.1
PaddleX:2.1.0
操作系统:ubuntu20.04
请问您使用的Python版本是? python3.10
请问您使用的CUDA/cuDNN的版本号是? cuda:11.2.2