Closed Nefefilibata closed 6 months ago
模型是PVT-V2。recompute在单卡下可以正常训练,但是在四卡并行会报如上错误。四卡下关闭recompute能正常训练。 在模型上只修改了这部分 def forward_features(self, x): B = x.shape[0]
for i in range(self.num_stages):
patch_embed = getattr(self, f"patch_embed{i + 1}")
block = getattr(self, f"block{i + 1}")
norm = getattr(self, f"norm{i + 1}")
x, H, W = patch_embed(x)
# x, H, W = recompute(patch_embed,x)
for blk in block:
x = recompute(blk,x,H,W)
# x = blk(x,H,W)
x = norm(x)
if i != self.num_stages - 1:
x = x.reshape([B, H, W, -1]).transpose([0, 3, 1, 2])
return x.mean(axis=1)
def forward(self, x): x = self.forward_features(x) x = self.head(x) return x 即在原有的forward_features上加入了recompute
@haohongxiang
补充一下训练代码 ########################
from paddle.distributed.sharding import group_sharded_parallel, save_group_sharded_model fleet.init(is_collective=True) group = paddle.distributed.new_group([0,1,2,3]) use_pure_fp16 = True clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) def train2():
# model = ResNet(BasicBlock, 18)
model = PyramidVisionTransformerV2(
patch_size=4, embed_dims=[64, 128, 320, 512],num_classes=11, num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], qkv_bias=True,
norm_layer=partial(nn.LayerNorm, epsilon=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], linear=True)
model = paddle.DataParallel(model,find_unused_parameters=True)
# 配置loss函数
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
scaler = paddle.amp.GradScaler(init_loss_scaling=2)
optimizer = paddle.optimizer.AdamW(learning_rate=0.001, parameters=model.parameters(), weight_decay=0.00001, grad_clip=clip)
# )
model, optimizer, scaler = group_sharded_parallel(model, optimizer, "os_g", scaler=scaler)
# 四、构建分布式训练使用的优化器
# optimizer = fleet.distributed_optimizer(optimizer)
train_sampler = DistributedBatchSampler(train_dataset, 50, shuffle=True, drop_last=True)
train_loader = DataLoader(train_dataset, batch_sampler=train_sampler, num_workers=1)
valid_sampler = DistributedBatchSampler(valid_dataset, 50, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_sampler=valid_sampler, num_workers=1)
num_epochs=100
Iters, total_loss, total_acc = [], [], []
model.train()
for epo in range(num_epochs):
for batch_id, data in enumerate(train_loader()):
x_data = data[0]
y_data = data[1]
y_data = paddle.reshape(y_data, [-1,1])
with paddle.amp.auto_cast():
with model.no_sync():
predicts = model(x_data)
loss = nn.functional.softmax_with_cross_entropy(predicts, y_data)
avg_loss = paddle.mean(loss)
if batch_id%1==0:
#打印中间过程
print("epoch: {}, batch_id: {}, loss is: {}".format(epo, batch_id+1, avg_loss.numpy()))
# 更新参数
if 1:
scaler.scale(avg_loss).backward()
fused_allreduce_gradients(list(model.parameters()), None)
scaler.step(optimizer)
scaler.update()
else:
loss.backward()
optimizer.step()
optimizer.clear_grad()
model.eval()
Acc = []
for batch_id, data in enumerate(valid_loader()):
x_data = data[0]
y_data = data[1]
y_data = paddle.reshape(y_data, [-1,1])
with paddle.amp.auto_cast():
predicts = model(x_data)
acc = paddle.metric.accuracy(predicts, y_data)
Acc.append(acc.numpy())
print('acc is {}'.format(sum(Acc)/len(Acc)))
model.train()
save_group_sharded_model(model, 'MODEL', optimizer)
你好,推荐使用https://github.com/PaddlePaddle/PaddleNLP/tree/develop/llm
不加入recompute,单机四卡分组切分并行可以正常训练 但开启recompute后 即在模型中:x = blk(x,H,W) 改为:x = recompute(blk,x,H,W) 训练完第一个epoch后报以下错误: epoch: 1, batch_id: 1, loss is: [2.6346893] Traceback (most recent call last): File "PVT-v2_distribute.py", line 425, in train2() File "PVT-v2_distribute.py", line 402, in train2 scaler.scale(avg_loss).backward() File "", line 2, in backward File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/wrapped_decorator.py", line 26, in impl return wrapped_func(*args, *kwargs) File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/framework.py", line 534, in impl return func(args, **kwargs) File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/varbase_patch_methods.py", line 297, in backward core.eager.run_backward([self], grad_tensor, retain_graph) OSError: (External) RuntimeError: (PreconditionNotMet) Error happened, when parameter[341][linear_79.b_0] has been ready before. Please set find_unused_parameters=True to traverse backward graph in each step to prepare reduce in advance. If you have set, there may be several reasons for this error: 1) In multiple reentrant backward phase, some parameters are reused.2) Using model parameters outside of forward function. Please make sure that model parameters are not shared in concurrent forward-backward passes. [Hint: Expected has_marked_unusedvars == false, but received has_marked_unusedvars:1 != false:0.] (at /paddle/paddle/fluid/distributed/collective/reducer.cc:689)
At: /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/autograd/backward_mode.py(125): backward /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/framework.py(534): impl /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/wrapped_decorator.py(26): impl (2): backward /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/distributed/fleet/recompute/recompute.py(392): backward /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/autograd/py_layer.py(546): backward /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/dygraph/varbase_patch_methods.py(297): backward /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/framework.py(534): impl /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/wrapped_decorator.py(26): impl (2): backward PVT-v2_distribute.py(402): train2 PVT-v2_distribute.py(425): (at /paddle/paddle/fluid/eager/pylayer/py_layer_node.cc:113)