Closed Ldpe2G closed 2 years ago
CIFAR100
Framework | Init Method | Version | Epoch | Avg Time for Each Epoch | Acc@1 | Log |
---|---|---|---|---|---|---|
oneflow eager global 2d sbp | 随机初始化 | 基于 fix-zeros_like_sbp 分支加上 https://github.com/Oneflow-Inc/oneflow/pull/7225 的修改 | 300 | ~2m30s | 78.12% | log |
Timeout error at 300 seconds.
train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
File "main.py", line 157, in train_one_epoch
outputs = model(samples)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 685, in forward
x = self.forward_features(x)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 677, in forward_features
x = layer(x)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 540, in forward
x = blk(x)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 368, in forward
x = shortcut + self.drop_path(x)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 46, in forward
return drop_path(x, self.drop_prob, self.training)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 29, in drop_path
random_tensor = flow.rand(*shape, dtype=x.dtype, placement=x.placement, sbp=x.sbp) + keep_prob
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/modules/random_ops.py", line 137, in rand_op
requires_grad=requires_grad,
oneflow._oneflow_internal.exception.TimeoutException:
File "../oneflow/core/functional/impl/random_functor.cpp", line 135, in operator()
OpInterpUtil::Dispatch<Tensor>( *op_, {}, OpExprInterpContext(attrs, placement, nd_sbp, distribution_state))
File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 139, in Dispatch<oneflow::one::Tensor>
Dispatch<TensorTuple>(op_expr, inputs, ctx)
File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 131, in Dispatch<oneflow::one::TensorTuple>
Dispatch(op_expr, inputs, outputs.get(), ctx)
File "../oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply
internal_->Apply(op_expr, inputs, outputs, ctx)
File "../oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp", line 99, in Interpret
MetaInfoConsistencyCheck(parallel_desc, ctx.nd_sbp)
File "../oneflow/core/framework/consistency_check.cpp", line 233, in MetaInfoConsistencyCheck
MetaInfoConsistencyCheckUtil(placement, nd_sbp, Optional<Symbol<cfg::NdSbp>>())
File "../oneflow/core/framework/consistency_check.cpp", line 195, in MetaInfoConsistencyCheckUtil
TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
Callback()
Timeout error at 300 seconds.
Traceback (most recent call last):
File "main.py", line 333, in <module>
File "main.py", line 333, in <module>
main(config)
File "main.py", line 113, in main
train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
File "main.py", line 157, in train_one_epoch
outputs = model(samples)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
main(config)
File "main.py", line 113, in main
train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
File "main.py", line 155, in train_one_epoch
targets = targets.to_consistent(placement=placement, sbp=split_sbp)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/modules/consistent_cast.py", line 105, in to_consistent_op
return flow._C.to_consistent(input, placement, sbp, grad_sbp)
oneflow._oneflow_internal.exception.TimeoutException:
File "../oneflow/core/functional/impl/consistent_cast.cpp", line 359, in operator()
LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)
File "../oneflow/core/functional/impl/consistent_cast.cpp", line 290, in LocalToConsistent
OpInterpUtil::Dispatch<one::Tensor>( *op, {input}, OpExprInterpContext(attrs, parallel_desc, nd_sbp))
File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 139, in Dispatch<oneflow::one::Tensor>
Dispatch<TensorTuple>(op_expr, inputs, ctx)
File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 131, in Dispatch<oneflow::one::TensorTuple>
Dispatch(op_expr, inputs, outputs.get(), ctx)
File "../oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply
internal_->Apply(op_expr, inputs, outputs, ctx)
File "../oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp", line 354, in ApplyImpl
WithConsistencyChecked(consistent_tensor, [&]() -> ... Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()); return Maybe<void>::Ok(); })
File "../oneflow/core/framework/tensor_rpc_util.h", line 62, in Call
private_details::BusyWaitAndCheck(ctx)
File "../oneflow/core/framework/tensor_rpc_util.cpp", line 174, in BusyWaitAndCheck
TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
Callback()
Timeout error at 300 seconds.
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 685, in forward
x = self.forward_features(x)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 677, in forward_features
x = layer(x)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 540, in forward
x = blk(x)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 351, in forward
x_windows, mask=self.attn_mask
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
res = self.forward(*args, **kwargs)
File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 185, in forward
self.relative_position_index.view(-1)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/framework/tensor.py", line 78, in _getitem
return flow._C.tensor_getitem(self, key)
oneflow._oneflow_internal.exception.TimeoutException:
File "../oneflow/core/functional/impl/array_functor.cpp", line 1770, in operator()
ApplyAdvancedIndexing(result, tensor_indices)
File "../oneflow/core/functional/tensor_index.cpp", line 338, in ApplyAdvancedIndexing
ToConsistent(packed_indices, placement, std::vector<Symbol<cfg::SbpParallel>>(n, broadcast_sbp), grad_sbp_tuple)
File "../oneflow/core/functional/impl/consistent_cast.cpp", line 354, in operator()
MetaInfoConsistencyCheck(parallel_desc, sbp_parallels, grad_sbp_parallels)
File "../oneflow/core/framework/consistency_check.cpp", line 245, in MetaInfoConsistencyCheck
MetaInfoConsistencyCheck(placement, nd_sbp, grad_nd_sbp)
File "../oneflow/core/framework/consistency_check.cpp", line 225, in MetaInfoConsistencyCheck
MetaInfoConsistencyCheckUtil(placement, nd_sbp, grad_nd_sbp)
File "../oneflow/core/framework/consistency_check.cpp", line 195, in MetaInfoConsistencyCheckUtil
TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
Callback()
Timeout error at 300 seconds.
Traceback (most recent call last):
File "main.py", line 333, in <module>
main(config)
File "main.py", line 113, in main
train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
File "main.py", line 154, in train_one_epoch
samples = samples.to_consistent(placement=placement, sbp=split_sbp)
File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/modules/consistent_cast.py", line 105, in to_consistent_op
return flow._C.to_consistent(input, placement, sbp, grad_sbp)
oneflow._oneflow_internal.exception.TimeoutException:
File "../oneflow/core/functional/impl/consistent_cast.cpp", line 359, in operator()
LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)
File "../oneflow/core/functional/impl/consistent_cast.cpp", line 290, in LocalToConsistent
OpInterpUtil::Dispatch<one::Tensor>( *op, {input}, OpExprInterpContext(attrs, parallel_desc, nd_sbp))
File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 139, in Dispatch<oneflow::one::Tensor>
Dispatch<TensorTuple>(op_expr, inputs, ctx)
File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 131, in Dispatch<oneflow::one::TensorTuple>
Dispatch(op_expr, inputs, outputs.get(), ctx)
File "../oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply
internal_->Apply(op_expr, inputs, outputs, ctx)
File "../oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp", line 354, in ApplyImpl
WithConsistencyChecked(consistent_tensor, [&]() -> ... Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()); return Maybe<void>::Ok(); })
File "../oneflow/core/framework/tensor_rpc_util.h", line 62, in Call
private_details::BusyWaitAndCheck(ctx)
File "../oneflow/core/framework/tensor_rpc_util.cpp", line 174, in BusyWaitAndCheck
TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
Callback()
Timeout error at 300 seconds.
2d sbp 训练过程中发现，每训练几十个 epoch 之后会卡住，然后报超时错误（Timeout error at 300 seconds）。
不知道这个问题后来有没有解决。
eager global 1d sbp 完整训练
实验1: mixup on + clip-grad on
CIFAR100
实验结论