Oneflow-Inc / swin-transformer

0 stars 0 forks source link

Swin-Transformer Eager Global 完整训练实验 #10

Closed Ldpe2G closed 2 years ago

Ldpe2G commented 2 years ago

eager global 1d sbp 完整训练

实验1: mixup on + clip-grad on

| Framework | Init Method | Version | Epoch | Avg Time for Each Epoch | Acc@1 | Log |
| --- | --- | --- | --- | --- | --- | --- |
| pytorch | 固定初始化 | 1.10.1 | 300 | ~50s | 78.5% | log |
| oneflow ddp | 随机初始化 | master nightly 0.7.0.dev20210119+cu112 | 300 | ~58s | 78.92% | log |
| oneflow eager global 1d sbp | 随机初始化 | 基于 profiling_fuse_ddp 分支加上 https://github.com/Oneflow-Inc/oneflow/pull/7225 的修改 | 300 | ~2m50s | 78.35% | log |

实验结论

Ldpe2G commented 2 years ago

eager global 2d sbp 完整训练

实验1: mixup on + clip-grad on

| Framework | Init Method | Version | Epoch | Avg Time for Each Epoch | Acc@1 | Log |
| --- | --- | --- | --- | --- | --- | --- |
| oneflow eager global 2d sbp | 随机初始化 | 基于 fix-zeros_like_sbp 分支加上 https://github.com/Oneflow-Inc/oneflow/pull/7225 的修改 | 300 | ~2m30s | 78.12% | log |

实验结论

目前遇到的报错信息:

1.

Timeout error at 300 seconds.
    train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
  File "main.py", line 157, in train_one_epoch
    outputs = model(samples)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 685, in forward
    x = self.forward_features(x)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 677, in forward_features
    x = layer(x)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 540, in forward
    x = blk(x)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 368, in forward
    x = shortcut + self.drop_path(x)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 46, in forward
    return drop_path(x, self.drop_prob, self.training)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 29, in drop_path
    random_tensor = flow.rand(*shape, dtype=x.dtype, placement=x.placement, sbp=x.sbp) + keep_prob
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/modules/random_ops.py", line 137, in rand_op
    requires_grad=requires_grad,
oneflow._oneflow_internal.exception.TimeoutException: 
  File "../oneflow/core/functional/impl/random_functor.cpp", line 135, in operator()
    OpInterpUtil::Dispatch<Tensor>( *op_, {}, OpExprInterpContext(attrs, placement, nd_sbp, distribution_state))
  File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 139, in Dispatch<oneflow::one::Tensor>
    Dispatch<TensorTuple>(op_expr, inputs, ctx)
  File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 131, in Dispatch<oneflow::one::TensorTuple>
    Dispatch(op_expr, inputs, outputs.get(), ctx)
  File "../oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply
    internal_->Apply(op_expr, inputs, outputs, ctx)
  File "../oneflow/core/framework/op_interpreter/eager_consistent_op_interpreter.cpp", line 99, in Interpret
    MetaInfoConsistencyCheck(parallel_desc, ctx.nd_sbp)
  File "../oneflow/core/framework/consistency_check.cpp", line 233, in MetaInfoConsistencyCheck
    MetaInfoConsistencyCheckUtil(placement, nd_sbp, Optional<Symbol<cfg::NdSbp>>())
  File "../oneflow/core/framework/consistency_check.cpp", line 195, in MetaInfoConsistencyCheckUtil
    TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
  File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
    SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
  File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
    Callback()
Timeout error at 300 seconds.

2.(注:以下日志为两个进程的报错输出交错打印的结果)

Traceback (most recent call last):
  File "main.py", line 333, in <module>
  File "main.py", line 333, in <module>
    main(config)
  File "main.py", line 113, in main
    train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
  File "main.py", line 157, in train_one_epoch
    outputs = model(samples)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    main(config)
  File "main.py", line 113, in main
    train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
  File "main.py", line 155, in train_one_epoch
    targets = targets.to_consistent(placement=placement, sbp=split_sbp)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/modules/consistent_cast.py", line 105, in to_consistent_op
    return flow._C.to_consistent(input, placement, sbp, grad_sbp)
oneflow._oneflow_internal.exception.TimeoutException: 
  File "../oneflow/core/functional/impl/consistent_cast.cpp", line 359, in operator()
    LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)
  File "../oneflow/core/functional/impl/consistent_cast.cpp", line 290, in LocalToConsistent
    OpInterpUtil::Dispatch<one::Tensor>( *op, {input}, OpExprInterpContext(attrs, parallel_desc, nd_sbp))
  File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 139, in Dispatch<oneflow::one::Tensor>
    Dispatch<TensorTuple>(op_expr, inputs, ctx)
  File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 131, in Dispatch<oneflow::one::TensorTuple>
    Dispatch(op_expr, inputs, outputs.get(), ctx)
  File "../oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply
    internal_->Apply(op_expr, inputs, outputs, ctx)
  File "../oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp", line 354, in ApplyImpl
    WithConsistencyChecked(consistent_tensor, [&]() ->  ... Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()); return Maybe<void>::Ok(); })
  File "../oneflow/core/framework/tensor_rpc_util.h", line 62, in Call
    private_details::BusyWaitAndCheck(ctx)
  File "../oneflow/core/framework/tensor_rpc_util.cpp", line 174, in BusyWaitAndCheck
    TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
  File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
    SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
  File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
    Callback()
Timeout error at 300 seconds.
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 685, in forward
    x = self.forward_features(x)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 677, in forward_features
    x = layer(x)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 540, in forward
    x = blk(x)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 351, in forward
    x_windows, mask=self.attn_mask
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/module.py", line 81, in __call__
    res = self.forward(*args, **kwargs)
  File "/dataset/ldp_home/swin-transformer/swin_transformer/models/swin_transformer.py", line 185, in forward
    self.relative_position_index.view(-1)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/framework/tensor.py", line 78, in _getitem
    return flow._C.tensor_getitem(self, key)
oneflow._oneflow_internal.exception.TimeoutException: 
  File "../oneflow/core/functional/impl/array_functor.cpp", line 1770, in operator()
    ApplyAdvancedIndexing(result, tensor_indices)
  File "../oneflow/core/functional/tensor_index.cpp", line 338, in ApplyAdvancedIndexing
    ToConsistent(packed_indices, placement, std::vector<Symbol<cfg::SbpParallel>>(n, broadcast_sbp), grad_sbp_tuple)
  File "../oneflow/core/functional/impl/consistent_cast.cpp", line 354, in operator()
    MetaInfoConsistencyCheck(parallel_desc, sbp_parallels, grad_sbp_parallels)
  File "../oneflow/core/framework/consistency_check.cpp", line 245, in MetaInfoConsistencyCheck
    MetaInfoConsistencyCheck(placement, nd_sbp, grad_nd_sbp)
  File "../oneflow/core/framework/consistency_check.cpp", line 225, in MetaInfoConsistencyCheck
    MetaInfoConsistencyCheckUtil(placement, nd_sbp, grad_nd_sbp)
  File "../oneflow/core/framework/consistency_check.cpp", line 195, in MetaInfoConsistencyCheckUtil
    TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
  File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
    SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
  File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
    Callback()
Timeout error at 300 seconds.

3.

Traceback (most recent call last):
  File "main.py", line 333, in <module>
    main(config)
  File "main.py", line 113, in main
    train_one_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler)
  File "main.py", line 154, in train_one_epoch
    samples = samples.to_consistent(placement=placement, sbp=split_sbp)
  File "/home/liangdepeng/ldp/oneflow/python/oneflow/nn/modules/consistent_cast.py", line 105, in to_consistent_op
    return flow._C.to_consistent(input, placement, sbp, grad_sbp)
oneflow._oneflow_internal.exception.TimeoutException: 
  File "../oneflow/core/functional/impl/consistent_cast.cpp", line 359, in operator()
    LocalToConsistent(x, parallel_desc, sbp_parallels, local_to_consistent_op_)
  File "../oneflow/core/functional/impl/consistent_cast.cpp", line 290, in LocalToConsistent
    OpInterpUtil::Dispatch<one::Tensor>( *op, {input}, OpExprInterpContext(attrs, parallel_desc, nd_sbp))
  File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 139, in Dispatch<oneflow::one::Tensor>
    Dispatch<TensorTuple>(op_expr, inputs, ctx)
  File "../oneflow/core/framework/op_interpreter/op_interpreter_util.cpp", line 131, in Dispatch<oneflow::one::TensorTuple>
    Dispatch(op_expr, inputs, outputs.get(), ctx)
  File "../oneflow/core/framework/op_interpreter/op_interpreter.cpp", line 96, in Apply
    internal_->Apply(op_expr, inputs, outputs, ctx)
  File "../oneflow/core/framework/op_interpreter/eager_mirrored_op_interpreter.cpp", line 354, in ApplyImpl
    WithConsistencyChecked(consistent_tensor, [&]() ->  ... Data_YouAreNotAllowedToCallThisFuncOutsideThisFile()); return Maybe<void>::Ok(); })
  File "../oneflow/core/framework/tensor_rpc_util.h", line 62, in Call
    private_details::BusyWaitAndCheck(ctx)
  File "../oneflow/core/framework/tensor_rpc_util.cpp", line 174, in BusyWaitAndCheck
    TransportUtil::WaitUntilDoneOrTimeout(*ctx, TransportUtil::TimeoutSeconds())
  File "../oneflow/core/framework/transport_util.cpp", line 39, in WaitUntilDoneOrTimeout
    SpinWaitUntilTimeout([&] { return *ctx.flying_cnt() > 0; }, seconds, TryPrintStackInfo, TransportUtil::BlockingWarningIntervalSeconds())
  File "../oneflow/api/python/gil_foreign_lock_helper.cpp", line 29, in WithScopedRelease
    Callback()
Timeout error at 300 seconds.

yuanms2 commented 2 years ago

2d sbp 训练过程中发现,每训练几十个 epoch 之后会卡住然后报错

这个问题不知道后来解决了没有?