microsoft / CodeT

MIT License
605 stars 76 forks source link

The version of deepspeed. #11

Closed Duducoco closed 1 year ago

Duducoco commented 1 year ago

Which version of deepspeed are you using? I am using version 0.7.7, but it reports an error in the Trainer.

Traceback (most recent call last):
  File "run_ner.py", line 411, in <module>
    main()
  File "run_ner.py", line 346, in main
    trainer.train(Traceback (most recent call last):

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/trainer.py", line 1113, in train
  File "run_ner.py", line 411, in <module>
Traceback (most recent call last):
  File "run_ner.py", line 411, in <module>
Traceback (most recent call last):
  File "run_ner.py", line 411, in <module>
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(    
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/integrations.py", line 517, in deepspeed_init
main()
  File "run_ner.py", line 346, in main
    main()
  File "run_ner.py", line 346, in main
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/__init__.py", line 125, in initialize
        trainer.train(
trainer.train(
    engine = DeepSpeedEngine(args=args,
      File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 290, in __init__
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/trainer.py", line 1113, in train
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/trainer.py", line 1113, in train
main()
  File "run_ner.py", line 346, in main
    self._configure_distributed_model(model)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1106, in _configure_distributed_model
    trainer.train(
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/trainer.py", line 1113, in train

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/integrations.py", line 517, in deepspeed_init
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/integrations.py", line 517, in deepspeed_init
    self._broadcast_model()    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1022, in _broadcast_model

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/__init__.py", line 125, in initialize
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/__init__.py", line 125, in initialize
        engine = DeepSpeedEngine(args=args,    
engine = DeepSpeedEngine(args=args,deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 290, in __init__

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 290, in __init__
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/integrations.py", line 517, in deepspeed_init
    groups._get_broadcast_src_rank(),
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/utils/groups.py", line 338, in _get_broadcast_src_rank
    self._configure_distributed_model(model)
      File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1106, in _configure_distributed_model
self._configure_distributed_model(model)
          File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1106, in _configure_distributed_model
model, optimizer, _, lr_scheduler = deepspeed.initialize(
return dist.get_global_rank(_get_data_parallel_group(), 0)  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/__init__.py", line 125, in initialize

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/comm/__init__.py", line 22, in get_global_rank
        engine = DeepSpeedEngine(args=args,if hasattr(torch.distributed.distributed_c10d, "get_global_rank"):    self._broadcast_model()

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1022, in _broadcast_model
AttributeError  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 290, in __init__
: module 'deepspeed.comm.torch' has no attribute 'distributed'
    self._broadcast_model()
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1022, in _broadcast_model
    self._configure_distributed_model(model)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1106, in _configure_distributed_model
    groups._get_broadcast_src_rank(),
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/utils/groups.py", line 338, in _get_broadcast_src_rank
    groups._get_broadcast_src_rank(),
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/utils/groups.py", line 338, in _get_broadcast_src_rank
        return dist.get_global_rank(_get_data_parallel_group(), 0)self._broadcast_model()
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/comm/__init__.py", line 22, in get_global_rank

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1022, in _broadcast_model
        return dist.get_global_rank(_get_data_parallel_group(), 0)if hasattr(torch.distributed.distributed_c10d, "get_global_rank"):

  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/comm/__init__.py", line 22, in get_global_rank
AttributeError: module 'deepspeed.comm.torch' has no attribute 'distributed'
        if hasattr(torch.distributed.distributed_c10d, "get_global_rank"):
groups._get_broadcast_src_rank(),
AttributeError  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/utils/groups.py", line 338, in _get_broadcast_src_rank
: module 'deepspeed.comm.torch' has no attribute 'distributed'
    return dist.get_global_rank(_get_data_parallel_group(), 0)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/comm/__init__.py", line 22, in get_global_rank
    if hasattr(torch.distributed.distributed_c10d, "get_global_rank"):
AttributeError: module 'deepspeed.comm.torch' has no attribute 'distributed'

I have not made any changes to the code, so I think there may be a problem with one of the library versions. I hope you can help me.

linzeqipku commented 1 year ago

0.6.1.

This is the image build log (filtered with grep for "Successfully installed"):

Successfully installed Click-8.1.2 GitPython-3.1.27 configparser-5.2.0 docker-pycreds-0.4.0 gitdb-4.0.9 importlib-metadata-4.11.3 pathtools-0.1.2 promise-2.3 protobuf-3.20.0 python-dateutil-2.8.2 sentry-sdk-1.5.10 shortuuid-1.0.8 smmap-5.0.0 subprocess32-3.5.4 termcolor-1.1.0 typing-extensions-4.1.1 wandb-0.12.7 yaspin-2.1.0 zipp-3.8.0
Successfully installed future-0.18.2
Successfully installed numpy-1.20.3
Successfully installed huggingface-hub-0.0.8 joblib-1.1.0 packaging-21.3 pyparsing-3.0.8 regex-2022.3.15 sacremoses-0.0.49 tokenizers-0.10.3 transformers-4.6.0
Successfully installed datasets-1.11.0 dill-0.3.4 fsspec-2022.3.0 multiprocess-0.70.12.2 pandas-1.3.5 pyarrow-7.0.0 xxhash-3.0.0
Successfully installed nltk-3.7
Successfully installed absl-py-1.0.0 rouge-score-0.0.4
Successfully installed portalocker-2.0.0 sacrebleu-1.5.1
Successfully installed sentencepiece-0.1.96
Successfully installed scikit-learn-1.0.2 scipy-1.7.3 sklearn-0.0 threadpoolctl-3.1.0
Successfully installed deepspeed-0.6.1 hjson-3.0.2 ninja-1.10.2.3 py-cpuinfo-8.0.0
Successfully installed seqeval-1.2.2
Successfully installed conllu-4.4.1
Successfully installed multiset-3.0.1
Successfully installed PyJWT-2.3.0 SecretStorage-3.3.1 adal-1.2.7 applicationinsights-0.11.10 argcomplete-1.12.3 azure-common-1.1.28 azure-core-1.21.1 azure-graphrbac-0.61.1 azure-identity-1.7.0 azure-mgmt-authorization-0.61.0 azure-mgmt-containerregistry-8.2.0 azure-mgmt-core-1.3.0 azure-mgmt-keyvault-9.3.0 azure-mgmt-resource-20.1.0 azure-mgmt-storage-19.1.0 azureml-automl-core-1.40.0 azureml-core-1.40.0.post2 azureml-dataprep-3.0.2 azureml-dataprep-native-38.0.0 azureml-dataprep-rslex-2.4.2 azureml-dataset-runtime-1.40.0 azureml-pipeline-1.40.0 azureml-pipeline-core-1.40.0 azureml-pipeline-steps-1.40.0 azureml-sdk-1.40.0 azureml-telemetry-1.40.0 azureml-train-automl-client-1.40.0 azureml-train-core-1.40.0 azureml-train-restclients-hyperdrive-1.40.0 backports.tempfile-1.0 backports.weakref-1.0.post1 bcrypt-3.2.0 cloudpickle-2.0.0 contextlib2-21.6.0 distro-1.7.0 docker-5.0.3 dotnetcore2-2.1.23 fusepy-3.0.1 humanfriendly-10.0 isodate-0.6.1 jeepney-0.8.0 jmespath-0.10.0 jsonpickle-2.1.0 knack-0.9.0 msal-1.17.0 msal-extensions-0.3.1 msrest-0.6.21 msrestazure-0.6.4 ndg-httpsclient-0.5.1 oauthlib-3.2.0 paramiko-2.10.3 pathspec-0.9.0 pyarrow-3.0.0 pyasn1-0.4.8 pynacl-1.5.0 requests-oauthlib-1.3.1 tabulate-0.8.9 websocket-client-1.3.2
Duducoco commented 1 year ago

I installed all the packages you listed above. Now the error message is:

Traceback (most recent call last):
  File "run_ner.py", line 411, in <module>
    main()
  File "run_ner.py", line 346, in main
    trainer.train(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/trainer.py", line 1113, in train
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/integrations.py", line 517, in deepspeed_init
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/__init__.py", line 119, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 293, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1088, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1317, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 145, in __init__
    util_ops = UtilsBuilder().load()
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 458, in load
    return self.jit_load(verbose)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 500, in jit_load
    op_module = load(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 969, in load
    return _jit_compile(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1176, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1256, in _write_ninja_file_and_build_library
    check_compiler_abi_compatibility(compiler)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 252, in check_compiler_abi_compatibility
    if not check_compiler_ok_for_platform(compiler):
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 212, in check_compiler_ok_for_platform
    which = subprocess.check_output(['which', compiler], stderr=subprocess.STDOUT)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/subprocess.py", line 415, in check_output
    return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/subprocess.py", line 516, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['which', 'c++']' returned non-zero exit status 1.
Loading extension module utils...

The Python version is 3.8 and the torch version is 1.7.0+cu110. How can I fix this?

Duducoco commented 1 year ago

I installed all the packages you listed above. Now the error message is:

Traceback (most recent call last):
  File "run_ner.py", line 411, in <module>
    main()
  File "run_ner.py", line 346, in main
    trainer.train(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/trainer.py", line 1113, in train
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/transformers/integrations.py", line 517, in deepspeed_init
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/__init__.py", line 119, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 293, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1088, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1317, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 145, in __init__
    util_ops = UtilsBuilder().load()
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 458, in load
    return self.jit_load(verbose)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 500, in jit_load
    op_module = load(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 969, in load
    return _jit_compile(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1176, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1256, in _write_ninja_file_and_build_library
    check_compiler_abi_compatibility(compiler)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 252, in check_compiler_abi_compatibility
    if not check_compiler_ok_for_platform(compiler):
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 212, in check_compiler_ok_for_platform
    which = subprocess.check_output(['which', compiler], stderr=subprocess.STDOUT)
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/subprocess.py", line 415, in check_output
    return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
  File "/home/songyiping/anaconda3/envs/diverse/lib/python3.8/subprocess.py", line 516, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['which', 'c++']' returned non-zero exit status 1.
Loading extension module utils...

The Python version is 3.8 and the torch version is 1.7.0+cu110. How can I fix this?

I have solved this problem. The cause of the error above was that my server did not have gcc installed; after installing gcc, it works.