OpenMOSS / MOSS

An open-source tool-augmented conversational language model from Fudan University
https://txsun1997.github.io/blogs/moss.html
Apache License 2.0
11.9k stars 1.14k forks source link

训练时出现No such file or directory: 'which' #290

Closed Tian14267 closed 1 year ago

Tian14267 commented 1 year ago

大神们好，我在进行 finetune 训练的时候，又遇到了下面这个问题：

FileNotFoundError: [Errno 2] No such file or directory: 'which'
[09:45:37] ERROR    failed (exitcode: 1) local_rank: 0 (pid: 33471) of binary: /root/anaconda3/envs/moss/bin/python

详细情况:


╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /data/MOSS_0516/finetune_moss_2.py:310 in <module>                                               │
│                                                                                                  │
│   307 │   os.makedirs(args.output_dir, exist_ok=True)                                            │
│   308 │                                                                                          │
│   309 │   set_seed(args.seed)                                                                    │
│ ❱ 310 │   train(args)                                                                            │
│   311                                                                                            │
│                                                                                                  │
│ /data/MOSS_0516/finetune_moss_2.py:215 in train                                                  │
│                                                                                                  │
│   212 │   num_training_steps = (len(train_dataloader) * args.n_epochs) // accelerator.gradient   │
│   213 │   lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(args.   │
│   214 │                                                                                          │
│ ❱ 215 │   model, optimizer, train_dataloader, val_dataloader, lr_scheduler = accelerator.prepa   │
│   216 │                                                                                          │
│   217 │   global_step = 0                                                                        │
│   218 │   metric = SFTMetric(device=torch.cuda.current_device())                                 │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/accelerate/accelerator.py:1139 in prepare  │
│                                                                                                  │
│   1136 │   │   │   if self.device.type == "cpu" and self.state.ipex_plugin is not None:          │
│   1137 │   │   │   │   args = self._prepare_ipex(*args)                                          │
│   1138 │   │   if self.distributed_type == DistributedType.DEEPSPEED:                            │
│ ❱ 1139 │   │   │   result = self._prepare_deepspeed(*args)                                       │
│   1140 │   │   elif self.distributed_type == DistributedType.MEGATRON_LM:                        │
│   1141 │   │   │   result = self._prepare_megatron_lm(*args)                                     │
│   1142 │   │   else:                                                                             │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/accelerate/accelerator.py:1446 in          │
│ _prepare_deepspeed                                                                               │
│                                                                                                  │
│   1443 │   │   │   │   │   │   if type(scheduler).__name__ in deepspeed.runtime.lr_schedules.VA  │
│   1444 │   │   │   │   │   │   │   kwargs["lr_scheduler"] = scheduler                            │
│   1445 │   │   │                                                                                 │
│ ❱ 1446 │   │   │   engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)           │
│   1447 │   │   │   if optimizer is not None:                                                     │
│   1448 │   │   │   │   optimizer = DeepSpeedOptimizerWrapper(optimizer)                          │
│   1449 │   │   │   if scheduler is not None:                                                     │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/__init__.py:165 in initialize    │
│                                                                                                  │
│   162 │   │   │   │   │   │   │   │   │   │      config=config,                                  │
│   163 │   │   │   │   │   │   │   │   │   │      config_class=config_class)                      │
│   164 │   │   else:                                                                              │
│ ❱ 165 │   │   │   engine = DeepSpeedEngine(args=args,                                            │
│   166 │   │   │   │   │   │   │   │   │    model=model,                                          │
│   167 │   │   │   │   │   │   │   │   │    optimizer=optimizer,                                  │
│   168 │   │   │   │   │   │   │   │   │    model_parameters=model_parameters,                    │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/runtime/engine.py:308 in         │
│ __init__                                                                                         │
│                                                                                                  │
│    305 │   │   │   model_parameters = list(model_parameters)                                     │
│    306 │   │                                                                                     │
│    307 │   │   if has_optimizer:                                                                 │
│ ❱  308 │   │   │   self._configure_optimizer(optimizer, model_parameters)                        │
│    309 │   │   │   self._configure_lr_scheduler(lr_scheduler)                                    │
│    310 │   │   │   self._report_progress(0)                                                      │
│    311 │   │   elif self.zero_optimization():                                                    │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/runtime/engine.py:1173 in        │
│ _configure_optimizer                                                                             │
│                                                                                                  │
│   1170 │   │   optimizer_wrapper = self._do_optimizer_sanity_check(basic_optimizer)              │
│   1171 │   │                                                                                     │
│   1172 │   │   if optimizer_wrapper == ZERO_OPTIMIZATION:                                        │
│ ❱ 1173 │   │   │   self.optimizer = self._configure_zero_optimizer(basic_optimizer)              │
│   1174 │   │   elif optimizer_wrapper == AMP:                                                    │
│   1175 │   │   │   amp_params = self.amp_params()                                                │
│   1176 │   │   │   log_dist(f"Initializing AMP with these params: {amp_params}", ranks=[0])      │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/runtime/engine.py:1463 in        │
│ _configure_zero_optimizer                                                                        │
│                                                                                                  │
│   1460 │   │   │   │                                                                             │
│   1461 │   │   │   │   log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ra  │
│   1462 │   │   │   │   from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3   │
│ ❱ 1463 │   │   │   │   optimizer = DeepSpeedZeroOptimizer_Stage3(                                │
│   1464 │   │   │   │   │   self.module,                                                          │
│   1465 │   │   │   │   │   optimizer,                                                            │
│   1466 │   │   │   │   │   timers=timers,                                                        │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py:130 in    │
│ __init__                                                                                         │
│                                                                                                  │
│    127 │   │   self.optimizer = init_optimizer                                                   │
│    128 │   │                                                                                     │
│    129 │   │   # Load pre-built or JIT compile (un)flatten ops                                   │
│ ❱  130 │   │   util_ops = UtilsBuilder().load()                                                  │
│    131 │   │   self.flatten = util_ops.flatten                                                   │
│    132 │   │   self.unflatten = util_ops.unflatten                                               │
│    133 │   │   self.dtype = self.optimizer.param_groups[0]['params'][0].dtype                    │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py:445 in │
│ load                                                                                             │
│                                                                                                  │
│   442 │   │   │                                                                                  │
│   443 │   │   │   return importlib.import_module(self.absolute_name())                           │
│   444 │   │   else:                                                                              │
│ ❱ 445 │   │   │   return self.jit_load(verbose)                                                  │
│   446 │                                                                                          │
│   447 │   def jit_load(self, verbose=True):                                                      │
│   448 │   │   if not self.is_compatible(verbose):                                                │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py:480 in │
│ jit_load                                                                                         │
│                                                                                                  │
│   477 │   │   │   torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST")                       │
│   478 │   │   │   os.environ["TORCH_CUDA_ARCH_LIST"] = ""                                        │
│   479 │   │                                                                                      │
│ ❱ 480 │   │   op_module = load(name=self.name,                                                   │
│   481 │   │   │   │   │   │    sources=self.strip_empty_entries(sources),                        │
│   482 │   │   │   │   │   │    extra_include_paths=self.strip_empty_entries(extra_include_path   │
│   483 │   │   │   │   │   │    extra_cflags=self.strip_empty_entries(self.cxx_args()),           │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/torch/utils/cpp_extension.py:1284 in load  │
│                                                                                                  │
│   1281 │   │   ...     extra_cflags=['-O2'],                                                     │
│   1282 │   │   ...     verbose=True)                                                             │
│   1283 │   '''                                                                                   │
│ ❱ 1284 │   return _jit_compile(                                                                  │
│   1285 │   │   name,                                                                             │
│   1286 │   │   [sources] if isinstance(sources, str) else sources,                               │
│   1287 │   │   extra_cflags,                                                                     │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/torch/utils/cpp_extension.py:1508 in       │
│ _jit_compile                                                                                     │
│                                                                                                  │
│   1505 │   │   │   │   │   │                                                                     │
│   1506 │   │   │   │   │   │   sources = list(hipified_sources)                                  │
│   1507 │   │   │   │   │                                                                         │
│ ❱ 1508 │   │   │   │   │   _write_ninja_file_and_build_library(                                  │
│   1509 │   │   │   │   │   │   name=name,                                                        │
│   1510 │   │   │   │   │   │   sources=sources,                                                  │
│   1511 │   │   │   │   │   │   extra_cflags=extra_cflags or [],                                  │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/torch/utils/cpp_extension.py:1597 in       │
│ _write_ninja_file_and_build_library                                                              │
│                                                                                                  │
│   1594 │   │   compiler = os.environ.get('CXX', 'cl')                                            │
│   1595 │   else:                                                                                 │
│   1596 │   │   compiler = os.environ.get('CXX', 'c++')                                           │
│ ❱ 1597 │   get_compiler_abi_compatibility_and_version(compiler)                                  │
│   1598 │   if with_cuda is None:                                                                 │
│   1599 │   │   with_cuda = any(map(_is_cuda_file, sources))                                      │
│   1600 │   extra_ldflags = _prepare_ldflags(                                                     │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/torch/utils/cpp_extension.py:336 in        │
│ get_compiler_abi_compatibility_and_version                                                       │
│                                                                                                  │
│    333 │   │   return (True, TorchVersion('0.0.0'))                                              │
│    334 │                                                                                         │
│    335 │   # First check if the compiler is one of the expected ones for the particular platfor  │
│ ❱  336 │   if not check_compiler_ok_for_platform(compiler):                                      │
│    337 │   │   warnings.warn(WRONG_COMPILER_WARNING.format(                                      │
│    338 │   │   │   user_compiler=compiler,                                                       │
│    339 │   │   │   pytorch_compiler=_accepted_compilers_for_platform()[0],                       │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/site-packages/torch/utils/cpp_extension.py:290 in        │
│ check_compiler_ok_for_platform                                                                   │
│                                                                                                  │
│    287 │   '''                                                                                   │
│    288 │   if IS_WINDOWS:                                                                        │
│    289 │   │   return True                                                                       │
│ ❱  290 │   which = subprocess.check_output(['which', compiler], stderr=subprocess.STDOUT)        │
│    291 │   # Use os.path.realpath to resolve any symlinks, in particular from 'c++' to e.g. 'g+  │
│    292 │   compiler_path = os.path.realpath(which.decode(*SUBPROCESS_DECODE_ARGS).strip())       │
│    293 │   # Check the compiler name                                                             │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/subprocess.py:415 in check_output                        │
│                                                                                                  │
│    412 │   │   │   empty = b''                                                                   │
│    413 │   │   kwargs['input'] = empty                                                           │
│    414 │                                                                                         │
│ ❱  415 │   return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,                      │
│    416 │   │   │      **kwargs).stdout                                                           │
│    417                                                                                           │
│    418                                                                                           │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/subprocess.py:493 in run                                 │
│                                                                                                  │
│    490 │   │   kwargs['stdout'] = PIPE                                                           │
│    491 │   │   kwargs['stderr'] = PIPE                                                           │
│    492 │                                                                                         │
│ ❱  493 │   with Popen(*popenargs, **kwargs) as process:                                          │
│    494 │   │   try:                                                                              │
│    495 │   │   │   stdout, stderr = process.communicate(input, timeout=timeout)                  │
│    496 │   │   except TimeoutExpired as exc:                                                     │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/subprocess.py:858 in __init__                            │
│                                                                                                  │
│    855 │   │   │   │   │   self.stderr = io.TextIOWrapper(self.stderr,                           │
│    856 │   │   │   │   │   │   │   encoding=encoding, errors=errors)                             │
│    857 │   │   │                                                                                 │
│ ❱  858 │   │   │   self._execute_child(args, executable, preexec_fn, close_fds,                  │
│    859 │   │   │   │   │   │   │   │   pass_fds, cwd, env,                                       │
│    860 │   │   │   │   │   │   │   │   startupinfo, creationflags, shell,                        │
│    861 │   │   │   │   │   │   │   │   p2cread, p2cwrite,                                        │
│                                                                                                  │
│ /root/anaconda3/envs/moss/lib/python3.8/subprocess.py:1704 in _execute_child                     │
│                                                                                                  │
│   1701 │   │   │   │   │   │   err_filename = orig_executable                                    │
│   1702 │   │   │   │   │   if errno_num != 0:                                                    │
│   1703 │   │   │   │   │   │   err_msg = os.strerror(errno_num)                                  │
│ ❱ 1704 │   │   │   │   │   raise child_exception_type(errno_num, err_msg, err_filename)          │
│   1705 │   │   │   │   raise child_exception_type(err_msg)                                       │
│   1706                                                                                           │
│   1707                                                                                           │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
FileNotFoundError: [Errno 2] No such file or directory: 'which'
[09:45:37] ERROR    failed (exitcode: 1) local_rank: 0 (pid: 33471) of binary: /root/anaconda3/envs/moss/bin/python      

请问这是什么问题？

Tian14267 commented 1 year ago
Traceback (most recent call last):
  File "finetune_moss_2.py", line 310, in <module>
    train(args)           
  File "finetune_moss_2.py", line 215, in train
    model, optimizer, train_dataloader, val_dataloader, lr_scheduler = accelerator.prepare(model, optimizer, train_dataloader, val_dataloader, lr_scheduler)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/accelerate/accelerator.py", line 1139, in prepare
    result = self._prepare_deepspeed(*args)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/accelerate/accelerator.py", line 1446, in _prepare_deepspeed
    engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/__init__.py", line 165, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1173, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1463, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer_Stage3(
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 130, in __init__
    util_ops = UtilsBuilder().load()
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 445, in load
    return self.jit_load(verbose)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/deepspeed/ops/op_builder/builder.py", line 480, in jit_load
    op_module = load(name=self.name,
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1284, in load
    return _jit_compile(
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1508, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1597, in _write_ninja_file_and_build_library
    get_compiler_abi_compatibility_and_version(compiler)
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 336, in get_compiler_abi_compatibility_and_version
    if not check_compiler_ok_for_platform(compiler):
  File "/root/anaconda3/envs/mt5/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 290, in check_compiler_ok_for_platform
    which = subprocess.check_output(['which', compiler], stderr=subprocess.STDOUT)
  File "/root/anaconda3/envs/mt5/lib/python3.8/subprocess.py", line 415, in check_output
    return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
  File "/root/anaconda3/envs/mt5/lib/python3.8/subprocess.py", line 493, in run
    with Popen(*popenargs, **kwargs) as process:
  File "/root/anaconda3/envs/mt5/lib/python3.8/subprocess.py", line 858, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/root/anaconda3/envs/mt5/lib/python3.8/subprocess.py", line 1704, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'which'

这个（mt5 环境下的运行）也是同样的报错。