Information
The problem arises in chapter:
Describe the bug
This error appears when trying to initialize training_args:
AttributeError                            Traceback (most recent call last)
Input In [69], in <cell line: 6>()
      4 logging_steps = len(emotions_encoded["train"]) // batch_size
      5 model_name = f"{model_ckpt}-finetuned-emotion"
----> 6 training_args = TrainingArguments(output_dir=model_name,
      7     num_train_epochs=2,
      8     learning_rate=2e-5,
      9     per_device_train_batch_size=batch_size,
     10     per_device_eval_batch_size=batch_size,
     11     weight_decay=0.01,
     12     evaluation_strategy="epoch",
     13     disable_tqdm=False,
     14     logging_steps=logging_steps,
     15     push_to_hub=False,
     16     log_level="error")

File <string>:91, in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_on_each_node, no_cuda, seed, data_seed, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, xpu_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, sharded_ddp, deepspeed, label_smoothing_factor, optim, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, dataloader_pin_memory, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, gradient_checkpointing, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters)

File ~/miniforge3/envs/TFM006/lib/python3.10/site-packages/transformers/training_args.py:865, in TrainingArguments.__post_init__(self)
    857 warnings.warn(
    858     "--adafactor is deprecated and will be removed in version 5 of 🤗 Transformers. Use --optim adafactor instead",
    859     FutureWarning,
    860 )
    861 self.optim = OptimizerNames.ADAFACTOR
    863 if (
    864     is_torch_available()
--> 865     and (self.device.type != "cuda")
    866     and not (self.device.type == "xla" and "GPU_NUM_DEVICES" in os.environ)
    867     and (self.fp16 or self.fp16_full_eval or self.bf16 or self.bf16_full_eval)
    868 ):
    869     raise ValueError(
    870         "Mixed precision training with AMP or APEX (--fp16 or --bf16) and half precision evaluation (--fp16_full_eval or --bf16_full_eval) can only be used on CUDA devices."
    871     )
    873 if is_torch_available() and self.tf32 is not None:

File ~/miniforge3/envs/TFM006/lib/python3.10/site-packages/transformers/utils/import_utils.py:781, in torch_required.<locals>.wrapper(*args, **kwargs)
    778 @wraps(func)
    779 def wrapper(*args, **kwargs):
    780     if is_torch_available():
--> 781         return func(*args, **kwargs)
    782     else:
    783         raise ImportError(f"Method `{func.__name__}` requires PyTorch.")

File ~/miniforge3/envs/TFM006/lib/python3.10/site-packages/transformers/training_args.py:1099, in TrainingArguments.device(self)
   1093 @property
   1094 @torch_required
   1095 def device(self) -> "torch.device":
   1096     """
   1097     The device used by this process.
   1098     """
-> 1099     return self._setup_devices

File ~/miniforge3/envs/TFM006/lib/python3.10/site-packages/transformers/utils/generic.py:48, in cached_property.__get__(self, obj, objtype)
     46 cached = getattr(obj, attr, None)
     47 if cached is None:
---> 48     cached = self.fget(obj)
     49     setattr(obj, attr, cached)
     50 return cached

File ~/miniforge3/envs/TFM006/lib/python3.10/site-packages/transformers/utils/import_utils.py:781, in torch_required.<locals>.wrapper(*args, **kwargs)
    778 @wraps(func)
    779 def wrapper(*args, **kwargs):
    780     if is_torch_available():
--> 781         return func(*args, **kwargs)
    782     else:
    783         raise ImportError(f"Method `{func.__name__}` requires PyTorch.")

File ~/miniforge3/envs/TFM006/lib/python3.10/site-packages/transformers/training_args.py:1024, in TrainingArguments._setup_devices(self)
   1020 @cached_property
   1021 @torch_required
   1022 def _setup_devices(self) -> "torch.device":
   1023     logger.info("PyTorch: setting up devices")
-> 1024     if torch.distributed.is_initialized() and self.local_rank == -1:
   1025         logger.warning(
   1026             "torch.distributed process group is initialized, but local_rank == -1. "
   1027             "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
   1028         )
   1029     if self.no_cuda:
AttributeError: module 'torch.distributed' has no attribute 'is_initialized'
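
A minimal sketch for checking whether the installed PyTorch build exposes the distributed API this code path expects (added here for illustration, not from the notebook; the assumption is that the CPU-only macOS build lacks compiled-in distributed support):

import torch

# is_available() exists on every PyTorch build; the rest of torch.distributed
# (including is_initialized) is only populated when distributed support was
# compiled in, which is what trips the traceback above.
print(torch.__version__)
print(torch.distributed.is_available())
print(hasattr(torch.distributed, "is_initialized"))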
To Reproduce
Steps to reproduce the behavior:
Note: the notebook is running on a Mac M1 on CPU.
from transformers import Trainer, TrainingArguments
batch_size = 64
logging_steps = len(emotions_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")
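
The exact package versions are not recorded above; a minimal sketch for collecting them when reproducing (no specific versions are assumed):

import platform

import torch
import transformers

# Report the environment alongside the repro: OS/architecture plus the
# transformers and torch versions installed in the active environment.
print(platform.platform())
print(transformers.__version__)
print(torch.__version__)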