apachemycat opened this issue 1 year ago
+1
Tried training again, new error ...
╭────────────────────────────────────────────────────────────────────────────────────────────────╮
/usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1696 in train │
│ │
│ 1693 │ │ inner_training_loop = find_executable_batch_size( │
│ 1694 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1695 │ │ ) │
│ ❱ 1696 │ │ return inner_training_loop( │
│ 1697 │ │ │ args=args, │
│ 1698 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1699 │ │ │ trial=trial, │
│ │
│ /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:2040 in _inner_training_loop │
│ │
│ 2037 │ │ │ │ │ │ scale_after = self.scaler.get_scale() │
│ 2038 │ │ │ │ │ │ optimizer_was_run = scale_before <= scale_after │
│ 2039 │ │ │ │ │ else: │
│ ❱ 2040 │ │ │ │ │ │ self.optimizer.step() │
│ 2041 │ │ │ │ │ │
│ 2042 │ │ │ │ │ if optimizer_was_run and not self.deepspeed: │
│ 2043 │ │ │ │ │ │ # Delay optimizer scheduling until metrics are generated │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/optim/lr_scheduler.py:69 in wrapper │
│ │
│ 66 │ │ │ │ instance = instance_ref() │
│ 67 │ │ │ │ instance._step_count += 1 │
│ 68 │ │ │ │ wrapped = func.__get__(instance, cls) │
│ ❱ 69 │ │ │ │ return wrapped(*args, **kwargs) │
│ 70 │ │ │ │
│ 71 │ │ │ # Note that the returned function here is no longer a bound method, │
│ 72 │ │ │ # so attributes like __func__ and __self__ no longer exist. │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/optim/optimizer.py:280 in wrapper │
│ │
│ 277 │ │ │ │ │ │ │ raise RuntimeError(f"{func} must return None or a tuple of ( │
│ 278 │ │ │ │ │ │ │ │ │ │ │ f"but got {result}.") │
│ 279 │ │ │ │ │
│ ❱ 280 │ │ │ │ out = func(*args, **kwargs) │
│ 281 │ │ │ │ self._optimizer_step_code() │
│ 282 │ │ │ │ │
│ 283 │ │ │ │ # call optimizer step post hooks │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/_contextlib.py:115 in decorate_context │
│ │
│ 112 │ @functools.wraps(func) │
│ 113 │ def decorate_context(*args, **kwargs): │
│ 114 │ │ with ctx_factory(): │
│ ❱ 115 │ │ │ return func(*args, **kwargs) │
│ 116 │ │
│ 117 │ return decorate_context │
│ 118 │
│ │
│ /usr/local/lib/python3.8/dist-packages/bitsandbytes/optim/optimizer.py:270 in step │
│ │
│ 267 │ │ │ │ │
│ 268 │ │ │ │ self.prefetch_state(p) │
│ 269 │ │ │ │ self.update_step(group, p, gindex, pindex) │
│ ❱ 270 │ │ │ │ torch.cuda.synchronize() │
│ 271 │ │ if self.is_paged: │
│ 272 │ │ │ # all paged operation are asynchronous, we need │
│ 273 │ │ │ # to sync to make sure all tensors are in the right state │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:688 in synchronize │
│ │
│ 685 │ """ │
│ 686 │ _lazy_init() │
│ 687 │ with torch.cuda.device(device): │
│ ❱ 688 │ │ return torch._C._cuda_synchronize() │
│ 689 │
│ 690 │
│ 691 def ipc_collect(): │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
Latest version still errors:
/usr/local/lib/python3.8/dist-packages/bitsandbytes/optim/optimizer.py:270 in step │
│ │
│ 267 │ │ │ │ │
│ 268 │ │ │ │ self.prefetch_state(p) │
│ 269 │ │ │ │ self.update_step(group, p, gindex, pindex) │
│ ❱ 270 │ │ │ │ torch.cuda.synchronize() │
│ 271 │ │ if self.is_paged: │
│ 272 │ │ │ # all paged operation are asynchronous, we need │
│ 273 │ │ │ # to sync to make sure all tensors are in the right state │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:688 in synchronize │
│ │
│ 685 │ """ │
│ 686 │ _lazy_init() │
│ 687 │ with torch.cuda.device(device): │
│ ❱ 688 │ │ return torch._C._cuda_synchronize() │
│ 689 │
│ 690 │
│ 691 def ipc_collect(): │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
7%|███████▏
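The error text suggests rerunning with `CUDA_LAUNCH_BLOCKING=1` so the illegal access is reported at the kernel launch that caused it instead of at the later `torch.cuda.synchronize()`. A minimal sketch of one way to do that, setting the variable from Python before CUDA is initialized (exporting it in the shell before running scripts/finetune.sh works just as well):

```python
# Debugging sketch: make CUDA kernel launches synchronous so the illegal
# memory access is reported at the launch that triggered it.
# The variable must be set before CUDA is initialized, i.e. before the
# first CUDA call (setting it before `import torch` is the safe option).
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # imported after setting the env var on purpose

print("CUDA available:", torch.cuda.is_available())
print("CUDA_LAUNCH_BLOCKING =", os.environ["CUDA_LAUNCH_BLOCKING"])
```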
Can you please try training by loading the adapter weights with `resume_from_checkpoint`?
Error an illegal memory access was encountered at line 117 in file /home/tim/git/bitsandbytes/csrc/ops.cu
/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598: arrow::fs::FinalizeS3 was not called even though S3 was initialized. This could lead to a segmentation fault at exit
> Can you please try training by loading the adapter weights with `resume_from_checkpoint`?
But there is no resume parameter or resume logic in the script ...
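For reference, the suggestion above maps onto the standard `transformers.Trainer` API: `Trainer.train()` accepts a `resume_from_checkpoint` argument (a checkpoint path, or a bool to auto-detect the last checkpoint in `output_dir`). A rough sketch, assuming a hypothetical checkpoint directory written by `--save_steps`; how this would be wired into qlora.py's own argument parsing is not shown in the thread:

```python
# Sketch only: resuming with the stock transformers.Trainer API.
# `trainer` is whatever the training script builds; the checkpoint path
# below is a hypothetical example of a directory produced by --save_steps.
from transformers import Trainer


def train_with_resume(trainer: Trainer, checkpoint_dir: str = "./output/checkpoint-1000"):
    # resume_from_checkpoint may be an explicit path, or True to let the
    # Trainer pick up the last checkpoint found in args.output_dir.
    result = trainer.train(resume_from_checkpoint=checkpoint_dir)
    return result.metrics
```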
python qlora.py \
    --model_name_or_path /models/guanaco-33b-merged \
    --output_dir ./output \
    --dataset alpaca \
    --do_train True \
    --do_eval True \
    --do_mmlu_eval True \
    --source_max_len 384 \
    --target_max_len 2048 \
    --per_device_train_batch_size 4 \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --logging_steps 10 \
    --max_steps 10000 \
    --save_strategy steps \
    --data_seed 42 \
    --save_steps 1000 \
    --save_total_limit 30 \
    --evaluation_strategy steps \
    --eval_dataset_size 20 \
    --max_eval_samples 1000 \
    --eval_steps 1000 \
    --optim paged_adamw_32bit \
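The `--optim paged_adamw_32bit` flag in this command is what routes `optimizer.step()` through bitsandbytes' paged optimizer, which is the frame where `torch.cuda.synchronize()` raises in the tracebacks. As a rough, illustrative sketch of what that optimizer choice corresponds to (assuming bitsandbytes >= 0.39, where the paged optimizers were added; the constructor arguments shown are assumptions, not copied from qlora.py):

```python
# Illustrative sketch: the kind of paged 32-bit AdamW that
# `--optim paged_adamw_32bit` selects. Values are assumptions.
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(16, 16).cuda()  # stand-in for the trainable LoRA params

optimizer = bnb.optim.AdamW(
    model.parameters(),
    lr=2e-4,          # matches the learning_rate in the training log
    optim_bits=32,    # 32-bit optimizer state
    is_paged=True,    # paged state; its async ops are why step() calls
                      # torch.cuda.synchronize() in the traceback
)

loss = model(torch.randn(4, 16, device="cuda")).sum()
loss.backward()
optimizer.step()  # the call that reaches bitsandbytes/optim/optimizer.py:step
```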
root@5aa880cf02a4:/wzh/qlora# CUDA_VISIBLE_DEVICES=0 PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:24 sh scripts/finetune.sh
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /usr/local/lib/python3.8/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so...
loading base model /models/guanaco-33b-merged...
Loading checkpoint shards: 100%|████████████████████████████████████████████████████| 7/7 [01:32<00:00, 13.19s/it]
adding LoRA modules...
trainable params: 243793920.0 || all params: 16965454336 || trainable: 1.4370020110966275
loaded model
Adding special tokens.
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-2e4dc035c7efa1dc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...
Downloading data files: 100%|█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4821.04it/s]
Extracting data files: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00, 507.78it/s]
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-2e4dc035c7efa1dc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.
100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 357.02it/s]
Splitting train dataset in train and validation according to `eval_dataset_size`
Found cached dataset json (/root/.cache/huggingface/datasets/json/default-0b99aebaad2bbfcb/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
100%|███████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 7.23it/s]
torch.float32 914390528 0.053897160218144315
torch.uint8 16051077120 0.9461028397818557
{'loss': 1.3295, 'learning_rate': 0.0002, 'epoch': 2.86}
{'loss': 0.2541, 'learning_rate': 0.0002, 'epoch': 5.71}
{'loss': 0.0954, 'learning_rate': 0.0002, 'epoch': 8.57}
{'loss': 0.0608, 'learning_rate': 0.0002, 'epoch': 11.43}
{'loss': 0.0556, 'learning_rate': 0.0002, 'epoch': 14.29}
{'loss': 0.0388, 'learning_rate': 0.0002, 'epoch': 17.14}
1%|▍ | 61/10000 [47:58<121:52:32, 44.14s/it]

**Error**

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /wzh/qlora/qlora.py:789 in <module> │
│ │
│ 786 │ │ │ fout.write(json.dumps(all_metrics)) │
│ 787 │
│ 788 if __name__ == "__main__": │
│ ❱ 789 │ train() │
│ 790 │
│ │
│ /wzh/qlora/qlora.py:751 in train │
│ │
│ 748 │ │ logger.info("*** Train ***") │
│ 749 │ │ # Note: resume_from_checkpoint not supported for adapter checkpoints by HF. │
│ 750 │ │ # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler s │
│ ❱ 751 │ │ train_result = trainer.train() │
│ 752 │ │ metrics = train_result.metrics │
│ 753 │ │ trainer.log_metrics("train", metrics) │
│ 754 │ │ trainer.save_metrics("train", metrics) │
│ │
│ /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:1696 in train │
│ │
│ 1693 │ │ inner_training_loop = find_executable_batch_size( │
│ 1694 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │
│ 1695 │ │ ) │
│ ❱ 1696 │ │ return inner_training_loop( │
│ 1697 │ │ │ args=args, │
│ 1698 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1699 │ │ │ trial=trial, │
│ │
│ /usr/local/lib/python3.8/dist-packages/transformers/trainer.py:2040 in _inner_training_loop │
│ │
│ 2037 │ │ │ │ │ │ scale_after = self.scaler.get_scale() │
│ 2038 │ │ │ │ │ │ optimizer_was_run = scale_before <= scale_after │
│ 2039 │ │ │ │ │ else: │
│ ❱ 2040 │ │ │ │ │ │ self.optimizer.step() │
│ 2041 │ │ │ │ │ │
│ 2042 │ │ │ │ │ if optimizer_was_run and not self.deepspeed: │
│ 2043 │ │ │ │ │ │ # Delay optimizer scheduling until metrics are generated │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/optim/lr_scheduler.py:69 in wrapper │
│ │
│ 66 │ │ │ │ instance = instance_ref() │
│ 67 │ │ │ │ instance._step_count += 1 │
│ 68 │ │ │ │ wrapped = func.__get__(instance, cls) │
│ ❱ 69 │ │ │ │ return wrapped(*args, **kwargs) │
│ 70 │ │ │ │
│ 71 │ │ │ # Note that the returned function here is no longer a bound method, │
│ 72 │ │ │ # so attributes like __func__ and __self__ no longer exist. │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/optim/optimizer.py:280 in wrapper │
│ │
│ 277 │ │ │ │ │ │ │ raise RuntimeError(f"{func} must return None or a tuple of ( │
│ 278 │ │ │ │ │ │ │ │ │ │ │ f"but got {result}.") │
│ 279 │ │ │ │ │
│ ❱ 280 │ │ │ │ out = func(*args, **kwargs) │
│ 281 │ │ │ │ self._optimizer_step_code() │
│ 282 │ │ │ │ │
│ 283 │ │ │ │ # call optimizer step post hooks │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/_contextlib.py:115 in decorate_context │
│ │
│ 112 │ @functools.wraps(func) │
│ 113 │ def decorate_context(*args, **kwargs): │
│ 114 │ │ with ctx_factory(): │
│ ❱ 115 │ │ │ return func(*args, **kwargs) │
│ 116 │ │
│ 117 │ return decorate_context │
│ 118 │
│ │
│ /usr/local/lib/python3.8/dist-packages/bitsandbytes/optim/optimizer.py:270 in step │
│ │
│ 267 │ │ │ │ │
│ 268 │ │ │ │ self.prefetch_state(p) │
│ 269 │ │ │ │ self.update_step(group, p, gindex, pindex) │
│ ❱ 270 │ │ │ │ torch.cuda.synchronize() │
│ 271 │ │ if self.is_paged: │
│ 272 │ │ │ # all paged operation are asynchronous, we need │
│ 273 │ │ │ # to sync to make sure all tensors are in the right state │
│ │
│ /usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:688 in synchronize │
│ │
│ 685 │ """ │
│ 686 │ _lazy_init() │
│ 687 │ with torch.cuda.device(device): │
│ ❱ 688 │ │ return torch._C._cuda_synchronize() │
│ 689 │
│ 690 │
│ 691 def ipc_collect(): │
╰────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
1%|▍ | 61/10000 [48:25<131:29:38, 47.63s/it]
root@5aa880cf02a4:/wzh/qlora#