Hello, I am also running into this problem: "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!". Could you tell me how you dealt with it?
I am getting the same error when using DataParallel with the model. Below is the script I am using:
```python
import os

GPU_NUMBER = [1, 2, 6, 8, 9]
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])

import torch
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModelForCausalLM, TrainingArguments, Trainer
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset, load_dataset, DatasetDict
from Bio import SeqIO
from accelerate import Accelerator

accelerator = Accelerator()

model_name = 'togethercomputer/evo-1-8k-base'
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=model_config,
    trust_remote_code=True,
)
model = accelerator.prepare(model)

tokenizer = AutoTokenizer.from_pretrained("togethercomputer/evo-1-8k-base", trust_remote_code=True)
tokenizer.pad_token = "N"

train_ds = load_dataset('csv', data_files='all_fastas.csv')

def preprocess_function(sample):
    return tokenizer(sample['Seq'], padding="max_length", truncation=True, max_length=500)

tokenized_ds = train_ds.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)
train_testvalid = tokenized_ds['train'].train_test_split(test_size=0.2)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=1,
    warmup_steps=10,
    max_steps=10000,  # example value only
    logging_steps=10,
    eval_steps=100,
    bf16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_testvalid["train"],
    eval_dataset=train_testvalid["test"],
    data_collator=data_collator,
)

trainer.train()
print(trainer.evaluate())
trainer.save_model('./finetuned.model_all_par')
```
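Since the failure shown below happens inside DataParallel replicas, here is a small diagnostic I added on my side (not part of the original script; `training_args` refers to the object defined above) to confirm how many devices the Trainer will spread the model over:

```python
import torch

# With GPU_NUMBER = [1, 2, 6, 8, 9], five GPUs stay visible to this process, so
# the Trainer detects more than one device and wraps the model in
# torch.nn.DataParallel during training.
print("visible GPUs :", torch.cuda.device_count())    # 5 in this setup
print("Trainer n_gpu:", training_args.n_gpu)          # > 1 triggers DataParallel wrapping
print("parallel mode:", training_args.parallel_mode)  # ParallelMode.NOT_DISTRIBUTED for a plain `python` run
```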
Below is the error message:
```
Traceback (most recent call last):
  File "/home/qj/python-project/evo/weitiao.py", line 76, in <module>
    trainer.train()
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/transformers/trainer.py", line 3238, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/transformers/trainer.py", line 3264, in compute_loss
    outputs = model(**inputs)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/_utils.py", line 644, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/qj/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/modeling_hyena.py", line 109, in forward
    logits, past_key_values = self.backbone(
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/qj/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 363, in forward
    x, inference_params_dict_out = self.stateless_forward(x, padding_mask=padding_mask)
  File "/home/qj/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 382, in stateless_forward
    x, _ = block(x, inference_params=None, padding_mask=padding_mask)
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/qj/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 304, in forward
    z = self.proj_norm_fn(u)
  File "/home/qj/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/model.py", line 298, in proj_norm
    return self.projections(self.pre_norm(x))
  File "/home/qj/anaconda3/envs/new_evo/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/qj/.cache/huggingface/modules/transformers_modules/togethercomputer/evo-1-131k-base/567369e9825aa08b3de4b122fca34fac6a890602/layers.py", line 40, in forward
    return self.scale * y
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!
```
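In case it helps others hitting this, the workaround I am experimenting with is only a sketch under my own assumptions (that one card has enough memory for the model, and that the `torch.nn.DataParallel` replication is what ends up mixing cuda:0 and cuda:1), not a confirmed fix: expose a single GPU and drop the manual `accelerator.prepare(model)` call so the Trainer handles device placement on its own.

```python
import os

# Expose only one physical GPU (index "1" is just an example); with a single
# visible device the Trainer sees n_gpu == 1 and never builds the
# torch.nn.DataParallel replicas that fail in the traceback above.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
from transformers import AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments

model_name = "togethercomputer/evo-1-8k-base"
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

# No accelerator.prepare(model) here: the Trainer moves the model to the single
# visible device itself.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=model_config,
    trust_remote_code=True,
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    bf16=True,
)
# trainer = Trainer(model=model, args=training_args, ...)  # rest unchanged from the script above
```

For genuine multi-GPU fine-tuning, launching the same script with `torchrun` or `accelerate launch` should run one process per GPU and use DistributedDataParallel instead of the single-process DataParallel path shown in the traceback.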