#######################################################################
# PART 1 Settings #
#######################################################################
# Model
# Base LLM checkpoint (local path to Meta-Llama-3-8B-Instruct weights).
llm_name_or_path = '/app/InternLM_cqy/a_weights/LLM-Research/Meta-Llama-3-8B-Instruct'
# Vision tower: CLIP ViT-L/14 with 336px input resolution.
visual_encoder_name_or_path = '/app/InternLM_cqy/a_weights/clip-vit-large-patch14-336'
# Specify the pretrained pth
# (presumably the pretrain-stage projector checkpoint to fine-tune from
#  — TODO confirm this iter_2181.pth matches the model pair above).
pretrained_pth = '/app/InternLM_cqy/a_weights/iter_2181.pth' # noqa: E501
# Data
data_root = '/app/InternLM_cqy/Tutorial-camp2/xtuner/llava/llava_data/'
data_path = data_root + 'repeated_data.json'
image_folder = data_root
# Chat template must match the base LLM (Llama-3 instruct format here).
prompt_template = PROMPT_TEMPLATE.llama3_chat
# Reserve room for visual tokens: a 336px image with 14px patches yields
# (336/14)**2 = 576 patch tokens, so the text budget is 2048 - 576.
max_length = int(2048 - (336 / 14)**2)
# Scheduler & Optimizer
batch_size = 1 # per_device
accumulative_counts = 1  # gradient-accumulation steps
dataloader_num_workers = 0
max_epochs = 1
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03  # fraction of total steps used for LR warmup
# Save
save_steps = 500
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training
# NOTE(review): the pasted traceback shows EvaluateChatHook failing under
# ZeRO-3 ("All input tensors need to be on the same GPU" — one tensor on
# CPU), which looks like sharded/offloaded parameters at generation time;
# verify this hook is ZeRO-3-compatible or disable it to unblock training.
evaluation_freq = 500
SYSTEM = ''  # system prompt used for the evaluation chats (empty = none)
evaluation_images = 'https://llava-vl.github.io/static/images/view.jpg'
evaluation_inputs = ['Please describe this picture','What is the equipment in the image?']
然后我使用 zero2 出现 OOM，改用 zero3 后出现如下报错：
Traceback (most recent call last):
File "/app/InternLM_cqy/XTuner/xtuner/tools/train.py", line 342, in <module>
main()
File "/app/InternLM_cqy/XTuner/xtuner/tools/train.py", line 338, in main
runner.train()
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/_flexible_runner.py", line 1200, in train
model = self.train_loop.run() # type: ignore
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/loops.py", line 271, in run
self.runner.call_hook('before_train')
File "/usr/local/lib/python3.10/dist-packages/mmengine/runner/_flexible_runner.py", line 1273, in call_hook
raise TypeError(f'{e} in {hook}') from e
TypeError: All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:
[(torch.Size([1, 8388608]), device(type='cuda', index=0)), (torch.Size([262144]), device(type='cpu')), (torch.Size([4096, 4096]), device(type='cuda', index=0))] in <xtuner.engine.hooks.evaluate_chat_hook.EvaluateChatHook object at 0x7fb7cbb85390>
非 InternStudio 环境，使用 3090 显卡，通过脚本启动训练，上面是我的 Part 1 配置。
使用 zero2 出现 OOM，改用 zero3 后出现上述报错。
可以帮忙看一下吗？