radna0 opened this issue 2 months ago
I'm trying to run the InternVL2-40B model (the AWQ checkpoint, on TPU via torch_xla):
import os
import torch
from transformers import AutoModel, AutoTokenizer
from modelscope import (
snapshot_download,
)
from swift.llm import (
get_model_tokenizer,
get_template,
inference,
get_default_template_type,
inference_stream,
)
from swift.utils import seed_everything
import numpy as np
import torch_xla.distributed.spmd as xs
import torch_xla.core.xla_model as xm
from torch_xla import runtime as xr
from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
SpmdFullyShardedDataParallel as FSDPv2,
)
import time
# set environment variable: `MAX_NUM`.
os.environ["MAX_NUM"] = "1"
# xr.use_spmd()
model_type = "internvl2-40b"
template_type = get_default_template_type(model_type)
print(f"template_type: {template_type}")
model, tokenizer = get_model_tokenizer(
model_type,
torch.bfloat16,
model_id_or_path="/dev/shm/huggingface/InternVL2-40B-AWQ/",
)
# for GPUs that do not support flash attention
# model, tokenizer = get_model_tokenizer(model_type, torch.float16,
# model_kwargs={'device_map': 'auto'},
# use_flash_attn = False)
# Define the mesh and partition_spec
num_devices = xr.global_runtime_device_count()
mesh_shape = (num_devices, 1)
device_ids = np.array(range(num_devices))
# Note: the mesh must have an axis named 'fsdp', along which the weights and activations will be sharded.
mesh = xs.Mesh(device_ids, mesh_shape, ("fsdp", "model"))
xs.set_global_mesh(mesh)
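# (Hedged aside, not part of the original script: if SPMD is actually enabled
# via xr.use_spmd() and the FSDPv2 wrapper below (both commented out here),
# the input tensors usually need an explicit sharding annotation along the
# 'fsdp' axis as well, e.g. something like
#     xs.mark_sharding(pixel_values, mesh, ("fsdp", None, None, None))
# The exact partition spec for InternVL2's pixel_values is an assumption.)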
# USE XLA FOR MODEL
# model = FSDPv2(model)
model.generation_config.max_new_tokens = 1024
template = get_template(template_type, tokenizer)
seed_everything(42)
start = time.time()
images = ["http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/road.png"]
query = "How far is it from each city?"
response, history = inference(model, template, query, images=images) # chat with image
print(f"query: {query}")
print(f"response: {response}")
print(f"____________Time elapsed: {time.time() - start:.2f}s")
"""
query: How far is it from each city?
response: The distances from the location of the sign to each city are as follows:
- Mata: 14 kilometers
- Yangjiang: 62 kilometers
- Guangzhou: 293 kilometers
These distances are indicated on the road sign in the image.
"""
Traceback (most recent call last):
File "/home/kojoe/EasyAnimate/easyanimate/image_caption/caption.py", line 77, in <module>
response, history = inference(model, template, query, images=images) # chat with image
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/home/kojoe/swift/swift/llm/utils/utils.py", line 761, in inference
inputs, tokenizer_kwargs, token_len, example = _prepare_inputs(
File "/home/kojoe/swift/swift/llm/utils/utils.py", line 578, in _prepare_inputs
inputs, tokenizer_kwargs = template.encode(example)
File "/home/kojoe/swift/swift/llm/utils/template.py", line 533, in encode
inputs.update(self._post_encode(data))
File "/home/kojoe/swift/swift/llm/utils/template.py", line 1781, in _post_encode
vit_embeds = self.model.extract_feature(pixel_values).to(device=device)
File "/home/kojoe/.cache/huggingface/modules/transformers_modules/modeling_internvl_chat.py", line 180, in extract_feature
vit_embeds = self.vision_model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/kojoe/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 433, in forward
hidden_states = self.embeddings(pixel_values)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/home/kojoe/.cache/huggingface/modules/transformers_modules/modeling_intern_vit.py", line 186, in forward
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
RuntimeError: Cannot set version_counter for inference tensor
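For what it's worth, the "Cannot set version_counter for inference tensor" error looks like torch.inference_mode interacting badly with torch_xla's functionalization rather than a problem in InternVL2 itself. A minimal sketch of two things that might be worth trying, assuming that diagnosis is right (neither is a confirmed fix for this model):

# Option 1: disable torch_xla functionalization, which is the layer that tries
# to set version counters on tensors; set this before importing torch_xla.
import os
os.environ["XLA_DISABLE_FUNCTIONALIZATION"] = "1"

# Option 2 (assumption): avoid torch.inference_mode entirely and run the
# vision encoder / generation under plain torch.no_grad(), since tensors
# created inside inference mode cannot be mutated afterwards; swift's
# inference() may need patching for this.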
We don't have TPU devices... so this is not supported yet. Glad if you can give us some support 😁
Happy to help with it. How can I provide support? @tastelikefeet
@tastelikefeet Do you have examples of running inference on AWQ models with swift? I'm trying to run the AWQ version of the InternVL2 models.
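For reference, what I've tried so far is keeping the base model_type and pointing model_id_or_path at the local AWQ checkpoint, roughly like the sketch below (on GPU with device_map='auto' rather than TPU); whether this is the intended way to load AWQ weights through swift is exactly what I'm unsure about:

import torch
from swift.llm import (
    get_model_tokenizer,
    get_template,
    inference,
    get_default_template_type,
)

# Sketch only: load the local AWQ checkpoint through the base model_type.
# Whether get_model_tokenizer fully supports AWQ-quantized InternVL2 weights
# this way is an assumption on my part.
model_type = "internvl2-40b"
model, tokenizer = get_model_tokenizer(
    model_type,
    torch.float16,
    model_kwargs={"device_map": "auto"},
    model_id_or_path="/dev/shm/huggingface/InternVL2-40B-AWQ/",
)
template = get_template(get_default_template_type(model_type), tokenizer)
query = "How far is it from each city?"
images = ["http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/road.png"]
response, _ = inference(model, template, query, images=images)
print(response)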
If you have TPU devices, you can try to run swift training and inference on them and send swift a PR :) Of course it's hard work, haha.