Open paulgavrikov opened 9 months ago
Just for completeness: adding device_map='cuda' to the load_pretrained_model call fixes this.
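A minimal sketch of what that looks like (the checkpoint name is just an example; the call mirrors the recipe posted further down in this thread):

```python
from moellava.model.builder import load_pretrained_model
from moellava.mm_utils import get_model_name_from_path

model_path = 'LanguageBind/MoE-LLaVA-Phi2-2.7B-4e'  # example checkpoint
tokenizer, model, processor, _ = load_pretrained_model(
    model_path,
    None,  # no separate model_base
    get_model_name_from_path(model_path),
    device_map='cuda')  # <- keeps the whole model on one GPU instead of spreading it across all visible GPUs
```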
What's your command? I also ran into this bug, and adding --include localhost:0 so that only cuda:0 is used fixed it for me.
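A launcher-independent alternative (not the exact fix used here) is to hide the other GPUs before CUDA is initialized, which has the same effect of keeping everything on a single device:

```python
import os

# Must run before torch touches CUDA, e.g. at the very top of the script.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
print(torch.cuda.device_count())  # -> 1: only cuda:0 is visible to this process
```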
@LinB203 I am not using the script; I am accessing the model directly from code:
```python
import torch
from PIL import Image

# VLMModel and CACHE_DIR are defined elsewhere in my codebase.
class MoE_LLaVAModel(VLMModel):
    def __init__(self, model_name):
        from moellava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
        from moellava.model.builder import load_pretrained_model
        from moellava.utils import disable_torch_init
        from moellava.mm_utils import get_model_name_from_path

        disable_torch_init()
        self.tokenizer, self.model, processor, _ = load_pretrained_model(
            model_name,
            None,
            get_model_name_from_path(model_name),
            device_map='cuda',  # <----- adding this fixes the issue
            cache_dir=CACHE_DIR)
        self.model.eval()
        self.image_processor = processor['image']

        # Pick the conversation template matching the LLM backbone.
        conv_mode = None
        if "stablelm" in model_name.lower():
            conv_mode = "stablelm"
        elif "phi" in model_name.lower():
            conv_mode = "phi"
        elif "qwen" in model_name.lower():
            conv_mode = "qwen"
        else:
            raise ValueError(f"Unknown conversation {model_name}")
        self.conv_mode = conv_mode

    @staticmethod
    def get_MoE_LLaVA_StableLM():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-StableLM-1.6B-4e')

    @staticmethod
    def get_MoE_LLaVA_Qwen():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-Qwen-1.8B-4e')

    @staticmethod
    def get_MoE_LLaVA_Phi2():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-Phi2-2.7B-4e')

    @staticmethod
    def get_MoE_LLaVA_Phi2_384():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-Phi2-2.7B-4e-384')

    def forward(self, prompt: str, image_path: str) -> dict:
        from moellava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
        from moellava.conversation import conv_templates, SeparatorStyle
        from moellava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

        image = Image.open(image_path).convert('RGB')
        conv = conv_templates[self.conv_mode].copy()
        image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
            'pixel_values'].to(self.model.device, dtype=torch.float16)

        # Prepend the image token and build the chat prompt.
        inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(
            prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device)
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(
            keywords, self.tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        # Decode only the newly generated tokens.
        outputs = self.tokenizer.decode(
            output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
        return {'response': outputs}
```
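For reference, using the wrapper above then looks roughly like this (prompt and image path are placeholders):

```python
vlm = MoE_LLaVAModel.get_MoE_LLaVA_Phi2()
out = vlm.forward("What is shown in this image?", "/path/to/image.jpg")
print(out['response'])
```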
I wonder which GPU is actually used when you add device_map='cuda'? Is it only cuda:0 (even if you have 2 or more GPUs)?
Btw, nice code recipe.
Thanks! And yes, it then indeed runs only on cuda:0.
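A quick way to confirm that (just an illustrative check, using the vlm instance from the recipe above):

```python
# With device_map='cuda' all parameters should report cuda:0;
# with the default device_map they may be spread over several GPUs.
print({str(p.device) for p in vlm.model.parameters()})  # expected: {'cuda:0'}
```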
Hi,
when I try to run inference with any MoE-LLaVA model on a node with 4x A100, I run into an issue with tensor allocation.
I have installed MoE-LLaVA from the latest main commit (188d462).
All tensors (input_ids and the image tensor), including the model, are located on cuda:0 before I run model.generate. Could this be related to https://github.com/haotian-liu/LLaVA/issues/769?
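If it is the same problem as in that LLaVA issue, the mismatch would come from the model itself being sharded across the four GPUs even though the inputs sit on cuda:0. An illustrative check (where model is whatever load_pretrained_model returned):

```python
from collections import Counter

# More than one entry here means the default device_map split the
# model across GPUs, which triggers the cross-device tensor error.
print(Counter(str(p.device) for p in model.parameters()))
```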