Open paulgavrikov opened 9 months ago
Just for completeness: adding device_map='cuda' to the load_pretrained_model call fixes this.
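A minimal sketch of what that looks like (the checkpoint name is just an example; the call mirrors the recipe posted further down in this thread):

```python
from moellava.model.builder import load_pretrained_model
from moellava.mm_utils import get_model_name_from_path

model_path = 'LanguageBind/MoE-LLaVA-Phi2-2.7B-4e'  # example checkpoint
tokenizer, model, processor, _ = load_pretrained_model(
    model_path,
    None,  # no separate model_base
    get_model_name_from_path(model_path),
    device_map='cuda')  # <- keeps the whole model on one GPU instead of spreading it across all visible GPUs
```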
What's your command? I also ran into this bug, and adding --include localhost:0 so that only cuda:0 is used fixed it for me.
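A launcher-independent alternative (not the exact fix used here) is to hide the other GPUs before CUDA is initialized, which has the same effect of keeping everything on a single device:

```python
import os

# Must run before torch touches CUDA, e.g. at the very top of the script.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
print(torch.cuda.device_count())  # -> 1: only cuda:0 is visible to this process
```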
@LinB203 I am not using the script; I am accessing the model directly from code:
```python
import torch
from PIL import Image

# VLMModel and CACHE_DIR are defined elsewhere in my codebase.
class MoE_LLaVAModel(VLMModel):
    def __init__(self, model_name):
        from moellava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
        from moellava.model.builder import load_pretrained_model
        from moellava.utils import disable_torch_init
        from moellava.mm_utils import get_model_name_from_path

        disable_torch_init()
        self.tokenizer, self.model, processor, _ = load_pretrained_model(
            model_name,
            None,
            get_model_name_from_path(model_name),
            device_map='cuda',  # <----- adding this fixes the issue
            cache_dir=CACHE_DIR)
        self.model.eval()
        self.image_processor = processor['image']

        # Pick the conversation template matching the LLM backbone.
        conv_mode = None
        if "stablelm" in model_name.lower():
            conv_mode = "stablelm"
        elif "phi" in model_name.lower():
            conv_mode = "phi"
        elif "qwen" in model_name.lower():
            conv_mode = "qwen"
        else:
            raise ValueError(f"Unknown conversation {model_name}")
        self.conv_mode = conv_mode

    @staticmethod
    def get_MoE_LLaVA_StableLM():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-StableLM-1.6B-4e')

    @staticmethod
    def get_MoE_LLaVA_Qwen():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-Qwen-1.8B-4e')

    @staticmethod
    def get_MoE_LLaVA_Phi2():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-Phi2-2.7B-4e')

    @staticmethod
    def get_MoE_LLaVA_Phi2_384():
        return MoE_LLaVAModel('LanguageBind/MoE-LLaVA-Phi2-2.7B-4e-384')

    def forward(self, prompt: str, image_path: str) -> dict:
        from moellava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
        from moellava.conversation import conv_templates, SeparatorStyle
        from moellava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

        image = Image.open(image_path).convert('RGB')
        conv = conv_templates[self.conv_mode].copy()
        image_tensor = self.image_processor.preprocess(image, return_tensors='pt')[
            'pixel_values'].to(self.model.device, dtype=torch.float16)

        # Prepend the image token and build the chat prompt.
        inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(
            prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device)
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(
            keywords, self.tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        # Decode only the newly generated tokens.
        outputs = self.tokenizer.decode(
            output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
        return {'response': outputs}
```
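For reference, using the wrapper above then looks roughly like this (prompt and image path are placeholders):

```python
vlm = MoE_LLaVAModel.get_MoE_LLaVA_Phi2()
out = vlm.forward("What is shown in this image?", "/path/to/image.jpg")
print(out['response'])
```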
I wonder which GPU is actually used when you add device_map='cuda'? Is it only cuda:0 (even if you have 2 or more GPUs)?
Btw, nice code recipe.
Thanks! And yes, it then indeed runs only on cuda:0.
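A quick way to confirm that (just an illustrative check, using the vlm instance from the recipe above):

```python
# With device_map='cuda' all parameters should report cuda:0;
# with the default device_map they may be spread over several GPUs.
print({str(p.device) for p in vlm.model.parameters()})  # expected: {'cuda:0'}
```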
Hi,
when I try to run inference with any MoE-LLaVA model on a node with 4x A100, I run into an issue with tensor allocation.
I have installed MoE-LLaVA from the latest main commit (188d462).
All tensors (input_ids and the image tensor), including the model, are located on cuda:0 before I run model.generate. Could this be related to https://github.com/haotian-liu/LLaVA/issues/769?
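If it is the same problem as in that LLaVA issue, the mismatch would come from the model itself being sharded across the four GPUs even though the inputs sit on cuda:0. An illustrative check (where model is whatever load_pretrained_model returned):

```python
from collections import Counter

# More than one entry here means the default device_map split the
# model across GPUs, which triggers the cross-device tensor error.
print(Counter(str(p.device) for p in model.parameters()))
```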