InternLM / InternLM-XComposer

InternLM-XComposer-2.5: A Versatile Large Vision Language Model Supporting Long-Contextual Input and Output
Apache License 2.0

Multi-GPU inference for local ShareCaptioner #156

Closed: zihui-debug closed this issue 7 months ago

zihui-debug commented 8 months ago

How can I load the ShareCaptioner model on multiple GPUs? I used device_map='auto', but only one GPU is used. I also tried accelerate.init_empty_weights(); here is the code:

if __name__ == '__main__':
    args = parse_args()
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, trust_remote_code=True)

    cuda_list = '1,2,3'.split(',')
    memory = '20GiB'
    NUM_GPUS = torch.cuda.device_count() if torch.cuda.is_available() else None
    print('num gpus:', NUM_GPUS)

    max_memory = {int(cuda): memory for cuda in cuda_list}
    config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True)
    device_dtype = torch.half if NUM_GPUS > 0 else torch.float
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config, torch_dtype=device_dtype, trust_remote_code=True)  # load onto the meta device: takes no time and uses no RAM or VRAM

    device_map = infer_auto_device_map(model, max_memory=max_memory)  # automatically assign each layer to a device
    model = load_checkpoint_and_dispatch(model, '/root/.cache/huggingface/hub/models--Lin-Chen--ShareCaptioner/snapshots/8f9c7566789e68f07294f90ee6e6b688864f167c', device_map='auto')  # load the weights
    model.eval()
    model.tokenizer = tokenizer

    # model.cuda()
    model.half()

    imgs = json.load(open(args.images_file, 'r'))
    part_len = len(imgs)

    seg1 = '<|User|>:'
    seg2 = f'Analyze the image in a comprehensive and detailed manner.{model.eoh}\n<|Bot|>:'
    seg_emb1 = model.encode_text(seg1, add_special_tokens=True)
    seg_emb2 = model.encode_text(seg2, add_special_tokens=False)

    captions = []

    chunk_size = len(imgs)//args.batch_size

    if len(imgs) % args.batch_size != 0:
        chunk_size += 1

    for i in range(chunk_size):
        print(f'{i}/{chunk_size}')
        subs = []
        for j in range(args.batch_size):
            if i*args.batch_size+j < len(imgs):
                img_path = imgs[i*args.batch_size+j]
                image = Image.open(img_path).convert("RGB")
                subs.append(model.vis_processor(image).unsqueeze(0))
        if len(subs) == 0:
            break
        subs = torch.cat(subs, dim=0).cuda()
        tmp_bs = subs.shape[0]
        tmp_seg_emb1 = seg_emb1.repeat(tmp_bs, 1, 1)
        tmp_seg_emb2 = seg_emb2.repeat(tmp_bs, 1, 1)
        with torch.cuda.amp.autocast():
            with torch.no_grad():
                subs = model.encode_img(subs)
                input_emb = torch.cat(
                    [tmp_seg_emb1, subs, tmp_seg_emb2], dim=1)
                out_embeds = model.internlm_model.generate(inputs_embeds=input_emb,
                                                           max_length=500,
                                                           num_beams=3,
                                                           min_length=1,
                                                           do_sample=True,
                                                           repetition_penalty=1.5,
                                                           length_penalty=1.0,
                                                           temperature=1.,
                                                           eos_token_id=model.tokenizer.eos_token_id,
                                                           num_return_sequences=1,
                                                           )
        for j, out in enumerate(out_embeds):
            out[out == -1] = 2
            response = model.decode_text([out])
            captions.append({imgs[i*args.batch_size+j]: response})

    with open(args.save_path, 'w') as f:
        json.dump(captions, f, indent=4)
    print('Done')

The error output:

[2024-02-02 22:40:38,687] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
num gpus: 8
Set max length to 20
Init VIT ... Position interpolate from 16x16 to 32x32
Done
Init Perceive Sampler ... Done
Init InternLM ... Done
0/100
Traceback (most recent call last):
  File "/root/zhengsr/InternLM-XComposer/projects/ShareGPT4V/tools/multi_infer.py", line 123, in <module>
    subs = model.encode_img(subs)
  File "/root/.cache/huggingface/modules/transformers_modules/Lin-Chen/ShareCaptioner/8f9c7566789e68f07294f90ee6e6b688864f167c/modeling_InternLM_XComposer.py", line 120, in encode_img
    inputs_internlm = torch.cat([
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:7! (when checking argument for argument tensors in method wrapper_CUDA_cat)

Is there any guidance? Thank you.
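
One detail worth flagging in the snippet above: infer_auto_device_map computes a device_map, but load_checkpoint_and_dispatch is then called with device_map='auto', so the computed map is discarded and accelerate is free to split the vision encoder and the language model across GPUs, which matches the cuda:0 / cuda:7 mismatch inside encode_img. A minimal sketch of the dispatch step that reuses the computed map and keeps each decoder block on one device (InternLMDecoderLayer is an assumed class name; check the remote modeling code for the actual one):

# Sketch only: pass the computed device_map instead of recomputing with 'auto'.
# 'InternLMDecoderLayer' is an assumed class name for the decoder blocks;
# inspect modeling_InternLM_XComposer.py for the actual module classes.
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=['InternLMDecoderLayer'],
)
model = load_checkpoint_and_dispatch(
    model,
    '/root/.cache/huggingface/hub/models--Lin-Chen--ShareCaptioner/snapshots/8f9c7566789e68f07294f90ee6e6b688864f167c',
    device_map=device_map,  # the map computed above, not 'auto'
)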

wp1811983038 commented 7 months ago
@zihui-debug 
def __init__(self, code_path, num_gpus=1):
    self.code_path = code_path

    # load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(code_path, trust_remote_code=True)
    self.chat_model = AutoModelForCausalLM.from_pretrained(code_path, device_map='cuda', trust_remote_code=True).half().eval()
    self.chat_model.tokenizer = tokenizer

    # if more than one GPU is available, use DataParallel
    if torch.cuda.device_count() > 1:
        print(f"Let's use {torch.cuda.device_count()} GPUs!")
        self.chat_model = torch.nn.DataParallel(self.chat_model)

    self.chat_model.to('cuda')  # make sure the model is on CUDA

    stop_words_ids = [92542]
    self.stopping_criteria = get_stopping_criteria(stop_words_ids)
    set_random_seed(1234)

Give this a try; it works on my two GPUs.
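
One caveat with this approach: torch.nn.DataParallel only parallelizes forward() across the batch dimension, and once wrapped, custom attributes such as .tokenizer live on the inner module. A minimal unwrap sketch, assuming a model wrapped as above:

import torch

def unwrap(model):
    # DataParallel keeps the original model in .module; custom attributes
    # such as .tokenizer are only reachable on the inner module.
    return model.module if isinstance(model, torch.nn.DataParallel) else model

# e.g. tokenizer = unwrap(chat_model).tokenizer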
zihui-debug commented 7 months ago

@wp1811983038 For me the GPU runs out of memory at the from_pretrained loading step, on a 24 GB card. How much memory do your GPUs have?

wp1811983038 commented 7 months ago

Two 32 GB cards. It only succeeded once; after that I kept hitting various errors.

panzhang0212 commented 7 months ago

@zihui-debug @wp1811983038 We now support loading the ShareCaptioner model on multiple GPUs. Please refer to https://github.com/InternLM/InternLM-XComposer/blob/main/projects/ShareGPT4V/tools/share-cap_batch_infer.py and set --num_gpus.

Feel free to reopen this issue if you run into any problems.
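
For reference, a hypothetical invocation of that script (only --num_gpus is confirmed in this thread; check the script's argument parser for the remaining flags):

python projects/ShareGPT4V/tools/share-cap_batch_infer.py --num_gpus 4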