OpenGVLab / InternVL

[CVPR 2024 Oral] InternVL Family: A Pioneering Open-Source Alternative to GPT-4o. 接近GPT-4o表现的开源多模态对话模型
https://internvl.readthedocs.io/en/latest/
MIT License
5.95k stars 462 forks source link

Repeat answer. #101

Closed LinB203 closed 6 months ago

LinB203 commented 6 months ago
import json
import os
# from internvl.model.internvl_chat import InternVLChatModel
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch
import torchvision.transforms as T
from PIL import Image

from torchvision.transforms.functional import InterpolationMode

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

path = "/remote-home1/yeyang/InternLM-XComposer/projects/InternVL/InternVL-Chat-V1-5"
# If you have an 80G A100 GPU, you can put the entire model on a single GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).eval().cuda()
# Otherwise, you need to set device_map='auto' to use multiple GPUs for inference.
# model = AutoModel.from_pretrained(
#     path,
#     torch_dtype=torch.bfloat16,
#     low_cpu_mem_usage=True,
#     trust_remote_code=True,
#     device_map='auto').eval()

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
# set the max number of tiles in `max_num`
pixel_values = load_image('/remote-home1/dataset/images/00000/000009967.jpg', max_num=6).to(torch.bfloat16).cuda()

generation_config = dict(
    num_beams=1,
    max_new_tokens=512,
    do_sample=False,
)

# single-round conversation
question = "Describe the image in a comprehensive and detailed manner."
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(response)

Output:

<|im_start|>system
You are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user
<image>
Describe the image in a comprehensive and detailed manner.<|im_end|><|im_start|>assistant
 In the image, a woman is the central figure, elegantly dressed in a white wedding gown. The gown is strapless, featuring a sweetheart neckline that adds a touch of romance to her ensemble. The dress is adorned with lace details, giving it a classic and timeless appeal. A long train extends from the back of the dress, adding a dramatic flair to her attire.

She is seated on a wooden chair, her posture relaxed yet poised. Her left hand rests gently on her thigh, while her right hand is placed on the arm of the chair. The chair is positioned in front of a wooden table, which holds a vase filled with white flowers, adding a touch of nature to the scene.

The backdrop is a mural painted in warm hues of brown and gold, providing a rich and inviting atmosphere. The woman's blonde hair is styled in an updo, and she wears a veil that cascades down her back, completing her bridal look. The image captures a moment of serene beauty, as the woman in her wedding gown sits gracefully in front of the mural.
In the image, a woman is the central figure, elegantly dressed in a white wedding gown. The gown is strapless, featuring a sweetheart neckline that adds a touch of romance to her ensemble. The dress is adorned with lace details, giving it a classic and timeless appeal. A long train extends from the back of the dress, adding a dramatic flair to her attire.

She is seated on a wooden chair, her posture relaxed yet poised. Her left hand rests gently on her thigh, while her right hand is placed on the arm of the chair. The chair is positioned in front of a wooden table, which holds a vase filled with white flowers, adding a touch of nature to the scene.

The backdrop is a mural painted in warm hues of brown and gold, providing a rich and inviting atmosphere. The woman's blonde hair is styled in an updo, and she wears a veil that cascades down her back, completing her bridal look. The image captures a moment of serene beauty, as the woman in her wedding gown sits gracefully in front of the mural.

000009967

FailSpy commented 6 months ago

Default behavior on model.chat is to print the response, and then you're printing it after. You'll see if you remove/comment out your print that it will still print the response, but just once.

czczup commented 6 months ago

Default behavior on model.chat is to print the response, and then you're printing it after. You'll see if you remove/comment out your print that it will still print the response, but just once.

Yes, that's right.