h2oai / h2ogpt

Private chat with local GPT with documents, images, video, etc. 100% private, Apache 2.0. Supports oLLaMa, Mixtral, llama.cpp, and more. Demo: https://gpt.h2o.ai/ https://gpt-docs.h2o.ai/
http://h2o.ai
Apache License 2.0

For image batching, do parallel jobs instead of many images per batch; some models are just not good enough to handle multiple images #1737

Open pseudotensor opened 1 month ago

pseudotensor commented 1 month ago
import cv2
from openai import OpenAI

# OpenAI-compatible client pointed at the local model server
client = OpenAI(base_url='http://<ip>:80/v1')
model = "OpenGVLab/InternVL2-26B"
#client = OpenAI(base_url='http://<ip>:80/v1')
#model = 'OpenGVLab/InternVL-Chat-V1-5'

prompt = """<response_instructions>
- Act as a keen observer with a sharp eye for detail.
- Analyze the content within the images.
- Provide insights based on your observations.
- Avoid making up facts.
- Finally, according to our chat history, above documents, above figure captions, or given images, generate a well-structured response.
</response_instructions>
What tower do you see in the image?
"""

from PIL import Image
import base64
import requests
from io import BytesIO

# The encoding function I linked previously - but we actually don't use this function in the API server
def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
    """encode image to base64 format."""

    buffered = BytesIO()
    if format == 'JPEG':
        image = image.convert('RGB')
    image.save(buffered, format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

# This is what we use in the API server to load the base64 string to image
def load_image_from_base64(image: str):
    """Load image from base64 format."""
    return Image.open(BytesIO(base64.b64decode(image)))

image1 = '/tmp/image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952.jpeg'
image2 = '/tmp/image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378.jpeg'
image3 = '/tmp/image_file_ac5589e7-92a3-470f-a933-40d6bad38052.jpeg'

#from PIL import Image

def remove_padding(image_path, output_path, background_color=(255, 255, 255)):
    # Read the image
    image = cv2.imread(image_path)

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a binary threshold to get a binary image
    _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY)

    # Invert the binary image
    inverted_binary = cv2.bitwise_not(binary)

    # Find contours
    contours, _ = cv2.findContours(inverted_binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Get the bounding box of the largest contour
    x, y, w, h = cv2.boundingRect(contours[0])
    for contour in contours:
        x1, y1, w1, h1 = cv2.boundingRect(contour)
        if w1 * h1 > w * h:
            x, y, w, h = x1, y1, w1, h1

    # Crop the image to the bounding box
    cropped_image = image[y:y+h, x:x+w]

    # Save the cropped image
    cv2.imwrite(output_path, cropped_image)

# Optionally crop padding from the images before sending (disabled here)
if False:
    ext = 'b.jpg'
    remove_padding(image1, image1 + ext)
    remove_padding(image2, image2 + ext)
    remove_padding(image3, image3 + ext)
else:
    ext = ''

# Base64-encode the (optionally cropped) image files for the data URLs below
image1_64 = base64.b64encode(open(image1 + ext, 'rb').read()).decode('utf-8')
image2_64 = base64.b64encode(open(image2 + ext, 'rb').read()).decode('utf-8')
image3_64 = base64.b64encode(open(image3 + ext, 'rb').read()).decode('utf-8')

system_prompt = "You are h2oGPTe, an expert question-answering AI system created by H2O.ai that performs like GPT-4 by OpenAI."

messages = [
    #{'role': 'system', 'content': system_prompt},
    {
        'role': 'user',
        'content': [
            {'type': 'image_url',
             'image_url': {
                'url': 'data:image/jpeg;base64,' + image1_64,
                }
             },
            {'type': 'image_url',
             'image_url': {
                'url': 'data:image/jpeg;base64,' + image2_64,
                }
             },
            {'type': 'image_url',
             'image_url': {
                'url': 'data:image/jpeg;base64,' + image3_64,
                }
             },
            {'type': 'text', 'text': prompt},
        ],
    }
]

response = client.chat.completions.create(
    model=model,
    messages=messages,
    max_tokens=300,
    temperature=0.0,
)

print(response.choices[0])

gives:

The image does not show a tower. Instead, it shows two separate items:

1. A receipt from a shopping store.
2. A cake with a message congratulating Kate and Duke on their upcoming arrival.

If you have any specific questions about these items, please let me know!
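
The title proposes running one parallel job per image instead of packing several images into a single request. A minimal sketch of that approach, reusing the client, model, prompt, and base64 strings from the script above (the ask_one_image helper and the ThreadPoolExecutor fan-out are illustrative assumptions, not the fix implemented in h2oGPT):

from concurrent.futures import ThreadPoolExecutor

def ask_one_image(image_b64: str) -> str:
    """Send a single image per request so weaker models are not confused by batches."""
    messages = [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url',
                 'image_url': {'url': 'data:image/jpeg;base64,' + image_b64}},
                {'type': 'text', 'text': prompt},
            ],
        }
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=300,
        temperature=0.0,
    )
    return response.choices[0].message.content

# One request per image, run concurrently; results come back in input order.
with ThreadPoolExecutor(max_workers=3) as pool:
    answers = list(pool.map(ask_one_image, [image1_64, image2_64, image3_64]))

for path, answer in zip([image1, image2, image3], answers):
    print(path, '->', answer)

Threads are sufficient here because each call only blocks on network I/O; if a single combined answer is needed, the per-image responses can be merged afterwards in a follow-up text-only request.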
pseudotensor commented 1 month ago

[Attached images: image_file_ac5589e7-92a3-470f-a933-40d6bad38052, image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378, image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952]