h2oai / h2ogpt

Private chat with local GPT with documents, images, video, etc. 100% private, Apache 2.0. Supports oLLaMa, Mixtral, llama.cpp, and more. Demo: https://gpt.h2o.ai/ https://gpt-docs.h2o.ai/
http://h2o.ai
Apache License 2.0

For image batching, do parallel jobs instead of many images per batch; some models are just not good enough to handle multiple images #1737

Open pseudotensor opened 1 month ago

pseudotensor commented 1 month ago
import cv2
from openai import OpenAI

# OpenAI-compatible client pointed at the local model server
client = OpenAI(base_url='http://<ip>:80/v1')
model = "OpenGVLab/InternVL2-26B"
#client = OpenAI(base_url='http://<ip>:80/v1')
#model = 'OpenGVLab/InternVL-Chat-V1-5'

prompt = """<response_instructions>
- Act as a keen observer with a sharp eye for detail.
- Analyze the content within the images.
- Provide insights based on your observations.
- Avoid making up facts.
- Finally, according to our chat history, above documents, above figure captions, or given images, generate a well-structured response.
</response_instructions>
What tower do you see in the image?
"""

from PIL import Image
import base64
import requests
from io import BytesIO

# The encoding function I linked previously - but we actually don't use this function in the API server
def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
    """encode image to base64 format."""

    buffered = BytesIO()
    if format == 'JPEG':
        image = image.convert('RGB')
    image.save(buffered, format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

# This is what we use in the API server to load the base64 string to image
def load_image_from_base64(image: str):
    """Load image from base64 format."""
    return Image.open(BytesIO(base64.b64decode(image)))

image1 = '/tmp/image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952.jpeg'
image2 = '/tmp/image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378.jpeg'
image3 = '/tmp/image_file_ac5589e7-92a3-470f-a933-40d6bad38052.jpeg'

#from PIL import Image

def remove_padding(image_path, output_path, background_color=(255, 255, 255)):
    # Read the image
    image = cv2.imread(image_path)

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply a binary threshold to get a binary image
    _, binary = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY)

    # Invert the binary image
    inverted_binary = cv2.bitwise_not(binary)

    # Find contours
    contours, _ = cv2.findContours(inverted_binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Get the bounding box of the largest contour
    x, y, w, h = cv2.boundingRect(contours[0])
    for contour in contours:
        x1, y1, w1, h1 = cv2.boundingRect(contour)
        if w1 * h1 > w * h:
            x, y, w, h = x1, y1, w1, h1

    # Crop the image to the bounding box
    cropped_image = image[y:y+h, x:x+w]

    # Save the cropped image
    cv2.imwrite(output_path, cropped_image)

# Optionally crop padding from the images before sending (disabled here)
if False:
    ext = 'b.jpg'
    remove_padding(image1, image1 + ext)
    remove_padding(image2, image2 + ext)
    remove_padding(image3, image3 + ext)
else:
    ext = ''

# Base64-encode the (optionally cropped) image files for the data URLs below
image1_64 = base64.b64encode(open(image1 + ext, 'rb').read()).decode('utf-8')
image2_64 = base64.b64encode(open(image2 + ext, 'rb').read()).decode('utf-8')
image3_64 = base64.b64encode(open(image3 + ext, 'rb').read()).decode('utf-8')

system_prompt = "You are h2oGPTe, an expert question-answering AI system created by H2O.ai that performs like GPT-4 by OpenAI."

messages = [
    #{'role': 'system', 'content': system_prompt},
    {
        'role': 'user',
        'content': [
            {'type': 'image_url',
             'image_url': {
                'url': 'data:image/jpeg;base64,' + image1_64,
                }
             },
            {'type': 'image_url',
             'image_url': {
                'url': 'data:image/jpeg;base64,' + image2_64,
                }
             },
            {'type': 'image_url',
             'image_url': {
                'url': 'data:image/jpeg;base64,' + image3_64,
                }
             },
            {'type': 'text', 'text': prompt},
        ],
    }
]

response = client.chat.completions.create(
    model=model,
    messages=messages,
    max_tokens=300,
    temperature=0.0,
)

print(response.choices[0])

gives:

The image does not show a tower. Instead, it shows two separate items:

1. A receipt from a shopping store.
2. A cake with a message congratulating Kate and Duke on their upcoming arrival.

If you have any specific questions about these items, please let me know!
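
The title proposes running one parallel job per image instead of packing several images into a single request. A minimal sketch of that approach, reusing the client, model, prompt, and base64 strings from the script above (the ask_one_image helper and the ThreadPoolExecutor fan-out are illustrative assumptions, not the fix implemented in h2oGPT):

from concurrent.futures import ThreadPoolExecutor

def ask_one_image(image_b64: str) -> str:
    """Send a single image per request so weaker models are not confused by batches."""
    messages = [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url',
                 'image_url': {'url': 'data:image/jpeg;base64,' + image_b64}},
                {'type': 'text', 'text': prompt},
            ],
        }
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=300,
        temperature=0.0,
    )
    return response.choices[0].message.content

# One request per image, run concurrently; results come back in input order.
with ThreadPoolExecutor(max_workers=3) as pool:
    answers = list(pool.map(ask_one_image, [image1_64, image2_64, image3_64]))

for path, answer in zip([image1, image2, image3], answers):
    print(path, '->', answer)

Threads are sufficient here because each call only blocks on network I/O; if a single combined answer is needed, the per-image responses can be merged afterwards in a follow-up text-only request.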
pseudotensor commented 1 month ago

[Attached images: image_file_ac5589e7-92a3-470f-a933-40d6bad38052, image_file_1bfb88ea-a545-4b1f-a31f-051dbb90a378, image_file_764ae7bd-6b02-4ffb-b9d6-83e754c30952]