ValueError: wrong pixel_values size: torch.Size([1, 1]) when running DataPipelineLLM.py. [Bug]

Checklist

[X] 1. I have searched related issues but cannot get the expected help.
[X] 2. The bug has not been fixed in the latest version.
[x] 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.

Describe the bug

When processing a PDF file, the script extracts images from the PDF and passes them to the model to generate captions and descriptions. However, the pixel_values tensor is incorrectly formed, resulting in a shape of [1, 1]. This causes a ValueError in the vision model’s forward pass, indicating that the pixel_values size is incorrect.

Reproduction

Setup and activate python env.


conda create -n env python=3.11
conda activate env

pip install transformers==4.37.2 flask psutil pillow pytesseract torch accelerate torchvision unstructured pdfminer.six pillow_heif unstructured-inference layoutparser opencv-python-headless sentencepiece einops flash_attn unstructured_pytesseract pillow_heif

sudo apt-get update
sudo apt-get install poppler-utils

sudo apt-get install tesseract-ocr

mkdir -p offload

Prepare the DataPipelineLLM.py script:

Use the following script for DataPipelineLLM.py:


import os
import logging
import json
import torch
import psutil
from flask import Flask, request, jsonify
from PIL import Image
from io import BytesIO
import pytesseract
from transformers import AutoTokenizer, AutoModel
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from unstructured.partition.pdf import partition_pdf

# Configure logging
logging.basicConfig(level=logging.INFO)

# Initialize Flask app
app = Flask(__name__)

# Function to log memory usage
def log_memory_usage():
    process = psutil.Process()
    mem_info = process.memory_info()
    logging.info(f"RSS: {mem_info.rss / (1024 ** 2):.2f} MB")

log_memory_usage()

# Initialize device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model configuration
tokenizer = AutoTokenizer.from_pretrained("OpenGVLab/InternVL-Chat-V1-5", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "OpenGVLab/InternVL-Chat-V1-5",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True
).eval().to(device)

def build_transform(input_size):
    IMAGENET_MEAN = (0.485, 0.456, 0.406)
    IMAGENET_STD = (0.229, 0.224, 0.225)
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    ])
    return transform

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    target_ratios = sorted(
        ((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
         if i * j <= max_num and i * j >= min_num), key=lambda x: x[0] * x[1]
    )

    target_aspect_ratio = min(target_ratios, key=lambda x: abs((x[0] / x[1]) - aspect_ratio))
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)

    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)

    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    logging.info(f"Number of processed images: {len(images)}")  # Log the number of processed images
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    logging.info(f"Loaded image tensor shape: {pixel_values.shape}")  # Log the shape of pixel_values
    return pixel_values

def extract_content_from_pdf(file_path):
    elements = partition_pdf(file_path)
    text = "\n".join([str(element) for element in elements if hasattr(element, 'type') and element.type == 'Text'])
    images = [element for element in elements if hasattr(element, 'type') and element.type == 'Image']
    return text, images

def ocr_image(image_bytes):
    image = Image.open(BytesIO(image_bytes))
    text = pytesseract.image_to_string(image)
    return text

def generate_text(prompt, max_length=150):
    if model:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        logging.info(f"Input IDs shape: {inputs.input_ids.shape}")  # Log the shape of input_ids
        if not hasattr(model, 'img_context_token_id') or model.img_context_token_id is None:
            model.img_context_token_id = tokenizer.convert_tokens_to_ids("<image>")
        outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1)
        text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return text
    else:
        return "Model loading failed. Unable to generate text."

def caption_image(image_bytes):
    image = Image.open(BytesIO(image_bytes))
    pixel_values = load_image(image_bytes)
    image_description = "A description of the image for captioning purposes."
    prompt = f"<image>\nGenerate a caption for the following image description: {image_description}"
    caption = generate_text(prompt)
    return caption

def explain_image(caption):
    prompt = f"Explain this image based on the caption: {caption}"
    explanation = generate_text(prompt)
    return explanation

def process_pdf(file_path):
    text, images = extract_content_from_pdf(file_path)
    logging.info(f"Extracted text: {text[:100]}")  # Log the beginning of the extracted text
    processed_text = generate_text(text)

    ocr_results = [ocr_image(img) for img in images]
    captions = [caption_image(img) for img in images]
    explanations = [explain_image(caption) for caption in captions]

    data = {
        "text": processed_text,
        "images": [{"image": img, "ocr_text": ocr, "caption": caption, "explanation": explanation}
                   for img, ocr, caption, explanation in zip(images, ocr_results, captions, explanations)]
    }

    return data

def store_data_locally(data, output_folder, file_name):
    output_path = os.path.join(output_folder, f"{file_name}.json")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    logging.info(f"Saved data to {output_path}")

@app.route('/process', methods=['POST'])
def process_file():
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No file selected for uploading'}), 400
    if file and file.filename.endswith('.pdf'):
        input_folder = "./input_files"
        output_folder = "./output_files"

        if not os.path.exists(input_folder):
            os.makedirs(input_folder)
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        file_path = os.path.join(input_folder, file.filename)
        file.save(file_path)

        data = process_pdf(file_path)
        file_name = os.path.splitext(file.filename)[0]
        store_data_locally(data, output_folder, file_name)

        return jsonify({'message': f'File processed successfully and saved as {file_name}.json'}), 200
    else:
        return jsonify({'error': 'Allowed file types are pdf'}), 400

if __name__ == "__main__":
    app.run(host='0.0.0.0', port=5000)

After running the script send the request using curl or postman


curl -X POST -F 'file=@./datatext.pdf' http://127.0.0.1:5000/process

Environment

(env) user@5cb2b72b-4b1d-4df3-8361-1cd4aff1550f:~$ env
SHELL=/bin/bash
CONDA_MKL_INTERFACE_LAYER_BACKUP=
CONDA_EXE=/home/user/mambaforge/bin/conda
_CE_M=
XML_CATALOG_FILES=file:///home/user/mambaforge/envs/env/etc/xml/catalog file:///etc/xml/catalog
PWD=/home/user
GSETTINGS_SCHEMA_DIR=/home/user/mambaforge/envs/env/share/glib-2.0/schemas
LOGNAME=user
XDG_SESSION_TYPE=tty
CONDA_PREFIX=/home/user/mambaforge/envs/env
JUPYTER_SERVER_URL=http://5cb2b72b-4b1d-4df3-8361-1cd4aff1550f:8888/
GSETTINGS_SCHEMA_DIR_CONDA_BACKUP=
MOTD_SHOWN=pam
LINES=48
HOME=/home/user
LANG=C.UTF-8
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
COLUMNS=147
CONDA_PROMPT_MODIFIER=(env) 
LC_TERMINAL=iTerm2
SSH_CONNECTION=154.192.139.31 34046 192.168.122.135 22
LESSCLOSE=/usr/bin/lesspipe %s %s
XDG_SESSION_CLASS=user
JUPYTER_SERVER_ROOT=/home/user
TERM=xterm-256color
_CE_CONDA=
LESSOPEN=| /usr/bin/lesspipe %s
USER=user
CONDA_SHLVL=1
LC_TERMINAL_VERSION=3.5.3
SHLVL=2
PYXTERM_DIMENSIONS=80x25
XDG_SESSION_ID=8
CONDA_PYTHON_EXE=/home/user/mambaforge/bin/python
LD_LIBRARY_PATH=/usr/local/cuda-12.5/lib64:/usr/local/cuda-12.5/lib64:
XDG_RUNTIME_DIR=/run/user/1000
SSH_CLIENT=xxx.xxx.xxx.xx 34046 22
CONDA_DEFAULT_ENV=env
XDG_DATA_DIRS=/usr/local/share:/usr/share:/var/lib/snapd/desktop
PATH=/usr/local/cuda-12.5/bin:/home/user/mambaforge/envs/env/bin:/usr/local/cuda-12.5/bin:/home/user/mambaforge/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin
DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus
SSH_TTY=/dev/pts/0
MKL_INTERFACE_LAYER=LP64,GNU
_=/usr/bin/env
(env) user@5cb2b72b-4b1d-4df3-8361-1cd4aff1550f:~$ 

(env) user@5cb2b72b-4b1d-4df3-8361-1cd4aff1550f:~$ pip list
Package                   Version
------------------------- --------------
absl-py                   2.1.0
accelerate                0.33.0
aiohttp                   3.9.5
aiosignal                 1.3.1
annotated-types           0.7.0
anyio                     4.4.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
astunparse                1.6.3
async-lru                 2.0.4
attrs                     23.2.0
Babel                     2.14.0
backoff                   2.2.1
beautifulsoup4            4.12.3
bleach                    6.1.0
blinker                   1.8.2
blis                      0.7.10
Bottleneck                1.3.5
Brotli                    1.1.0
cached-property           1.5.2
cachetools                5.3.3
catalogue                 2.0.10
certifi                   2024.6.2
cffi                      1.16.0
chardet                   5.2.0
charset-normalizer        3.3.2
click                     8.1.7
cloudpathlib              0.18.1
colorama                  0.4.6
coloredlogs               15.0.1
comm                      0.2.2
confection                0.1.4
contourpy                 1.2.1
cryptography              42.0.8
cycler                    0.12.1
cymem                     2.0.8
dataclasses-json          0.6.7
debugpy                   1.8.2
decorator                 5.1.1
deepdiff                  7.0.1
defusedxml                0.7.1
einops                    0.8.0
emoji                     2.12.1
entrypoints               0.4
exceptiongroup            1.2.0
executing                 2.0.1
fastai                    2.7.15
fastcore                  1.5.48
fastdownload              0.0.7
fastjsonschema            2.20.0
fastprogress              1.0.3
filelock                  3.15.4
filetype                  1.2.0
flash-attn                2.6.3
Flask                     3.0.3
flatbuffers               24.3.25
fonttools                 4.53.1
fqdn                      1.5.1
frozenlist                1.4.1
fsspec                    2024.6.1
gast                      0.5.5
gmpy2                     2.1.5
google-auth               2.31.0
google-auth-oauthlib      1.0.0
google-pasta              0.2.0
grpcio                    1.54.3
h11                       0.14.0
h2                        4.1.0
h5py                      3.11.0
hpack                     4.0.0
httpcore                  1.0.5
httpx                     0.27.0
huggingface-hub           0.24.3
humanfriendly             10.0
hyperframe                6.0.1
idna                      3.7
importlib_metadata        8.0.0
importlib_resources       6.4.0
iopath                    0.1.10
ipykernel                 6.29.5
ipython                   8.26.0
ipywidgets                8.1.3
isoduration               20.11.0
itsdangerous              2.2.0
jedi                      0.19.1
Jinja2                    3.1.4
joblib                    1.4.2
json5                     0.9.25
jsonpath-python           1.0.6
jsonpointer               3.0.0
jsonschema                4.22.0
jsonschema-specifications 2023.12.1
jupyter                   1.0.0
jupyter_client            8.6.2
jupyter-console           6.6.3
jupyter_core              5.7.2
jupyter-events            0.10.0
jupyter-lsp               2.2.5
jupyter_server            2.14.1
jupyter_server_terminals  0.5.3
jupyterlab                4.2.3
jupyterlab_pygments       0.3.0
jupyterlab_server         2.27.2
jupyterlab_widgets        3.0.11
keras                     2.14.0
kiwisolver                1.4.5
langcodes                 3.4.0
langdetect                1.0.9
language_data             1.2.0
layoutparser              0.3.4
lxml                      5.2.2
marisa-trie               1.1.0
Markdown                  3.6
markdown-it-py            3.0.0
MarkupSafe                2.1.5
marshmallow               3.21.3
matplotlib                3.8.4
matplotlib-inline         0.1.7
mdurl                     0.1.2
mistune                   3.0.2
ml-dtypes                 0.2.0
mpmath                    1.3.0
multidict                 6.0.5
munkres                   1.1.4
murmurhash                1.0.10
mypy-extensions           1.0.0
nbclient                  0.10.0
nbconvert                 7.16.4
nbformat                  5.10.4
nest_asyncio              1.6.0
networkx                  3.3
nltk                      3.8.1
notebook                  7.2.1
notebook_shim             0.2.4
numexpr                   2.10.0
numpy                     1.26.4
oauthlib                  3.2.2
onnx                      1.16.1
onnxruntime               1.18.1
opencv-python             4.10.0.84
opt-einsum                3.3.0
ordered-set               4.1.0
overrides                 7.7.0
packaging                 24.1
pandas                    2.1.1
pandocfilters             1.5.0
parso                     0.8.4
pdf2image                 1.17.0
pdfminer.six              20231228
pdfplumber                0.11.2
pexpect                   4.9.0
pickleshare               0.7.5
pillow                    10.3.0
pillow_heif               0.18.0
pip                       24.0
pkgutil_resolve_name      1.3.10
platformdirs              4.2.2
ply                       3.11
portalocker               2.10.1
preshed                   3.0.9
prometheus_client         0.20.0
prompt_toolkit            3.0.47
protobuf                  4.21.12
psutil                    6.0.0
ptyprocess                0.7.0
pure-eval                 0.2.2
pyasn1                    0.6.0
pyasn1_modules            0.4.0
pycparser                 2.22
pydantic                  2.8.2
pydantic_core             2.20.1
Pygments                  2.18.0
PyJWT                     2.8.0
pyOpenSSL                 24.0.0
pyparsing                 3.1.2
pypdf                     4.3.1
pypdfium2                 4.30.0
PyQt5                     5.15.9
PyQt5-sip                 12.12.2
PySocks                   1.7.1
pytesseract               0.3.10
python-dateutil           2.9.0
python-iso639             2024.4.27
python-json-logger        2.0.7
python-magic              0.4.27
python-multipart          0.0.9
pytz                      2024.1
pyu2f                     0.1.5
PyYAML                    6.0.1
pyzmq                     26.0.3
qtconsole                 5.5.2
QtPy                      2.4.1
rapidfuzz                 3.9.5
referencing               0.35.1
regex                     2024.7.24
requests                  2.32.3
requests-oauthlib         2.0.0
requests-toolbelt         1.0.0
rfc3339-validator         0.1.4
rfc3986-validator         0.1.1
rich                      13.7.1
rpds-py                   0.18.1
rsa                       4.9
safetensors               0.4.3
scikit-learn              1.5.1
scipy                     1.14.0
Send2Trash                1.8.3
sentencepiece             0.2.0
setuptools                70.1.1
shellingham               1.5.4
sip                       6.7.12
six                       1.16.0
smart_open                7.0.4
sniffio                   1.3.1
soupsieve                 2.5
spacy                     3.7.5
spacy-legacy              3.0.12
spacy-loggers             1.0.5
srsly                     2.4.8
stack-data                0.6.2
sympy                     1.12.1
tabulate                  0.9.0
tensorboard               2.14.1
tensorboard-data-server   0.7.0
tensorflow                2.14.0
tensorflow-estimator      2.14.0
termcolor                 2.4.0
terminado                 0.18.1
thinc                     8.2.3
threadpoolctl             3.5.0
timm                      1.0.8
tinycss2                  1.3.0
tokenizers                0.15.2
toml                      0.10.2
tomli                     2.0.1
torch                     2.3.1
torchaudio                2.3.1
torchvision               0.18.1
tornado                   6.4.1
tqdm                      4.66.4
traitlets                 5.14.3
transformers              4.37.2
triton                    2.3.1
typer                     0.12.3
typer-slim                0.12.3
types-python-dateutil     2.9.0.20240316
typing_extensions         4.12.2
typing-inspect            0.9.0
typing-utils              0.1.0
tzdata                    2023.3
unstructured              0.15.0
unstructured-client       0.25.0
unstructured-inference    0.7.36
unstructured.pytesseract  0.3.12
uri-template              1.3.0
urllib3                   2.2.2
wasabi                    1.1.2
wcwidth                   0.2.13
weasel                    0.4.1
webcolors                 24.6.0
webencodings              0.5.1
websocket-client          1.8.0
Werkzeug                  3.0.3
wheel                     0.43.0
widgetsnbextension        4.0.11
wrapt                     1.14.1
yarl                      1.9.4
zipp                      3.19.2
zstandard                 0.22.0
(env) user@5cb2b72b-4b1d-4df3-8361-1cd4aff1550f:~$

Error traceback

(env) user@5cb2b72b-4b1d-4df3-8361-1cd4aff1550f:~$ python DataPipelineLLM.py
INFO:root:RSS: 693.44 MB
/home/user/mambaforge/envs/env/lib/python3.11/site-packages/huggingface_hub/file_download.py:1150: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:transformers_modules.OpenGVLab.InternVL-Chat-V1-5.7e2100537fee41c08e1d0c1436179328dfc4396d.configuration_internvl_chat:vision_select_layer: -1
INFO:transformers_modules.OpenGVLab.InternVL-Chat-V1-5.7e2100537fee41c08e1d0c1436179328dfc4396d.configuration_internvl_chat:ps_version: v2
INFO:transformers_modules.OpenGVLab.InternVL-Chat-V1-5.7e2100537fee41c08e1d0c1436179328dfc4396d.configuration_internvl_chat:min_dynamic_patch: 1
INFO:transformers_modules.OpenGVLab.InternVL-Chat-V1-5.7e2100537fee41c08e1d0c1436179328dfc4396d.configuration_internvl_chat:max_dynamic_patch: 12
INFO:transformers_modules.OpenGVLab.InternVL-Chat-V1-5.7e2100537fee41c08e1d0c1436179328dfc4396d.modeling_internvl_chat:num_image_token: 256
INFO:transformers_modules.OpenGVLab.InternVL-Chat-V1-5.7e2100537fee41c08e1d0c1436179328dfc4396d.modeling_internvl_chat:ps_version: v2
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.49it/s]
 * Serving Flask app 'DataPipelineLLM'
 * Debug mode: off
INFO:werkzeug:WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.122.135:5000
INFO:werkzeug:Press CTRL+C to quit
INFO:unstructured:PDF text extraction failed, skip text extraction...

INFO:root:Extracted text: 
INFO:root:Input IDs shape: torch.Size([1, 1])
ERROR:DataPipelineLLM:Exception on /process [POST]
Traceback (most recent call last):
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/flask/app.py", line 1473, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/flask/app.py", line 882, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/flask/app.py", line 880, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/flask/app.py", line 865, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)  # type: ignore[no-any-return]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/DataPipelineLLM.py", line 171, in process_file
    data = process_pdf(file_path)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/DataPipelineLLM.py", line 133, in process_pdf
    processed_text = generate_text(text)
                     ^^^^^^^^^^^^^^^^^^^
  File "/home/user/DataPipelineLLM.py", line 111, in generate_text
    outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL-Chat-V1-5/7e2100537fee41c08e1d0c1436179328dfc4396d/modeling_internvl_chat.py", line 321, in generate
    vit_embeds = self.extract_feature(pixel_values)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL-Chat-V1-5/7e2100537fee41c08e1d0c1436179328dfc4396d/modeling_internvl_chat.py", line 181, in extract_feature
    vit_embeds = self.vision_model(
                 ^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/mambaforge/envs/env/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/user/.cache/huggingface/modules/transformers_modules/OpenGVLab/InternVL-Chat-V1-5/7e2100537fee41c08e1d0c1436179328dfc4396d/modeling_intern_vit.py", line 418, in forward
    raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
ValueError: wrong pixel_values size: torch.Size([1, 1])
INFO:werkzeug:127.0.0.1 - - [31/Jul/2024 09:16:02] "POST /process HTTP/1.1" 500 -

OpenGVLab / InternVL

ValueError: wrong pixel_values size: torch.Size([1, 1]) when running DataPipelineLLM.py. [Bug] #438

Checklist

Describe the bug

Reproduction

Environment

Error traceback