Why does the model on the Hugging Face demo produce different results from local inference?

Hey, thanks for your work, but I have a question: when I use the demo on Hugging Face, it produces a different result than when I run the model locally. I use beam search with num_beams = 3 and repetition_penalty = 1.2, with the same prompt and the same input, and the weights from https://huggingface.co/openbmb/MiniCPM-V-2, and I saw that the result from the Hugging Face demo is better than the local one.

My GPU does not have enough memory to load the weights, so I use accelerate and the tutorial at https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md to load the model and run inference.

Below is the script I am using:

# test.py
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from accelerate import dispatch_model, infer_auto_device_map

def load_model(checkpoint, weights_path=None):
    """Instantiate the model without weights, shard it across GPUs, then load the weights.

    Args:
        checkpoint: Model id or local directory passed to
            ``AutoConfig.from_pretrained`` to build the (empty) model.
        weights_path: Local directory (or file) holding the weights to load.
            When omitted, ``checkpoint`` is used if it is an existing local
            directory; otherwise the previously hard-coded
            MiniCPM-Llama3-V-2_5 snapshot directory is used, so existing
            callers keep their behavior.

    Returns:
        The model returned by ``accelerate.dispatch_model``.

    Side effects:
        Prints the model, both device maps and ``checkpoint``; calls
        ``torch.set_grad_enabled(False)``, which is a global switch and is
        not scoped to this model.
    """
    import os

    from accelerate import (
        dispatch_model,
        infer_auto_device_map,
        init_empty_weights,
        load_checkpoint_in_model,
    )
    from transformers import AutoConfig

    # Snapshot directory used when no local weights location can be derived
    # from the arguments.
    default_weights_path = "/root/.cache/huggingface/hub/models--openbmb--MiniCPM-Llama3-V-2_5/snapshots/21b10cdb728c15a5aa7c616732f049927aab1af3/"

    # Per-GPU budget for weights. Kept below the card's capacity because the
    # weights are not everything: intermediate activations also use GPU
    # memory (10GiB < 16GiB).
    max_memory_each_gpu = '10GiB'
    # GPUs to shard across; three device ids are listed here.
    gpu_device_ids = [0, 1, 2]
    # Module classes that must stay on a single device.
    no_split_module_classes = ["LlamaDecoderLayer"]
    max_memory = {
        device_id: max_memory_each_gpu for device_id in gpu_device_ids
    }

    config = AutoConfig.from_pretrained(
        checkpoint,
        trust_remote_code=True
    )

    # Build the module tree without allocating real parameter storage.
    with init_empty_weights():
        model = AutoModel.from_config(
            config,
            torch_dtype=torch.float16,
            trust_remote_code=True
        )
    print(model)
    device_map = infer_auto_device_map(
        model,
        max_memory=max_memory, no_split_module_classes=no_split_module_classes
    )

    print("auto determined device_map", device_map)
    # Manual placement overrides applied on top of the inferred map.
    # NOTE(review): presumably these keep the input embeddings and lm_head on
    # one device and the `vpm`/`resampler` modules together, as in the linked
    # multi-GPU tutorial -- confirm the layer indices match the inferred map.
    device_map.update({
        "llm.model.embed_tokens": 0,
        "llm.model.layers.0": 0,
        "llm.model.layers.15": 1,
        "llm.model.layers.16": 1,
        "llm.model.layers.17": 1,
        "llm.model.layers.18": 1,
        "llm.lm_head": 0,
        "vpm": 1,
        "resampler": 1,
    })

    print("modified device_map", device_map)
    print(checkpoint)

    # Resolve where the weights come from. Using `checkpoint` when it is a
    # local directory keeps the config and the weights from the same model.
    if weights_path is None:
        if os.path.isdir(checkpoint):
            weights_path = checkpoint
        else:
            weights_path = default_weights_path

    load_checkpoint_in_model(
        model,
        weights_path,
        device_map=device_map)

    model = dispatch_model(
        model,
        device_map=device_map
    )

    # Inference only: turn off gradient tracking (global switch).
    torch.set_grad_enabled(False)

    return model

# Model id shared by the model config and the tokenizer so the two cannot drift apart.
CHECKPOINT = 'openbmb/MiniCPM-Llama3-V-2_5'

# The model has to be built before anything below can use it; without this
# call `model` is undefined and the next line raises NameError.
model = load_model(CHECKPOINT)
print(model.hf_device_map)
# NOTE(review): `device_map` is a model-loading option; presumably it has no
# effect on a tokenizer -- confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, trust_remote_code=True, device_map="auto")
print(tokenizer.chat_template)
model.eval()

image = Image.open('image_path.jpg').convert('RGB')

# Prompt asking for a fixed JSON schema extracted from the receipt image.
question = "extract data from this image receipt, return only json without any additional text or Markdown formatting with the following keys:\n`total_quantity`: total quantity of products in this invoice\n`time`: time of exporting this receipt, in timestamp format\n`type`: category of the merchant\n`total_money`: total amount in this invoice\n`products`: A list of objects containing the following keys (keys in the bracket): (`product_vat`: VAT applied to the product purchased in this invoice, `product_discount_money`: discount amount for the product in this invoice, `product_code`: code of the product purchased in this invoice, `product_discount_retail_money`: retail discount amount for the product, `product_original_price`: original amount to be paid for the product in this invoice (before any discounts or taxes), `product_discount_price`: price after discount applied to the product, `product_discount_wholesale_money`: wholesale discount amount for the product, `product_name`: name of the product purchased in this invoice, `product_total_original_money`: total original amount to be paid for the product in this invoice (before any discounts or taxes), `product_amount`: amount of the product purchased in this invoice, `product_unit_price`: price per unit of the product purchased in this invoice, `product_total_money`: total amount to be paid for the product in this invoice)\n`staff`: name or ID of the staff exporting this invoice, printed in this invoice\n`tax_number`: taxation number displayed on this receipt\n`name`: title of the store exporting the receipt\n`receipt_number`: serial number of this receipt\n`date`: date of exporting this receipt\nReturn blank if you unsure about any information"

msgs = [{'role': 'user', 'content': question}]

# Beam-search decoding (sampling disabled) with a repetition penalty.
# NOTE(review): `temperature` is passed together with `sampling=False`;
# presumably it is ignored in that mode -- confirm against the model's
# `chat` implementation when comparing with the hosted demo.
res = model.chat(
    image=image,
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=False,
    temperature=0.8,
    repetition_penalty=1.2,
    num_beams=3,
    max_new_tokens=4096
)
print("res", res)

OpenBMB / MiniCPM-V

Why does the model on the Hugging Face demo produce different results from local inference? #245