# Location of the picture to transcribe; replace with your local image path.
image_path = '/content/image42.png'

# Open the file and convert it to RGB mode before preprocessing.
image = Image.open(image_path).convert('RGB')

# Run the processor's image preprocessing step, requesting "pt" tensors in
# channels-first layout, and keep only the resulting `pixel_values`.
encoded_image = processor.image_processor(
    image,
    return_tensors="pt",
    data_format="channels_first",
)
pixel_values = encoded_image.pixel_values
# Generate LaTeX expression
# Generate token ids for the image. Gradient tracking is switched off because
# this is inference only.
with torch.no_grad():
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_length,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=4,
        # Keep the unknown token out of the generated sequence.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        # Needed so the result exposes `.sequences` below.
        return_dict_in_generate=True,
    )

# Decode the generated ids back to text and keep the first sequence.
sequence = processor.tokenizer.batch_decode(outputs.sequences)[0]

# Strip the special tokens that remain in the decoded string.
# The attribute is `bos_token`; the original `bostoken` raised AttributeError.
sequence = sequence.replace(
    processor.tokenizer.eos_token, ""
).replace(
    processor.tokenizer.pad_token, ""
).replace(
    processor.tokenizer.bos_token, ""
)
print(sequence)
This is the output for the given image:
\operatorname* { l i m } { x \to \infty } \frac { \frac { d } { d x } \left( e ^ { x } + - 2 \frac { 2 } { x } \right) } { \frac { d } { d x } x ^ { - 2 } }
The output should be $2 \times 10^{-3}$.
It also gives the same output for every other image that I upload. Kindly help.
# One import statement per line; the original had all four fused on a single
# line, which is a syntax error.
import torch
import requests  # not used in the visible snippet; kept in case other code relies on it
from PIL import Image
from transformers import AutoProcessor, VisionEncoderDecoderModel
# Load model & processor
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint onto the chosen device, plus the processor
# published under the same name.
model = VisionEncoderDecoderModel.from_pretrained(
    'hoang-quoc-trung/sumen-base'
).to(device)
processor = AutoProcessor.from_pretrained('hoang-quoc-trung/sumen-base')

# Use the tokenizer's BOS token as the decoder prompt. It is tokenized without
# extra special tokens so the prompt is exactly that one token string.
task_prompt = processor.tokenizer.bos_token
decoder_input_ids = processor.tokenizer(
    task_prompt,
    add_special_tokens=False,
    return_tensors="pt",
).input_ids
# Load image
# The original single-line form put everything after the inline `#` into the
# comment, so `image` and `pixel_values` were never assigned. Each statement
# now sits on its own line.
image_path = '/content/image42.png'  # replace with your local image path

# Open the file and convert it to RGB mode before preprocessing.
image = Image.open(image_path).convert('RGB')

# Preprocess the image, requesting "pt" tensors in channels-first layout, and
# keep only the `pixel_values` field of the result.
pixel_values = processor.image_processor(
    image,
    return_tensors="pt",
    data_format="channels_first",
).pixel_values
# Generate LaTeX expression
# Generate token ids for the image. Gradient tracking is switched off because
# this is inference only.
with torch.no_grad():
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_length,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=4,
        # Keep the unknown token out of the generated sequence.
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        # Needed so the result exposes `.sequences` below.
        return_dict_in_generate=True,
    )

# Decode the generated ids back to text and keep the first sequence.
sequence = processor.tokenizer.batch_decode(outputs.sequences)[0]

# Strip the special tokens that remain in the decoded string.
# The attribute is `bos_token`; the original `bostoken` raised AttributeError.
sequence = sequence.replace(
    processor.tokenizer.eos_token, ""
).replace(
    processor.tokenizer.pad_token, ""
).replace(
    processor.tokenizer.bos_token, ""
)
print(sequence)

# This is the output for the given image:
# \operatorname* { l i m } { x \to \infty } \frac { \frac { d } { d x } \left( e ^ { x } + - 2 \frac { 2 } { x } \right) } { \frac { d } { d x } x ^ { - 2 } }
The output should be $2 \times 10^{-3}$.
It also gives the same output for every other image that I upload. Kindly help.