TODOs:
from transformers import AutoModel
import numpy as np
from PIL import Image
import torch
import os
# Placeholder paths of the page images to process; point these at real files.
images = ["path_to_image1.jpg", "path_to_image2.png"]
def read_image_as_np_array(image_path):
    """Load the image stored at ``image_path`` and return it as a NumPy array.

    The image is converted to greyscale (mode "L") and then back to mode
    "RGB" before being turned into an array, so any colour information in
    the file is discarded.
    """
    with open(image_path, "rb") as handle:
        greyscale = Image.open(handle).convert("L")
        rgb = greyscale.convert("RGB")
        return np.array(rgb)
# Replace every path in `images` with its decoded pixel array.
images = list(map(read_image_as_np_array, images))

# trust_remote_code=True is needed because the model class lives in the Hub
# repository rather than in transformers itself; .cuda() moves it to the GPU.
model = AutoModel.from_pretrained("ragavsachdeva/magi", trust_remote_code=True)
model = model.cuda()

# Inference only: no gradients are needed.
with torch.no_grad():
    results = model.predict_detections_and_associations(images)
    # The OCR step takes, per image, the "texts" entry of its detection result.
    text_bboxes_for_all_images = [page["texts"] for page in results]
    ocr_results = model.predict_ocr(images, text_bboxes_for_all_images)

# Write one annotated image and one transcript file per input image.
for i, image in enumerate(images):
    model.visualise_single_image_prediction(image, results[i], filename=f"image_{i}.png")
    model.generate_transcript_for_single_image(results[i], ocr_results[i], filename=f"transcript_{i}.txt")
from PIL import Image
import numpy as np
from transformers import AutoModel
import torch
# Load the v2 checkpoint (its model code ships with the Hub repository, hence
# trust_remote_code=True), then move it to the GPU and switch to eval mode.
model = AutoModel.from_pretrained("ragavsachdeva/magiv2", trust_remote_code=True)
model = model.cuda().eval()
def read_image(path_to_image):
    """Return the image at ``path_to_image`` as a NumPy array.

    The file is opened in binary mode, decoded with PIL, reduced to greyscale
    (mode "L") and converted back to mode "RGB" before the array is built.
    """
    with open(path_to_image, "rb") as stream:
        pixels = np.array(Image.open(stream).convert("L").convert("RGB"))
    return pixels
# Paths of the chapter's page images, in reading order.
# The trailing "..." placeholders of the original example were not valid
# Python (a string literal directly followed by an Ellipsis is a SyntaxError),
# so they are expressed as comments here.
chapter_pages = [
    "page1.png",
    "page2.png",
    "page3.png",
    # ... add the remaining pages of the chapter here
]

# Character bank: "images" and "names" are parallel lists, i.e. names[k] is
# the name given to the character shown in images[k].
character_bank = {
    "images": [
        "char1.png",
        "char2.png",
        "char3.png",
        "char4.png",
        # ... add further character images here
    ],
    "names": [
        "Luffy",
        "Sanji",
        "Zoro",
        "Ussop",
        # ... add the matching names here, in the same order as "images"
    ],
}
# Decode the page paths and the character-bank image paths into pixel arrays,
# overwriting the path lists in place.
chapter_pages = list(map(read_image, chapter_pages))
character_bank["images"] = list(map(read_image, character_bank["images"]))

# Inference only: run the chapter-wide prediction without tracking gradients.
with torch.no_grad():
    per_page_results = model.do_chapter_wide_prediction(
        chapter_pages,
        character_bank,
        use_tqdm=True,
        do_ocr=True,
    )
# Collect one "<speaker>: text" line per essential text box across all pages,
# saving an annotated copy of each page along the way.
transcript = []
for i, (image, page_result) in enumerate(zip(chapter_pages, per_page_results)):
    model.visualise_single_image_prediction(image, page_result, f"page_{i}.png")

    # Map text index -> character name, using the (text_idx, char_idx) pairs
    # in "text_character_associations".
    speaker_name = {
        text_idx: page_result["character_names"][char_idx]
        for text_idx, char_idx in page_result["text_character_associations"]
    }

    for j, text in enumerate(page_result["ocr"]):
        # Skip texts that the model did not flag as essential.
        if not page_result["is_essential_text"][j]:
            continue
        # Texts with no associated character are attributed to "unsure".
        name = speaker_name.get(j, "unsure")
        transcript.append(f"<{name}>: {text}")

# Write with an explicit encoding so the output does not depend on the
# platform's locale default (which can fail on non-ASCII OCR text or names).
with open("transcript.txt", "w", encoding="utf-8") as fh:
    for line in transcript:
        fh.write(line + "\n")
Disclaimer: In adherence to copyright regulations, we are unable to publicly distribute the manga images that we've collected. The test images, however, are available freely, publicly and officially on Manga Plus by Shueisha.
The provided models and datasets are available for academic research purposes only.
@InProceedings{magiv1,
author = {Sachdeva, Ragav and Zisserman, Andrew},
title = {The Manga Whisperer: Automatically Generating Transcriptions for Comics},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2024},
pages = {12967-12976}
}
@misc{magiv2,
author={Ragav Sachdeva and Gyungin Shin and Andrew Zisserman},
title={Tails Tell Tales: Chapter-Wide Manga Transcriptions with Character Names},
year={2024},
eprint={2408.00298},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2408.00298},
}