Closed Akshaybhure111 closed 1 week ago
You should use
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)
to init image_writer and md_writer
import os from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.pipe.OCRPipe import OCRPipe
model_list = [] pdf_file_name = "/content/sample_data/insulin_pump_pdf.pdf" # Replace with the actual PDF path
output_image_path, output_path = "output/images", "output" os.makedirs(output_image_path, exist_ok=True)
image_writer = DiskReaderWriter(output_image_path) md_writer = DiskReaderWriter(output_path)
reader_writer = DiskReaderWriter("") pdf_bytes = reader_writer.read(pdf_file_name)
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify() pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown( output_image_path, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD )
if isinstance(md_content, list): md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content)) else: md_writer.write_string(f"{pdf_file_name}.md", md_content)
import os from magic_pdf.data.data_reader_writer.filebase import FileBasedDataReader, FileBasedDataWriter from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.pipe.OCRPipe import OCRPipe
model_list = [] pdf_file_name = r"/content/sample_data/insulin_pump_pdf.pdf" # Replace with the actual PDF path
local_image_dir, local_md_dir = "output/images", "output" os.makedirs(local_image_dir, exist_ok=True)
image_writer = FileBasedDataWriter(local_image_dir) md_writer = FileBasedDataWriter(local_md_dir)
reader1 = FileBasedDataReader("") pdf_bytes = reader1.read(pdf_file_name)
pipe = OCRPipe(pdf_bytes, model_list,image_writer)
pipe.pipe_classify() pipe.pipe_analyze()
pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown( image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD )
if isinstance(md_content, list): md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content)) else: md_writer.write_string(f"{pdf_file_name}.md", md_content)
Could you please tell me what changes are needed in the code I provided above, and share the complete script so that I can run it successfully?
Description of the bug | 错误描述
Main Code
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.pipe.OCRPipe import OCRPipe
args
model_list = [] pdf_file_name = r"/content/sample_data/insulin_pump_pdf.pdf" # replace with the real pdf path
prepare env
local_image_dir, local_md_dir = "output/images", "output" os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( local_md_dir ) # create 00 image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("") pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify() pipe.pipe_analyze() pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown( image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD )
if isinstance(md_content, list): md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content)) else: md_writer.write_string(f"{pdf_file_name}.md", md_content)
When I run the above script, I get the error below:
6 frames /usr/local/lib/python3.10/dist-packages/magic_pdf/libs/pdf_image_tools.py in cut_image(bbox, page_num, page, return_path, imageWriter) 29 byte_data = pix.tobytes(output='jpeg', jpg_quality=95) 30 ---> 31 imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN) 32 33 return img_hash256_path
TypeError: FileBasedDataWriter.write() takes 3 positional arguments but 4 were given
I checked your implementation: FileBasedDataWriter.write() accepts only two arguments, but line 31 of pdf_image_tools.py calls imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN). The error indicates that AbsReaderWriter.MODE_BIN is the extra argument. Please check and resolve this.
How to reproduce the bug | 如何复现
Main Code
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode from magic_pdf.pipe.OCRPipe import OCRPipe
args
model_list = [] pdf_file_name = r"/content/sample_data/insulin_pump_pdf.pdf" # replace with the real pdf path
prepare env
local_image_dir, local_md_dir = "output/images", "output" os.makedirs(local_image_dir, exist_ok=True)
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter( local_md_dir ) # create 00 image_dir = str(os.path.basename(local_image_dir))
reader1 = FileBasedDataReader("") pdf_bytes = reader1.read(pdf_file_name) # read the pdf content
pipe = OCRPipe(pdf_bytes, model_list, image_writer)
pipe.pipe_classify() pipe.pipe_analyze() pipe.pipe_parse()
pdf_info = pipe.pdf_mid_data["pdf_info"]
md_content = pipe.pipe_mk_markdown( image_dir, drop_mode=DropMode.NONE, md_make_mode=MakeMode.MM_MD )
if isinstance(md_content, list): md_writer.write_string(f"{pdf_file_name}.md", "\n".join(md_content)) else: md_writer.write_string(f"{pdf_file_name}.md", md_content)
When I run the above script, I get the error below:
6 frames /usr/local/lib/python3.10/dist-packages/magic_pdf/libs/pdf_image_tools.py in cut_image(bbox, page_num, page, return_path, imageWriter) 29 byte_data = pix.tobytes(output='jpeg', jpg_quality=95) 30 ---> 31 imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN) 32 33 return img_hash256_path
TypeError: FileBasedDataWriter.write() takes 3 positional arguments but 4 were given
I checked your implementation: FileBasedDataWriter.write() accepts only two arguments, but line 31 of pdf_image_tools.py calls imageWriter.write(byte_data, img_hash256_path, AbsReaderWriter.MODE_BIN). The error indicates that AbsReaderWriter.MODE_BIN is the extra argument. Please check and resolve this.
Operating system | 操作系统
Linux
Python version | Python 版本
3.10
Software version | 软件版本 (magic-pdf --version)
0.9.x
Device mode | 设备模式
cuda