opendatalab / magic-doc

Apache License 2.0
305 stars 22 forks source link

It is desirable to preserve the header structure of the parsed md #32

Open 202030481266 opened 3 weeks ago

202030481266 commented 3 weeks ago

Describe the bug A clear and concise description of what the bug is.

I used magic-doc to convert my docx document to markdown, but the headings did not preserve the # heading symbol. How do I preserve the markdown heading structure? 我用了magic-doc转换我的docx文档为markdown,但是其中的标题没有保留 # 标题符号,我该如何保留其中的markdown标题结构呢?

To Reproduce

from magic_doc.docconv import DocConverter, S3Config
# Reproduction: build a converter with no S3 configuration and convert a
# local document, passing a conversion timeout of 300.
source_path = '/myfile/path'
doc_converter = DocConverter(s3_config=None)
markdown_content, time_cost = doc_converter.convert(source_path, conv_timeout=300)

Expected behavior

2.2非临床研究/已完成的临床研究结果概述

2.2.1非临床研究结果

2.2.1.1临床前药效学研究

Screenshots

image

Env

icecraft commented 3 weeks ago

Currently, there are no plans to preserve headings.

202030481266 commented 3 weeks ago

I actually found that python-docx works quite well. It can identify not only headings but also tables, and it lets me add tags, so that I can use LangChain's MarkdownHeaderTextSplitter to split the result.

import os
from docx import Document

def read_doc(file_path, image_folder):
    """Extract headings, paragraphs, tables and images from a .docx file.

    Args:
        file_path: Path of the document, passed straight to ``Document``.
        image_folder: Directory that receives the embedded images; it is
            created when missing.

    Returns:
        A list of dicts, emitted in three passes (paragraphs, then tables,
        then images), each carrying a ``'type'`` key:

        * ``'heading'``   -- ``'content'`` is the text prefixed with one
          ``#`` per heading level.
        * ``'paragraph'`` -- ``'content'`` is the stripped paragraph text.
        * ``'table'``     -- ``'content'`` is a Markdown table whose first
          row is used as the header; ``'title'`` is present only when a
          non-empty heading was seen.
        * ``'image'``     -- ``'path'`` is the file the image was written to.
    """
    doc = Document(file_path)
    content_list = []

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(image_folder, exist_ok=True)

    # Text of the most recent heading; used below as the table title.
    current_heading = None

    for para in doc.paragraphs:
        text = para.text.strip()
        # NOTE(review): guarding against a style without a name; confirm
        # whether python-docx can actually return None here.
        style_name = para.style.name or ''

        # Only styles of the form "Heading <n>" are numbered headings.
        # Names such as "Heading" or "Heading 1 Char" have no parseable
        # level and are treated as ordinary paragraphs instead of raising
        # ValueError from int().
        heading_level = 0
        if style_name.startswith('Heading'):
            suffix = style_name[len('Heading'):].strip()
            if suffix.isdecimal():
                heading_level = int(suffix)

        if heading_level > 0:
            current_heading = text
            content_list.append(
                {'type': 'heading', 'content': '#' * heading_level + ' ' + text}
            )
        elif text:
            content_list.append({'type': 'paragraph', 'content': text})

    # Tables are not part of doc.paragraphs, so they are handled in a
    # separate pass. NOTE: because this pass runs after the paragraph loop,
    # current_heading is the LAST heading of the document for every table,
    # and tables are appended after all paragraphs rather than in document
    # order.
    for table in doc.tables:
        table_data = [
            [
                # Collapse whitespace/newlines and escape pipes so a cell
                # cannot break out of its Markdown column.
                " ".join(cell.text.split()).replace('|', '\\|')
                for cell in row.cells
            ]
            for row in table.rows
        ]

        # A table without rows (or without cells) has no header row to
        # render; skip it instead of failing on table_data[0].
        if not table_data or not table_data[0]:
            continue

        header = table_data[0]
        rows = [header, ['---'] * len(header), *table_data[1:]]
        markdown_table = ''.join(
            '| ' + ' | '.join(row) + ' |\n' for row in rows
        )

        entry = {'type': 'table', 'content': markdown_table}
        if current_heading:
            entry['title'] = current_heading
        content_list.append(entry)

    # Images are found through the document part's relationships.
    for rel in doc.part.rels.values():
        # Externally linked targets have no part (and no blob) in the package.
        if getattr(rel, 'is_external', False):
            continue
        if "image" in rel.target_ref:
            img_bin = rel.target_part.blob
            img_ext = rel.target_part.partname.split(".")[-1]
            # The index is derived from the running list length, which keeps
            # file names unique within a single call.
            filename = f'{image_folder}/image{len(content_list) + 1}.{img_ext}'
            with open(filename, 'wb') as f:
                f.write(img_bin)
            content_list.append({'type': 'image', 'path': filename})

    return content_list
icecraft commented 3 weeks ago

I actually found that python-docx works quite well. It can identify not only headings but also tables, and it lets me add tags, so that I can use LangChain's MarkdownHeaderTextSplitter to split the result.

import os
from docx import Document

def read_doc(file_path, image_folder):
    """Extract headings, paragraphs, tables and images from a .docx file.

    Args:
        file_path: Path of the document, passed straight to ``Document``.
        image_folder: Directory that receives the embedded images; it is
            created when missing.

    Returns:
        A list of dicts, emitted in three passes (paragraphs, then tables,
        then images), each carrying a ``'type'`` key:

        * ``'heading'``   -- ``'content'`` is the text prefixed with one
          ``#`` per heading level.
        * ``'paragraph'`` -- ``'content'`` is the stripped paragraph text.
        * ``'table'``     -- ``'content'`` is a Markdown table whose first
          row is used as the header; ``'title'`` is present only when a
          non-empty heading was seen.
        * ``'image'``     -- ``'path'`` is the file the image was written to.
    """
    doc = Document(file_path)
    content_list = []

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(image_folder, exist_ok=True)

    # Text of the most recent heading; used below as the table title.
    current_heading = None

    for para in doc.paragraphs:
        text = para.text.strip()
        # NOTE(review): guarding against a style without a name; confirm
        # whether python-docx can actually return None here.
        style_name = para.style.name or ''

        # Only styles of the form "Heading <n>" are numbered headings.
        # Names such as "Heading" or "Heading 1 Char" have no parseable
        # level and are treated as ordinary paragraphs instead of raising
        # ValueError from int().
        heading_level = 0
        if style_name.startswith('Heading'):
            suffix = style_name[len('Heading'):].strip()
            if suffix.isdecimal():
                heading_level = int(suffix)

        if heading_level > 0:
            current_heading = text
            content_list.append(
                {'type': 'heading', 'content': '#' * heading_level + ' ' + text}
            )
        elif text:
            content_list.append({'type': 'paragraph', 'content': text})

    # Tables are not part of doc.paragraphs, so they are handled in a
    # separate pass. NOTE: because this pass runs after the paragraph loop,
    # current_heading is the LAST heading of the document for every table,
    # and tables are appended after all paragraphs rather than in document
    # order.
    for table in doc.tables:
        table_data = [
            [
                # Collapse whitespace/newlines and escape pipes so a cell
                # cannot break out of its Markdown column.
                " ".join(cell.text.split()).replace('|', '\\|')
                for cell in row.cells
            ]
            for row in table.rows
        ]

        # A table without rows (or without cells) has no header row to
        # render; skip it instead of failing on table_data[0].
        if not table_data or not table_data[0]:
            continue

        header = table_data[0]
        rows = [header, ['---'] * len(header), *table_data[1:]]
        markdown_table = ''.join(
            '| ' + ' | '.join(row) + ' |\n' for row in rows
        )

        entry = {'type': 'table', 'content': markdown_table}
        if current_heading:
            entry['title'] = current_heading
        content_list.append(entry)

    # Images are found through the document part's relationships.
    for rel in doc.part.rels.values():
        # Externally linked targets have no part (and no blob) in the package.
        if getattr(rel, 'is_external', False):
            continue
        if "image" in rel.target_ref:
            img_bin = rel.target_part.blob
            img_ext = rel.target_part.partname.split(".")[-1]
            # The index is derived from the running list length, which keeps
            # file names unique within a single call.
            filename = f'{image_folder}/image{len(content_list) + 1}.{img_ext}'
            with open(filename, 'wb') as f:
                f.write(img_bin)
            content_list.append({'type': 'image', 'path': filename})

    return content_list

well done