Open 202030481266 opened 3 weeks ago
Currently, there is no plan to preserve headings.
I actually found that python-docx works quite well. It can identify not only headings but also tables, and it lets me add Markdown tags, so that I can split the result with LangChain's MarkdownHeaderTextSplitter.
import os
from docx import Document
def _heading_level(style_name):
    """Return the heading level encoded in a Word style name, or None.

    Names such as ``Heading 1`` yield ``1``. Names that start with
    ``Heading`` but carry no plain positive number (``Heading``,
    ``Heading 1 Char``) yield ``None`` instead of raising ``ValueError``,
    so the caller can treat the paragraph as ordinary text.
    """
    if not style_name or not style_name.startswith('Heading'):
        return None
    suffix = style_name[len('Heading'):].strip()
    if not suffix.isdecimal():
        return None
    level = int(suffix)
    return level if level >= 1 else None


def _table_to_markdown(table):
    """Render a python-docx table as a Markdown table, or None if it has no rows.

    The first row is used as the header row. Whitespace inside each cell is
    collapsed to single spaces, and literal ``|`` characters are escaped so
    they cannot be mistaken for column separators.
    """
    rows = [
        [' '.join(cell.text.split()).replace('|', '\\|') for cell in row.cells]
        for row in table.rows
    ]
    if not rows:
        return None
    header, *body = rows
    lines = [
        '| ' + ' | '.join(header) + ' |',
        '| ' + ' | '.join(['---'] * len(header)) + ' |',
    ]
    lines.extend('| ' + ' | '.join(row) + ' |' for row in body)
    return '\n'.join(lines) + '\n'


def read_doc(file_path, image_folder):
    """Extract headings, paragraphs, tables and images from a .docx file.

    Args:
        file_path: Path of the .docx document, passed to ``Document``.
        image_folder: Directory the embedded images are written to; it is
            created if it does not exist.

    Returns:
        A list of dicts, grouped as: heading/paragraph items (document
        order), then table items, then image items.
        * ``{'type': 'heading', 'content': '## Title'}``
        * ``{'type': 'paragraph', 'content': text}``
        * ``{'type': 'table', 'content': markdown}`` plus a ``'title'`` key
          holding the nearest preceding heading text, when there is one.
        * ``{'type': 'image', 'path': filename}``
    """
    # Local imports keep this block self-contained; the file already depends
    # on python-docx through ``Document``.
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = Document(file_path)
    os.makedirs(image_folder, exist_ok=True)

    text_items = []
    table_items = []
    current_heading = None

    # Walk the body in document order so each table is paired with the
    # heading that actually precedes it, not with the last heading of the
    # whole document.
    for child in doc.element.body.iterchildren():
        if isinstance(child, CT_P):
            para = Paragraph(child, doc)
            text = para.text.strip()
            style = para.style
            level = _heading_level(style.name if style is not None else None)
            if level is not None:
                current_heading = text
                text_items.append(
                    {'type': 'heading', 'content': '#' * level + ' ' + text}
                )
            elif text:
                text_items.append({'type': 'paragraph', 'content': text})
        elif isinstance(child, CT_Tbl):
            markdown_table = _table_to_markdown(Table(child, doc))
            if markdown_table is None:
                # A table without rows has no header row to render.
                continue
            if current_heading:
                table_items.append(
                    {'type': 'table', 'title': current_heading, 'content': markdown_table}
                )
            else:
                table_items.append({'type': 'table', 'content': markdown_table})

    content_list = text_items + table_items

    # Handle images
    for rel in doc.part.rels.values():
        # External (linked) relationships have no embedded part to read.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        image_part = rel.target_part
        img_ext = image_part.partname.split(".")[-1]
        # The list grows by one per image, so this index stays unique.
        filename = f'{image_folder}/image{len(content_list) + 1}.{img_ext}'
        with open(filename, 'wb') as f:
            f.write(image_part.blob)
        content_list.append({'type': 'image', 'path': filename})

    return content_list
I actually found that python-docx works quite well. It can identify not only headings but also tables, and it lets me add Markdown tags, so that I can split the result with LangChain's MarkdownHeaderTextSplitter.
import os

from docx import Document


def _heading_level(style_name):
    """Return the heading level encoded in a Word style name, or None.

    Names such as ``Heading 1`` yield ``1``. Names that start with
    ``Heading`` but carry no plain positive number (``Heading``,
    ``Heading 1 Char``) yield ``None`` instead of raising ``ValueError``,
    so the caller can treat the paragraph as ordinary text.
    """
    if not style_name or not style_name.startswith('Heading'):
        return None
    suffix = style_name[len('Heading'):].strip()
    if not suffix.isdecimal():
        return None
    level = int(suffix)
    return level if level >= 1 else None


def _table_to_markdown(table):
    """Render a python-docx table as a Markdown table, or None if it has no rows.

    The first row is used as the header row. Whitespace inside each cell is
    collapsed to single spaces, and literal ``|`` characters are escaped so
    they cannot be mistaken for column separators.
    """
    rows = [
        [' '.join(cell.text.split()).replace('|', '\\|') for cell in row.cells]
        for row in table.rows
    ]
    if not rows:
        return None
    header, *body = rows
    lines = [
        '| ' + ' | '.join(header) + ' |',
        '| ' + ' | '.join(['---'] * len(header)) + ' |',
    ]
    lines.extend('| ' + ' | '.join(row) + ' |' for row in body)
    return '\n'.join(lines) + '\n'


def read_doc(file_path, image_folder):
    """Extract headings, paragraphs, tables and images from a .docx file.

    Args:
        file_path: Path of the .docx document, passed to ``Document``.
        image_folder: Directory the embedded images are written to; it is
            created if it does not exist.

    Returns:
        A list of dicts, grouped as: heading/paragraph items (document
        order), then table items, then image items.
        * ``{'type': 'heading', 'content': '## Title'}``
        * ``{'type': 'paragraph', 'content': text}``
        * ``{'type': 'table', 'content': markdown}`` plus a ``'title'`` key
          holding the nearest preceding heading text, when there is one.
        * ``{'type': 'image', 'path': filename}``
    """
    # Local imports keep the python-docx internals next to their only use.
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = Document(file_path)
    os.makedirs(image_folder, exist_ok=True)

    text_items = []
    table_items = []
    current_heading = None

    # Walk the body in document order so each table is paired with the
    # heading that actually precedes it, not with the last heading of the
    # whole document.
    for child in doc.element.body.iterchildren():
        if isinstance(child, CT_P):
            para = Paragraph(child, doc)
            text = para.text.strip()
            style = para.style
            level = _heading_level(style.name if style is not None else None)
            if level is not None:
                current_heading = text
                text_items.append(
                    {'type': 'heading', 'content': '#' * level + ' ' + text}
                )
            elif text:
                text_items.append({'type': 'paragraph', 'content': text})
        elif isinstance(child, CT_Tbl):
            markdown_table = _table_to_markdown(Table(child, doc))
            if markdown_table is None:
                # A table without rows has no header row to render.
                continue
            if current_heading:
                table_items.append(
                    {'type': 'table', 'title': current_heading, 'content': markdown_table}
                )
            else:
                table_items.append({'type': 'table', 'content': markdown_table})

    content_list = text_items + table_items

    # Handle images
    for rel in doc.part.rels.values():
        # External (linked) relationships have no embedded part to read.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        image_part = rel.target_part
        img_ext = image_part.partname.split(".")[-1]
        # The list grows by one per image, so this index stays unique.
        filename = f'{image_folder}/image{len(content_list) + 1}.{img_ext}'
        with open(filename, 'wb') as f:
            f.write(image_part.blob)
        content_list.append({'type': 'image', 'path': filename})

    return content_list
well done
Describe the bug A clear and concise description of what the bug is.
I used magic-doc to convert my docx document to markdown, but the headings did not preserve the # heading symbol. How do I preserve the markdown heading structure? 我用了magic-doc转换我的docx文档为markdown,但是其中的标题没有保留 # 标题符号,我该如何保留其中的markdown标题结构呢?
To Reproduce
Expected behavior
2.2非临床研究/已完成的临床研究结果概述
2.2.1非临床研究结果
2.2.1.1临床前药效学研究
Screenshots
Env