Closed TzuHanLiang closed 1 month ago
!pip install pymupdf striprtf
import glob
import os
import re

import fitz # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        without any separator.
    """
    # Open the document as a context manager so it is closed even when
    # extracting a page raises; the original never closed it.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with repeated ``+=``.
        return "".join(page.get_text() for page in doc)
def clean_text(text):
    """Normalize whitespace in extracted text while keeping line breaks.

    Every run of whitespace is collapsed to a single character: a newline
    when the run contains at least one line break, otherwise a single space.
    Previously a second ``\\s+`` substitution also swallowed the newlines, so
    the newline-collapsing step had no effect and the result was one long line.

    Args:
        text: Raw text to clean.

    Returns:
        The text with whitespace runs collapsed as described above.
    """
    def _collapse(match):
        # A run that spans a line break stays a line break; anything else
        # (spaces, tabs, ...) becomes one space.
        return '\n' if '\n' in match.group() else ' '

    # One pass over the text, so the two rules cannot undo each other.
    return re.sub(r'\s+', _collapse, text)
def save_text_to_file(text, output_path):
    """Write ``text`` to ``output_path`` as UTF-8, overwriting any existing file."""
    with open(output_path, mode='w', encoding='utf-8') as sink:
        sink.write(text)
def process_pdfs(pdf_directory, output_directory):
    """Convert every ``*.pdf`` in a directory to a cleaned ``.txt`` file.

    For each PDF the text is extracted with ``extract_text_from_pdf``,
    normalized with ``clean_text`` and written with ``save_text_to_file``.
    The output directory is created when it does not exist.

    Args:
        pdf_directory: Directory searched (non-recursively) for ``*.pdf``.
        output_directory: Directory that receives one ``.txt`` per PDF.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)
    # Sorted so files are processed in a deterministic order.
    pdf_files = sorted(glob.glob(os.path.join(pdf_directory, "*.pdf")))
    for pdf_file in pdf_files:
        text = extract_text_from_pdf(pdf_file)
        clean_text_data = clean_text(text)
        # Swap only the final extension; str.replace would also rewrite any
        # other ".pdf" inside the name and would miss an upper-case ".PDF".
        stem = os.path.splitext(os.path.basename(pdf_file))[0]
        output_file = os.path.join(output_directory, stem + '.txt')
        save_text_to_file(clean_text_data, output_file)
        print(f"Processed and saved: {output_file}")
pdf_directory = "/data/pdfs" # Replace with the actual path to the PDF files
output_directory = "/data/txts" # Replace with the actual path for the output text files
# Convert every PDF in pdf_directory to a cleaned .txt file in output_directory.
process_pdfs(pdf_directory, output_directory)
import re
import json
import os
import glob
def annotate_data(text):
    """
    Annotate key ESG figures found in ``text``.

    Each category has one regular expression whose first group captures the
    value and whose optional second group captures the unit. Results are
    grouped by category in the order the patterns are declared, and within a
    category in the order the matches appear in the text.

    Returns a list of dicts with the keys ``start``/``end`` (span of the
    captured value), ``text`` (the value), ``unit`` (empty string when the
    pattern has no unit group) and ``category``.
    """
    category_patterns = {
        '碳排放量': r'(\d{5,}\.\d+)\s*(t-CO2e)',
        '能源消耗量': r'能源消耗量為\s*([\d,]+)\s*(千瓦時)',
        '員工人數': r'員工\s*([\d,]+)\s*人',
        '營業收入': r'營業收入\s*([\d,\.]+)\s*(仟元)',
        '稅前淨利': r'稅前淨利\s*([\d,\.]+)\s*(仟元)',
        '稅後淨利': r'稅後淨利\s*([\d,\.]+)\s*(仟元)',
        '水資源使用': r'水資源使用量為\s*([\d,]+)\s*(立方米)',
        '社會貢獻': r'社會貢獻\s*([\d,]+)\s*元'
    }
    return [
        {
            'start': hit.start(1),
            'end': hit.end(1),
            'text': hit.group(1),
            # Patterns with a single group carry no unit.
            'unit': hit.group(2) if hit.re.groups > 1 else "",
            'category': label,
        }
        for label, expression in category_patterns.items()
        for hit in re.finditer(expression, text)
    ]
def process_txt_files(txt_directory, output_directory):
    """
    Annotate every ``*.txt`` file in a directory and save one JSON per file.

    Each file is read as UTF-8, passed to ``annotate_data``, and the resulting
    list is dumped to ``<name>.json`` in ``output_directory`` (created when it
    does not exist).

    Args:
        txt_directory: Directory searched (non-recursively) for ``*.txt``.
        output_directory: Directory that receives the JSON annotation files.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)
    # Sorted so files are processed in a deterministic order.
    txt_files = sorted(glob.glob(os.path.join(txt_directory, "*.txt")))
    for txt_file in txt_files:
        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()
        file_annotations = annotate_data(text)
        # Swap only the final extension; str.replace would also rewrite any
        # other ".txt" inside the name and would miss an upper-case ".TXT".
        stem = os.path.splitext(os.path.basename(txt_file))[0]
        output_file = os.path.join(output_directory, stem + '.json')
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(file_annotations, json_file, ensure_ascii=False, indent=4)
        print(f"Annotations saved to: {output_file}")
# Set the directory holding the .txt files and the directory for the output JSON files
txt_directory = "/data/txts" # Replace with the actual path to the .txt files
output_directory = "/data/annotations" # Replace with the actual path for the output JSON files
# Process the .txt files and save the annotation data
process_txt_files(txt_directory, output_directory)
Took 5 hours; done.
參考內容:
155