CAFECA-IO / KnowledgeManagement

Creating, Sharing, Using and Managing the knowledge and information of CAFECA
https://mermer.com.tw/knowledge-management
MIT License
0 stars 1 forks source link

嘗試使用 TWCC 提供的 openAI 訓練一個可以生成 ESG 報告的模型 Part 1: 數據清理及標註 #163

Closed TzuHanLiang closed 1 month ago

TzuHanLiang commented 1 month ago

參考內容:

TzuHanLiang commented 1 month ago
Screenshot 2024-06-07 at 5 00 29 PM

code

Read PDF and clean

!pip install pymupdf striprtf

import glob
import os
import re

import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    document = fitz.open(pdf_path)
    # Join the per-page text in page order; offsets follow PyMuPDF's layout.
    return "".join(
        document.load_page(page_index).get_text()
        for page_index in range(len(document))
    )

def clean_text(text):
    """Collapse all whitespace in *text* into single spaces.

    First squeezes runs of newlines into one newline, then collapses every
    remaining whitespace run (spaces, tabs, and the surviving newlines)
    into a single space, so the result is a single-line string.

    Note: this snippet originally used ``re`` without importing it
    (``import re`` only appeared in the second snippet, after this
    function is already called) — the import is now at the top of the file.
    """
    text = re.sub(r'\n+', '\n', text)  # squeeze blank lines first
    text = re.sub(r'\s+', ' ', text)   # then collapse all whitespace to one space
    return text

def save_text_to_file(text, output_path):
    """Write *text* to *output_path*, UTF-8 encoded, overwriting any existing file."""
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)

def process_pdfs(pdf_directory, output_directory):
    """Extract and clean the text of every ``*.pdf`` in *pdf_directory*.

    Each PDF's cleaned text is saved into *output_directory* under the
    same base name with a ``.txt`` extension.
    """
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(output_directory, exist_ok=True)

    for pdf_file in glob.glob(os.path.join(pdf_directory, "*.pdf")):
        text = extract_text_from_pdf(pdf_file)
        cleaned = clean_text(text)

        # splitext swaps only the trailing extension; the original
        # str.replace('.pdf', '.txt') would also corrupt a '.pdf'
        # occurring earlier in the file name.
        base_name = os.path.splitext(os.path.basename(pdf_file))[0]
        output_file = os.path.join(output_directory, base_name + '.txt')
        save_text_to_file(cleaned, output_file)
        print(f"Processed and saved: {output_file}")

# Input/output locations for the PDF-cleaning step.
pdf_directory = "/data/pdfs"  # replace with the actual PDF directory
output_directory = "/data/txts"  # replace with the actual output directory

process_pdfs(pdf_directory, output_directory)

annotate

import re
import json
import os
import glob

def annotate_data(text):
    """Locate ESG key figures in *text* via category-specific regexes.

    Returns a list of dicts, one per match, each carrying the character
    span of the numeric capture group ('start'/'end'), the matched value
    ('text'), its unit ('unit', empty when the pattern has no unit
    group), and the ESG category name ('category').
    """
    patterns = {
        '碳排放量': r'(\d{5,}\.\d+)\s*(t-CO2e)',
        '能源消耗量': r'能源消耗量為\s*([\d,]+)\s*(千瓦時)',
        '員工人數': r'員工\s*([\d,]+)\s*人',
        '營業收入': r'營業收入\s*([\d,\.]+)\s*(仟元)',
        '稅前淨利': r'稅前淨利\s*([\d,\.]+)\s*(仟元)',
        '稅後淨利': r'稅後淨利\s*([\d,\.]+)\s*(仟元)',
        '水資源使用': r'水資源使用量為\s*([\d,]+)\s*(立方米)',
        '社會貢獻': r'社會貢獻\s*([\d,]+)\s*元'
    }

    annotations = []
    for category, pattern in patterns.items():
        for hit in re.finditer(pattern, text):
            captured = hit.groups()
            # Second capture group, when present, is the unit of measure.
            unit = captured[1] if len(captured) > 1 else ""
            start, end = hit.span(1)
            annotations.append({
                'start': start,
                'end': end,
                'text': captured[0],
                'unit': unit,
                'category': category,
            })
    return annotations

def process_txt_files(txt_directory, output_directory):
    """Annotate ESG key figures in every ``*.txt`` under *txt_directory*.

    For each text file, the annotations from annotate_data() are written
    as JSON to *output_directory* under the same base name with a
    ``.json`` extension.
    """
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(output_directory, exist_ok=True)

    for txt_file in glob.glob(os.path.join(txt_directory, "*.txt")):
        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()
        file_annotations = annotate_data(text)

        # splitext swaps only the trailing extension; the original
        # str.replace('.txt', '.json') would also corrupt a '.txt'
        # occurring earlier in the file name.
        base_name = os.path.splitext(os.path.basename(txt_file))[0]
        output_file = os.path.join(output_directory, base_name + '.json')
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(file_annotations, json_file, ensure_ascii=False, indent=4)
        # Log after the file is closed, not inside the with-block.
        print(f"Annotations saved to: {output_file}")

# Directories for the .txt sources and the output JSON annotations.
txt_directory = "/data/txts"  # replace with the actual .txt directory
output_directory = "/data/annotations"  # replace with the actual JSON output directory

# Process the .txt files and save the annotation data.
process_txt_files(txt_directory, output_directory)

Took about 5 hours; done.