CAFECA-IO / KnowledgeManagement

Creating, Sharing, Using and Managing the knowledge and information of CAFECA
https://mermer.com.tw/knowledge-management
MIT License

Attempting to use the OpenAI environment provided by TWCC to train a model that can generate ESG reports, Part 2: Training the Model #164

Closed TzuHanLiang closed 3 weeks ago

TzuHanLiang commented 1 month ago

Took 3 hrs.

TzuHanLiang commented 3 weeks ago

Environment setup

!pip install torch transformers safetensors datasets mlflow tqdm
!pip install --upgrade pip
!yes | pip uninstall protobuf cachetools google-auth
!pip install protobuf==3.20.1 cachetools==5.0.0 google-auth tensorboard google-api-core
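
Before training, a quick sanity check can confirm that the key packages import cleanly and whether a GPU is visible (a minimal sketch, not part of the original notebook):

import torch
import transformers

print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())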

Model training code

import json
import os
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset

def load_data(data_dir):
    data = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            with open(os.path.join(data_dir, filename), 'r') as file:
                file_data = json.load(file)
                # Filter out empty lists
                file_data = [item for item in file_data if item]
                data.extend(file_data)
    return data

data_dir = "/data/annotations"
training_data = load_data(data_dir)

# Inspect the format of training_data (print only a small sample to avoid flooding the output)
print("training_data sample:", training_data[:3])

# Initialize the tokenizer and model with num_labels=3
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define the classification label mapping
label_dict = {
    '稅前淨利': 0,  # pre-tax net income
    '員工人數': 1,  # number of employees
    '稅後淨利': 2   # after-tax net income
}

# Data preprocessing function
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for item in data:
        text = item['text']
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,  # truncate long texts so all sequences share the same length
            padding='max_length',
            return_tensors='pt'
        )
        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])
        labels.append(label_dict[item['category']])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels

# Process the data in batches to avoid running out of memory
batch_size = 100  # adjust the batch size as needed
input_ids = []
attention_masks = []
labels = []

for i in range(0, len(training_data), batch_size):
    batch_data = training_data[i:i+batch_size]
    batch_input_ids, batch_attention_masks, batch_labels = preprocess_data(batch_data)
    input_ids.append(batch_input_ids)
    attention_masks.append(batch_attention_masks)
    labels.append(batch_labels)

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.cat(labels, dim=0)

# Prepare the DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
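
# Note: the evaluation further down reuses this same dataset, so the reported
# "test loss" is computed on training data. A held-out split would be more
# honest; a minimal sketch (not used in this run):
# from torch.utils.data import random_split
# train_size = int(0.8 * len(dataset))
# train_set, test_set = random_split(dataset, [train_size, len(dataset) - train_size])
# dataloader = DataLoader(train_set, batch_size=8, shuffle=True)
# test_dataloader = DataLoader(test_set, batch_size=8)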

# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
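
# Note: this run keeps everything on the default (CPU) device. On a TWCC GPU
# instance, moving the model and each batch to CUDA would speed things up; a
# minimal sketch (assumed setup, not part of the original script):
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# (each batch tensor would then also need .to(device) inside the loop)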

model.train()
for epoch in range(4):  # train for 4 epochs
    for batch in dataloader:
        input_ids, attention_masks, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

# Evaluate the model
def evaluate_model(model, dataloader):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_masks, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(dataloader)
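
# In addition to the average loss, classification accuracy could be tracked
# inside the evaluation loop above (a sketch, not part of the original run):
# preds = torch.argmax(outputs.logits, dim=1)
# correct += (preds == labels).sum().item()
# total += labels.size(0)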

# Evaluate the model (note: test_dataloader below is built from the same
# training dataset, so this is not a true held-out test loss)
test_dataloader = DataLoader(dataset, batch_size=8)
test_loss = evaluate_model(model, test_dataloader)
print(f"Test Loss: {test_loss}")

# Use the model to generate an ESG report (as written, this returns a
# classification label rather than generated text; see the conclusion below)
def generate_esg_report(model, tokenizer, input_text):
    model.eval()
    inputs = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,  # keep the input within the model's 512-token limit
        padding='max_length',
        return_tensors='pt'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction

# Generate a report with the model
input_text = "Input company operational data here"
esg_report = generate_esg_report(model, tokenizer, input_text)
print(f"Generated ESG Report: {esg_report}")

Training results

Epoch: 0, Loss: 1.0285451412200928
Epoch: 1, Loss: 0.8701068162918091
Epoch: 2, Loss: 0.8251253962516785
Epoch: 3, Loss: 0.7425183057785034
Test Loss: 0.6843301057815552
Generated ESG Report: 1

Analysis of training results

  1. Training and test loss trend: the training loss decreases after each epoch, and the final test loss is lower still, which indicates the model is gradually learning and improving.

  2. Generated ESG report: "Generated ESG Report: 1" means the model's prediction for the input is label 1, i.e. "員工人數" (number of employees).

Conclusion

This is not the expected result: what the model produces here is a classification-label prediction, not a report. The likely cause is using the wrong kind of model, since this run used BERT (a classifier). Next we will try GPT-2 to generate continuous text.
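
As a pointer for that follow-up, text generation with GPT-2 via Hugging Face transformers looks roughly like the sketch below; the "gpt2" checkpoint and the English prompt are illustrative only, and a Chinese or finance-domain checkpoint would likely be a better fit:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Encode a prompt and sample a continuation
inputs = tokenizer("The company's ESG highlights for 2023 include", return_tensors="pt")
outputs = model.generate(
    inputs["input_ids"],
    max_length=100,
    do_sample=True,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token by default
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))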

Took 5 hrs. Done.