took 3hrs
!pip install torch transformers safetensors datasets mlflow tqdm
!pip install --upgrade pip
!pip uninstall -y protobuf cachetools google-auth
!pip install protobuf==3.20.1 cachetools==5.0.0 google-auth tensorboard google-api-core
import json
import os
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset

def load_data(data_dir):
    data = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            with open(os.path.join(data_dir, filename), 'r') as file:
                file_data = json.load(file)
                # Filter out empty lists
                file_data = [item for item in file_data if item]
                data.extend(file_data)
    return data
data_dir = "/data/annotations"
training_data = load_data(data_dir)

# Check the format of training_data
print("training_data", training_data)
# Initialize the tokenizer and model with num_labels=3
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Classification label dictionary
label_dict = {
    '稅前淨利': 0,  # pre-tax net income
    '員工人數': 1,  # number of employees
    '稅後淨利': 2   # after-tax net income
}
# Data preprocessing function
def preprocess_data(data):
    input_ids = []
    attention_masks = []
    labels = []
    for item in data:
        text = item['text']
        inputs = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,  # truncate texts longer than 512 tokens
            return_tensors='pt'
        )
        input_ids.append(inputs['input_ids'])
        attention_masks.append(inputs['attention_mask'])
        labels.append(label_dict[item['category']])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels
# Process the data in batches to avoid memory problems
batch_size = 100  # adjust the batch size to your environment
input_ids = []
attention_masks = []
labels = []
for i in range(0, len(training_data), batch_size):
    batch_data = training_data[i:i+batch_size]
    batch_input_ids, batch_attention_masks, batch_labels = preprocess_data(batch_data)
    input_ids.append(batch_input_ids)
    attention_masks.append(batch_attention_masks)
    labels.append(batch_labels)
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.cat(labels, dim=0)
# Prepare the DataLoader
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Train the model
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
model.train()
for epoch in range(4):  # train for 4 epochs
    for batch in dataloader:
        input_ids, attention_masks, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch}, Loss: {loss.item()}")
# Evaluate the model
def evaluate_model(model, dataloader):
    model.eval()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_masks, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(dataloader)
# Evaluate the model (note: the training set is reused here as a stand-in test set)
test_dataloader = DataLoader(dataset, batch_size=8)
test_loss = evaluate_model(model, test_dataloader)
print(f"Test Loss: {test_loss}")
# "Generate" an ESG report with the model (this actually returns a class index; see the note below)
def generate_esg_report(model, tokenizer, input_text):
    model.eval()
    inputs = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction

# Run the model on sample input
input_text = "Input company operational data here"
esg_report = generate_esg_report(model, tokenizer, input_text)
print(f"Generated ESG Report: {esg_report}")
Epoch: 0, Loss: 1.0285451412200928
Epoch: 1, Loss: 0.8701068162918091
Epoch: 2, Loss: 0.8251253962516785
Epoch: 3, Loss: 0.7425183057785034
Test Loss: 0.6843301057815552
Generated ESG Report: 1
The drop in training and test loss after each epoch shows the model is gradually learning and improving. The generated ESG report, however, is not the expected result: the output is just a predicted classification label. The likely cause is using the wrong kind of model. BERT was used here, so I will try GPT-2 next to generate continuous text.
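A minimal sketch of that follow-up, assuming the plain pretrained gpt2 checkpoint and a hypothetical prompt (fine-tuning on actual report text would still be needed for useful output):

from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_model.eval()

prompt = "ESG report summary: the company's pre-tax net income"
input_ids = gpt2_tokenizer.encode(prompt, return_tensors='pt')
with torch.no_grad():
    output_ids = gpt2_model.generate(
        input_ids,
        max_length=100,          # total length including the prompt
        do_sample=True,          # sample instead of greedy decoding
        top_p=0.9,
        pad_token_id=gpt2_tokenizer.eos_token_id
    )
print(gpt2_tokenizer.decode(output_ids[0], skip_special_tokens=True))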
took 5hrs done