Issue closed by TzuHanLiang 3 weeks ago.
import os
import torch
import matplotlib.pyplot as plt
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
# Select GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters and data location.
data_dir = "/home/twsvhmw558/esg_summerize_data"  # NOTE(review): "summerize" looks misspelled — confirm this matches the directory on disk
model_name = "gpt2"
epochs = 20 # increased number of training epochs
batch_size = 2
learning_rate = 5e-5
# 1. Load the pretrained model and tokenizer (downloaded on first run).
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
# GPT-2 has no pad token by default; register one, then resize the token
# embedding matrix so the new id has an embedding row. Order matters:
# resize must happen AFTER add_special_tokens.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.train()
# 2. Read data: for every .txt file under data_dir, tokenize each
# blank-line-separated paragraph and slice it into overlapping fixed-size
# token windows (window_size tokens, advancing by stride), padding short
# windows with the pad token.
train_texts = []
window_size = 256
stride = 128
for filename in os.listdir(data_dir):
    if not filename.endswith(".txt"):
        continue
    # BUG FIX: the original printed the literal text "(unknown)" instead
    # of the file being read.
    print(f"Reading {filename}...")
    with open(os.path.join(data_dir, filename), "r", encoding="utf-8") as f:
        text = f.read()
    for paragraph in text.split('\n\n'):
        if not paragraph.strip():
            continue  # skip empty paragraphs: tokenizing "" yields no usable window
        tokenized_paragraph = tokenizer(paragraph, truncation=False, return_tensors="pt")
        # squeeze(0) (not squeeze()) so a single-token paragraph stays 1-D;
        # a bare squeeze() would produce a 0-dim tensor and len() would raise.
        input_ids = tokenized_paragraph["input_ids"].squeeze(0)
        for i in range(0, len(input_ids), stride):
            window_input_ids = input_ids[i:i + window_size]
            if len(window_input_ids) < window_size:
                padding_length = window_size - len(window_input_ids)
                padding = torch.full((padding_length,), tokenizer.pad_token_id, dtype=torch.long)
                window_input_ids = torch.cat([window_input_ids, padding])
            train_texts.append(window_input_ids)
print(f"Total paragraphs tokenized: {len(train_texts)}")
# Plain copy of the collected windows (the original used a no-op comprehension).
input_ids_list = list(train_texts)
class TextDataset(Dataset):
    """Map-style dataset over pre-tokenized windows.

    Each sample is a dict with a single ``"input_ids"`` entry, which is
    the format the training loop below expects from the DataLoader.
    """

    def __init__(self, input_ids):
        # Hold a reference to the pre-built samples; no copying is done.
        self._samples = input_ids

    def __len__(self):
        return len(self._samples)

    def __getitem__(self, idx):
        return {"input_ids": self._samples[idx]}
# Wrap the token windows in a Dataset and fail fast if nothing was read,
# so we do not silently "train" on zero samples.
dataset = TextDataset(input_ids_list)
print(f"Total samples in dataset: {len(dataset)}")
if not len(dataset):
    raise ValueError("The dataset is empty. Please check the data directory and files.")
# Shuffled mini-batches plus a standard AdamW optimizer for fine-tuning.
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Per-epoch average losses, collected for the plot at the end.
loss_values = []
# 3. Fine-tune. FIX vs original: the original passed the raw input_ids as
# labels with no attention mask, so [PAD] positions were both attended to
# and included in the loss. Here pad positions are masked out of attention
# and set to -100 in the labels (the Hugging Face convention for tokens
# excluded from the cross-entropy loss).
pad_id = tokenizer.pad_token_id
for epoch in range(epochs):
    print(f"Epoch: {epoch+1}/{epochs}")
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = (input_ids != pad_id).long()
        labels = input_ids.clone()
        labels[labels == pad_id] = -100  # -100 positions are ignored by the LM loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        print(f"Batch Loss: {loss.item()}")
    average_epoch_loss = epoch_loss / len(train_loader)
    loss_values.append(average_epoch_loss)
    print(f"Epoch: {epoch+1}/{epochs}, Average Loss: {average_epoch_loss}")
# 4. Persist the fine-tuned weights and the (pad-token-extended) tokenizer.
model.save_pretrained("/home/twsvhmw558/train_result/saved_model")
tokenizer.save_pretrained("/home/twsvhmw558/train_result/saved_tokenizer")
# Plot the per-epoch average loss and save the figure.
plt.plot(loss_values)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Time')
plt.savefig("/home/twsvhmw558/train_result/loss_plot.png")
# Reload the just-saved artifacts from disk (round-trip sanity check —
# `model`/`tokenizer` are already in memory) and switch to inference mode.
model = GPT2LMHeadModel.from_pretrained("/home/twsvhmw558/train_result/saved_model").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("/home/twsvhmw558/train_result/saved_tokenizer")
model.eval()
# 5. Generate: seed the model with a structured ESG summary (in Chinese)
# and sample a continuation.
input_text = (
    "2023年和桐的環境管理狀況如下:\n"
    "空汙管理:\n"
    "排放源:工廠排放、運輸排放\n"
    "排放量:5000噸\n"
    "排放強度:每噸產品0.5噸\n"
    "減排措施:安裝濾網、使用低排放燃料\n\n"
    "能源管理:\n"
    "能源消耗:50000 MWh\n"
    "可再生能源使用:20%\n"
    "能源效率提高措施:更換高效能設備\n\n"
    "污水管理:\n"
    "污水處理量:30000立方米\n"
    "處理技術:生物處理、物理處理\n"
    "減少污染措施:使用環保材料\n\n"
    "溫室氣體排放:\n"
    "總排放量:100000噸\n"
    "減排目標:每年減少5%\n"
    "實際減排數據:今年減少了5000噸\n\n"
    "社會責任:\n"
    "勞資權益:員工培訓、性別多元化\n"
    "客戶、供應商管理:供應商篩選、客戶滿意度調查\n"
    "隱私資料保護:數據保護政策、數據泄露應對措施\n"
    "社區關係:社區投資、志願服務\n\n"
    "公司治理:\n"
    "股東權利:股東大會、股東投票\n"
    "商業道德倫理:反貪腐政策、道德行為準則\n"
    "供應鏈管理:供應鏈透明度、供應鏈風險管理\n"
    "商業行為透明:財務透明度、業務透明度"
)
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
# encode() of raw text never emits pad tokens, so this mask is all ones;
# it is passed explicitly to silence generate()'s missing-mask warning.
attention_mask = (input_ids != tokenizer.pad_token_id).long().to(device)
with torch.no_grad():
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=1000,  # total length INCLUDING the prompt tokens
        do_sample=True,
        temperature=1.0,  # sampling temperature (tuned)
        top_k=50,
        top_p=0.9,  # nucleus-sampling threshold (tuned)
        pad_token_id=tokenizer.eos_token_id
    )
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
# Save the sampled continuation for later inspection.
with open("/home/twsvhmw558/train_result/generated_text.txt", "w", encoding="utf-8") as f:
    f.write(generated_text)
2023年和桐的環境管理狀況如下:
空汙管理:
排放源:工廠排放、運輸排放
排放量:5000噸
排放強度:每噸產品0.5噸
減排措施:安裝濾網、使用低排放燃料
能源管理:
能源消耗:50000 MWh
可再生能源使用:20%
能源效率提高措施:更換高效能設備
污水管理:
污水處理量:30000立方米
處理技術:生物處理、物理處理
減少污染措施:使用環保材料
溫室氣體排放:
總排放量:100000噸
減排目標:每年減少5%
實際減排數據:今年減少了5000噸
社會責任:
勞資權益:員工培訓、性別多元化
客戶、供應商管理:供應商篩選、客戶滿意度調查
隱私資料保護:數據保護政策、數據泄露應對措施
社區關係:社區投資、志願服務
公司治理:
股東權利:股東大會、股東投票
商業道德倫理:反貪腐政策、道德行為準則
供應鏈管理:供應鏈透明度、供應鏈風險管理
商業行為透明:財務透明度、業務透明度環境評估,增強供應鏈管理系統審查,確保護和排放綠色產。公司治理方面,並應鏈穩定性和並加強公司積極投綠色產。
Training took about 5 hours.
在此容器中安裝 llama,參考資料 Easy Guide to Installing LLaMa 3 by Meta
跟 #167 只有 CPU 比起來加上 GPU 快了4倍
done