Closed waaaaaaater closed 1 year ago
可否尝试下1.8.5版本训练下?
不行啊,我试了 pgl在1.8.5中就直接报错了 。好像不兼容
可否确认下softmax_with_cross_entropy中的labels是否有超过范围的异常数据呢?
另外,可否在CPU下训练试下,看能否正常训练?
loss, pred = L.softmax_with_cross_entropy(logits=output, label=labels, return_softmax=True)
静态图中怎么查看softmax_with_cross_entropy的数值是否超过范围了呢? 还没等执行到网络结束就报错了,没办法通过feed_List查看.
CPU上没办法运行。
可否提供下完整的复现代码和脚本,我这边帮忙看下。
没办法上传附件,就只能给您粘贴了。可以直接运行的,只需要起个对应的名字就行了
运行网路部分:
from build_model import ERModel
from build_gatmodel import GATModel as gat
from easydict import EasyDict as edict
import paddle.fluid as F
import paddle.fluid.layers as L
from pre_loadmodel import PretrainedModelLoader # 为了读取参数,可以在之后的内容中进行详细的分析
import pgl
import paddle
import yaml
import numpy as np
from pgl.graph_wrapper import BatchGraphWrapper
import pandas as pd
import re
import os
MAX_SEQUENCE = 64
HIDDEN_SIZE = 32
conf = 'erniesage_link_predict.yaml'
config = edict(yaml.load(open(conf), Loader=yaml.FullLoader))
#config.cls_id = 1
use_cuda = config.use_cuda
import re
import numpy as np
change_flag =1
# from test_file.test_token import DataSet as tokenC
import datetime
class Loader(object):
def __init__(self,config):
pass
def train_batch_reader(self):
start = 0
while start <int(50):
if 1==1:
# 构建了10个节点
this_feat = np.array(np.random.random((10,32)))
# 构建十条边
this_edges = []
for i in range(10):
for j in range(10):
this_edges.append([i,j])
this_feat = np.array(this_feat, dtype="float32")
g = pgl.graph.Graph(
num_nodes= this_feat.shape[0],
edges = this_edges,
node_feat={
'node_feat':this_feat
}
)
# 需要返回的变量是
final_positive = np.array([1,2,4,5,6],dtype= 'int64').reshape(-1,1)
final_label = np.array([1,0,1,1,0],dtype= 'int64').reshape(-1,1)
indegree = g.indegree() # 返回当前图中点的度
# 处理D矩阵
norm = np.maximum(indegree.astype("float32"), 1)
norm = np.power(norm, -0.5)
g.node_feat["norm"] = np.expand_dims(norm, 1) # 添加了度矩阵,就是sqrt(d,-0.5)
start += 10
yield g.num_nodes,g.num_edges,g.edges,g.node_feat['node_feat'],g.node_feat['norm'],final_positive,final_label
only_loader = Loader('')
# 备用代码
train_num_nodes = F.data(name = "num_nodes", shape = [1], dtype='int64')
train_num_edges = F.data(name = "num_edges",shape = [1], dtype = 'int64')
train_edges = F.data(name = "edges", shape = [None, 2],dtype = 'int64')
if config.train_name =="gat":
train_node_feat = F.data(name="term_ids", shape= [None, HIDDEN_SIZE], dtype = 'float32')
else:
train_node_feat = F.data(name="term_ids", shape= [None, 64], dtype = 'int64')
train_inputs = F.data(name="inputs", shape=[None,1], dtype = 'int64')
train_norm = F.data(name = 'norm',shape= [None,1],dtype = 'float32')
train_labels = F.data(name = "fin",shape=[None, 1],dtype="int64")
test_num_nodes = F.data(name = "num_nodes", shape = [1], dtype='int64')
test_num_edges = F.data(name = "num_edges",shape = [1], dtype = 'int64')
test_edges = F.data(name = "edges", shape = [None, 2],dtype = 'int64')
if config.train_name =="gat":
test_node_feat = F.data(name="term_ids", shape= [None, HIDDEN_SIZE], dtype = 'float32')
else:
test_node_feat = F.data(name="term_ids", shape= [None, 64], dtype = 'int64')
test_inputs = F.data(name="inputs", shape=[None,1], dtype = 'int64')
test_norm = F.data(name = 'norm',shape= [None,1],dtype = 'float32')
test_labels = F.data(name = "fin",shape=[None, 1],dtype="int64")
if config.iterable:
# 若DataLoader可迭代,则必须设置places参数
if config.use_data_parallel:
# 若进行多GPU卡训练,则取所有的CUDAPlace
# 若进行多CPU核训练,则取多个CPUPlace,本例中取了8个CPUPlace
places = F.cuda_places() if config.use_cuda else F.cpu_places(8)
else:
# 若进行单GPU卡训练,则取单个CUDAPlace,本例中0代表0号GPU卡
# 若进行单CPU核训练,则取单个CPUPlace,本例中1代表1个CPUPlace
places = F.cuda_places(0) if config.use_cuda else F.cpu_places(1)
else:
places = None
# train & test 要使用不同的program
train_data_loader = F.io.DataLoader.from_generator(feed_list=[train_num_nodes,train_num_edges,train_edges,
train_node_feat,train_norm,train_inputs,train_labels],
capacity=64,
use_double_buffer=True,
iterable=config.iterable)
test_data_loader = F.io.DataLoader.from_generator(feed_list=[test_num_nodes,test_num_edges,test_edges,
test_node_feat,test_norm,test_inputs,test_labels],
capacity=64,
use_double_buffer=True,
iterable=config.iterable)
main_prog = F.Program()
start_prog =F.Program()
# 切换model
if config.train_name =="gat":
ermodel = gat(config,0.1)
else:
ermodel = ERModel(config,0.1)
with F.program_guard(main_prog, start_prog):
with F.unique_name.guard():
train_num_nodes = L.data("num_nodes", [1], False, 'int64')
train_num_edges = L.data("num_edges", [1], False, 'int64')
train_edges = L.data("edges", [-1, 2], False, 'int64')
if config.train_name =="gat":
train_node_feat = L.data("term_ids", [None, HIDDEN_SIZE], False, 'float32')
else:
train_node_feat = L.data("term_ids", [-1, 64], False, 'int64')
inputs = L.data("inputs", [-1], False, 'int64')
norm = L.data('norm',[-1,1],False,'float32')
labels = L.data("fin",
shape=[None, 1],
dtype="int64",
append_batch_size=False)
gw = BatchGraphWrapper(train_num_nodes, train_num_edges, train_edges, {"term_ids": train_node_feat,"norm":norm})
# feature,f2 = ermodel([gw], [inputs],[labels],phase = 'train')
loss,pred,acc = ermodel([gw], [train_inputs],[train_labels])
epoch_step = int((gw.node_feat['term_ids'].shape[0]) / config.batch_size) + 1
boundaries = [
i
for i in range(50 * epoch_step, config.epochs * epoch_step,
epoch_step * 50)
]
values = [config.lr * 0.5 ** i for i in range(0, len(boundaries) + 1)]
lr = L.piecewise_decay(boundaries=boundaries, values=values)
F.optimizer.Adam(lr).minimize(loss) # 想要运行必须换成F.Progeam()才可以
test_prog = F.Program()
test_startup = F.Program()
import os
from visualdl import LogWriter
place= F.CUDAPlace(0)
exe = F.Executor(place)
exe.run(start_prog)
exe.run(test_startup)
best_acc = 0.0
global_step = 0
train_prog = F.CompiledProgram(main_prog).with_data_parallel(loss_name=loss.name)
test_prog_com = F.CompiledProgram(test_prog).with_data_parallel(share_vars_from=train_prog)
places = F.cuda_places() if config.iterable else None
train_data_loader.set_batch_generator(only_loader.train_batch_reader,places= places)
import datetime
def run_iterable(program, exe, this_loss, this_ped,this_acc,data_loader,epoch,target = 'trian' ):
count = 0
ave_acc = []
for data in data_loader():
# pdb.set_trace()
t1 = datetime.datetime.now()
turn_loss,turn_pred,turn_acc = exe.run(program=program, feed=data, fetch_list=[this_loss, this_ped,this_acc])
t2 = datetime.datetime.now()
# print('训练一次的时间为{}'.format(t2-t1))
if count% 10 ==0:
# if 1==1:
print('this turn is {},epoch is {},idx is {},loss is {},acc is {}'.format(target,epoch,count,turn_loss[0],turn_acc[0]))
count += 1
for epoch_id in range(config.epochs):
t1 = datetime.datetime.now()
run_iterable(train_prog, exe, loss,pred,acc, train_data_loader,epoch = epoch_id,target = 'train')
t2 = datetime.datetime.now()
构建网络的部分
'''
构建gatmodel模型内容
'''
import paddle.fluid as F
import paddle.fluid.layers as L
from ernie import ErnieModel
import pgl.layers.conv as conv
import pgl
import numpy as np
from pgl.layers.conv import gin
class GATModel(object):
def __init__(self,config,drop_po):
self.config = config
self.num_tasks = 2
self.dropout_prob = drop_po
self.name = "ermodel"
self.edge_dropout = config.edge_dropout
self.num_layers = config.gat_num_layers
self.feat_droupout = config.feat_droupout
self.attn_drop = config.attn_drop
self.hidden_size = config.gat_hidden_size
def __call__(self, graph_wrappers, inputs,labelsssss,phase = None):
'''gin'''
print('gin')
self.gw = graph_wrappers[0]
self.features_list = [self.gw.node_feat["term_ids"]] # node feature
self.feature_sape = self.features_list[0]
for i in range(self.num_layers):
h = gin(self.gw,
self.features_list[i],
hidden_size=self.hidden_size,
activation="relu",
name="gin_%s" % (i),
init_eps=0.0,
train_eps=False)
h = L.layer_norm(
h,
begin_norm_axis=1,
param_attr=F.ParamAttr(
name="norm_scale_%s" % (i),
initializer=F.initializer.Constant(1.0)),
bias_attr=F.ParamAttr(
name="norm_bias_%s" % (i),
initializer=F.initializer.Constant(0.0)), )
h = L.relu(h)
self.features_list.append(h)
output = 0
for i, h in enumerate(self.features_list):
pooled_h = pgl.layers.graph_pooling(self.gw, h, 'sum')
drop_h = L.dropout(
pooled_h,
self.dropout_prob,
dropout_implementation="upscale_in_train")
drop_h = L.fc(drop_h,
self.config.hidden_size,
bias_attr=False,
param_attr=F.ParamAttr(name=self.name + '_final_weight'))
#[-1, hidden_size ]
final_feats = self.take_final_feature(drop_h, inputs, "v2_final_fc")
output += L.fc(final_feats,
size=self.num_tasks,
act=None,
param_attr=F.ParamAttr(name="final_fc_%s" %
(i)))
# calculate loss
# print(output.shape)
labels = L.data("fin",
shape=[None, 1],
dtype="int64",
append_batch_size=False)
# # 编写后几层的loss和其他网络
loss, pred = L.softmax_with_cross_entropy(logits=output, label=labels, return_softmax=True)
# pred = L.softmax(output)
# loss = L.cross_entropy(pred,labels)
acc = L.accuracy(input=pred, label=labels, k=1)
pred = L.argmax(pred, -1)
loss = L.mean(loss)
return loss,pred,acc#, self.term_ids
def take_final_feature(self, feature, index, name):
"""take final feature"""
feat = L.gather(feature, index, overwrite=False)
feat = linear(feat, self.config.hidden_size, name)
feat = L.l2_normalize(feat, axis=1)
return feat
def get_norm(indegree):
float_degree = L.cast(indegree, dtype="float32")
float_degree = L.clamp(float_degree, min=1.0)
norm = L.pow(float_degree, factor=-0.5)
return norm
def graphsage_sum(feature, gw, hidden_size, name, act):
# copy_send
msg = gw.send(lambda src, dst, edge: src["h"], nfeat_list=[("h", feature)])
# sum_recv
neigh_feature = gw.recv(msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))
self_feature = linear(feature, hidden_size, name+"_l", act)
neigh_feature = linear(neigh_feature, hidden_size, name+"_r", act)
output = L.concat([self_feature, neigh_feature], axis=1) # [B, 2H]
output = L.l2_normalize(output, axis=1)
return output
def linear(feat, hidden_size, name, act=None):
return L.fc(feat,
hidden_size,
act=act,
param_attr=F.ParamAttr(name=name + '_w'),
bias_attr=F.ParamAttr(name=name + '_b'))
配置参数
# Global Enviroment Settings
#
# trainer config ------
train_name: 'gat'
task: "node_classification"
batch_size: 4
gatbatch_size: 3400
epochs: 300
# data loader
data_path: 'file/'
use_cuda: True
iterable: True
use_data_parallel: True
# data config ------
graph_data: "./example_data/link_predict/graph_data.txt"
train_data: "./example_data/link_predict/train_data.txt"
graph_work_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
cls_id: 1
lr: 0.01
# gat model setting
edge_dropout: 0.08
gat_num_layers: 2
feat_droupout: 0.09
attn_drop: 0.0
gat_hidden_size: 32
# model config ------
samples: [10]
model_type: "ERNIESageV1"
max_seqlen: 40
num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "global_hinge"
margin: 0.1
neg_type: "batch_neg"
# infer config ------
#infer_model: "./output/last"
infer_batch_size: 128
# ernie config ------
encoding: "utf8"
# ernie_name: "ernie-1.0"
ernie_name: "ernie-tiny" #tiny is 4x faster than ernie1.0
#ernie_name: "./ernie1.0_model" # 不可连外网时,也可以把ERNIE github上的模型拉取到本地来热启
# runconfig
model_dir: "./output"
max_steps: 10000
save_steps: 100
log_steps: 1
max_ckpt: 1
skip_steps: 0
eval_steps: 10000
eval_max_steps: 1000000
run_steps: 0
# hparam
warmup_proportion: 0.1
weight_decay: 0.01
learning_rate: 0.00005
log_prefix: "training"
请问这个问题解决了吗
此代码运行在aistudio上,环境信息如下: CPU:4 RAM:32G GPU:V100 显存:16GB 磁盘: 100GB
Python版本:python3.7 paddle版本:v.1.80
模型代码为pgl中的gin模型,为官方示例,修改了一部分内容,代码如下:
上述代码在运行的过程中,有的时候可成功运行进行一次,反向传播后,就会报错。有的时候直接报错:
有的时候还会报下面这个错误:
ka 看监控,其实消耗并不是很大