PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (the PaddlePaddle (『飞桨』) core framework: high-performance single-machine and distributed training and cross-platform deployment for deep learning & machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

Cuda error(77), an illegal memory access was encountered #30513

Closed. waaaaaaater closed this issue 1 year ago.

waaaaaaater commented 3 years ago

This code runs on AI Studio; environment info: CPU: 4 cores, RAM: 32 GB, GPU: V100, GPU memory: 16 GB, disk: 100 GB

Python version: 3.7; Paddle version: v1.8.0

The model is the GIN model from PGL (the official example), with some parts modified; the code is as follows:

 def __call__(self, graph_wrappers, inputs,labelsssss,phase = None):
        '''gin'''
        self.gw = graph_wrappers[0]
        self.features_list = [self.gw.node_feat["term_ids"]] # node feature

        self.feature_sape = self.features_list[0]

        self.feature_list = []  # shapes of each layer's input feature
        for i in range(self.num_layers):
            self.feature_list.append(
                [self.features_list[i].shape[0], self.features_list[i].shape[1]])

            h = gin(self.gw,
                    self.features_list[i],
                    hidden_size=self.hidden_size,
                    activation="relu",
                    name="gin_%s" % (i),
                    init_eps=0.0,
                    train_eps=False)  # train_eps expects a bool; 'store_true' looks like an argparse leftover
            h = L.layer_norm(
                h,
                begin_norm_axis=1,
                param_attr=F.ParamAttr(
                    name="norm_scale_%s" % (i),
                    initializer=F.initializer.Constant(1.0)),
                bias_attr=F.ParamAttr(
                    name="norm_bias_%s" % (i),
                    initializer=F.initializer.Constant(0.0)), )

            h = L.relu(h)

            self.features_list.append(h)

        output = 0
        for i, h in enumerate(self.features_list):
            pooled_h = pgl.layers.graph_pooling(self.gw, h, 'sum')

            drop_h = L.dropout(
                pooled_h,
                self.dropout_prob,
                dropout_implementation="upscale_in_train")

            drop_h = L.fc(drop_h,
                                self.config.hidden_size,
                                 bias_attr=False,
                                 param_attr=F.ParamAttr(name=self.name + '_final_weight'))
        #[-1, hidden_size ]

            final_feats = self.take_final_feature(drop_h, inputs, "v2_final_fc")

            output += L.fc(final_feats,
                            size=self.num_tasks,
                            act=None,
                            param_attr=F.ParamAttr(name="final_fc_%s" %
                                                       (i)))

        labels = L.data("fin",
            shape=[None, 1],
            dtype="int64",
            append_batch_size=False)

        loss, pred = L.softmax_with_cross_entropy(logits=output, label=labels, return_softmax=True)
        # pred = L.softmax(output)
        # loss = L.cross_entropy(pred,labels)  
        acc = L.accuracy(input=pred, label=labels, k=1)
        pred = L.argmax(pred, -1)
        loss = L.mean(loss)

        return loss,pred,acc#, self.term_ids

def take_final_feature(self, feature, index, name):
        """take final feature"""
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat

def get_norm(indegree):
    float_degree = L.cast(indegree, dtype="float32")
    float_degree = L.clamp(float_degree, min=1.0)
    norm = L.pow(float_degree, factor=-0.5)
    return norm

def graphsage_sum(feature, gw, hidden_size, name, act):
    # copy_send
    msg = gw.send(lambda src, dst, edge: src["h"], nfeat_list=[("h", feature)])
    # sum_recv
    neigh_feature = gw.recv(msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))

    self_feature = linear(feature, hidden_size, name+"_l", act)
    neigh_feature = linear(neigh_feature, hidden_size, name+"_r", act)
    output = L.concat([self_feature, neigh_feature], axis=1) # [B, 2H]
    output = L.l2_normalize(output, axis=1)
    return output
def linear(feat, hidden_size, name, act=None):
    return L.fc(feat,
                hidden_size,
                act=act,
                param_attr=F.ParamAttr(name=name + '_w'),
                bias_attr=F.ParamAttr(name=name + '_b'))

While running the code above, sometimes one iteration completes successfully and the error appears right after backpropagation; other times it errors out immediately:


DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  from sample_data.data_prepare import DataSet  as  tokenC,ds
W0117 22:07:31.984557  2943 device_context.cc:252] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
W0117 22:07:31.989146  2943 device_context.cc:260] device: 0, cuDNN Version: 7.6.
2021-01-17 22:07:35,315-INFO: get pretrain dir from https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz
run.py:357: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  only_loader = Loader(config)
gin
gin
this turn is train,epoch is 0,idx is 0,loss is 0.7292255163192749,acc is 0.5094936490058899
/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py:1070: UserWarning: The following exception is not an EOF exception.
  "The following exception is not an EOF exception.")
Traceback (most recent call last):
  File "run.py", line 568, in <module>
    run_iterable(train_prog, exe, loss,pred,acc, train_data_loader,epoch = epoch_id,target = 'train')
  File "run.py", line 529, in run_iterable
    turn_loss,turn_pred,turn_acc = exe.run(program=program, feed=data, fetch_list=[this_loss, this_ped,this_acc])
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py", line 1071, in run
    six.reraise(*sys.exc_info())
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/six.py", line 703, in reraise
    raise value
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py", line 1066, in run
    return_merged=return_merged)
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py", line 1167, in _run_impl
    return_merged=return_merged)
  File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/paddle/fluid/executor.py", line 879, in _run_parallel
    tensors = exe.run(fetch_var_names, return_merged)._move_to_list()
paddle.fluid.core_avx.EnforceNotMet: 

--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0   std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1   paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2   paddle::platform::stream::CUDAStream::Wait() const
3   paddle::platform::CUDADeviceContext::Wait() const
4   paddle::framework::details::ScopeBufferedMonitor::Apply(std::function<void ()> const&, bool)
5   paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, bool)
6   paddle::framework::ParallelExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, bool)

----------------------
Error Message Summary:
----------------------
ExternalError:  Cuda error(77), an illegal memory access was encountered.
  [Advise: The device encountered a load or store instruction on an invalid memory address. This leaves the process in an inconsistentstate and any further CUDA work will return the same error. To continue using CUDA, the process must be terminated and relaunched. ] at (/paddle/paddle/fluid/platform/stream/cuda_stream.cc:65)

Sometimes it reports the following error instead:

get pretrain dir from https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz
run.py:30: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  from sample_data.data_prepare import DataSet  as  tokenC,ds
W0117 22:12:35.980592  3502 device_context.cc:252] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
W0117 22:12:35.984983  3502 device_context.cc:260] device: 0, cuDNN Version: 7.6.
2021-01-17 22:12:39,325-INFO: get pretrain dir from https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz
run.py:357: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  only_loader = Loader(config)
gin
gin
terminate called after throwing an instance of 'paddle::platform::EnforceNotMet'
  what():  

--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0   std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1   paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2   paddle::framework::details::OpHandleBase::~OpHandleBase()
3   paddle::framework::details::FetchOpHandle::~FetchOpHandle()
4   paddle::framework::ir::Node::~Node()
5   paddle::framework::ir::Node::~Node()
6   paddle::framework::details::ClearFetchOp(paddle::framework::ir::Graph*, std::vector<paddle::framework::details::OpHandleBase*, std::allocator<paddle::framework::details::OpHandleBase*> >*)
7   paddle::framework::details::FastThreadedSSAGraphExecutor::ExecutionFinal(std::vector<paddle::framework::details::OpHandleBase*, std::allocator<paddle::framework::details::OpHandleBase*> >*)
8   paddle::framework::details::FastThreadedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, bool)
9   paddle::framework::details::ScopeBufferedMonitor::Apply(std::function<void ()> const&, bool)
10  paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, bool)
11  paddle::framework::ParallelExecutor::Run(std::vector<std::string, std::allocator<std::string> > const&, bool)

----------------------
Error Message Summary:
----------------------
Error: An error occurred here. There is no accurate error hint for this error yet. We are continuously in the process of increasing hint for this kind of error check. It would be helpful if you could inform us of how this conversion went by opening a github issue. And we will resolve it with high priority.
  - New issue link: https://github.com/PaddlePaddle/Paddle/issues/new
  - Recommended issue content: all error stack information at (/paddle/paddle/fluid/framework/details/op_handle_base.cc:39)

W0117 22:12:43.881812  3502 init.cc:216] Warning: PaddlePaddle catches a failure signal, it may not work properly
W0117 22:12:43.881850  3502 init.cc:218] You could check whether you killed PaddlePaddle thread/process accidentally or report the case to PaddlePaddle
W0117 22:12:43.881852  3502 init.cc:221] The detail failure signal is:

W0117 22:12:43.881858  3502 init.cc:224] *** Aborted at 1610892763 (unix time) try "date -d @1610892763" if you are using GNU date ***
W0117 22:12:43.883653  3502 init.cc:224] PC: @                0x0 (unknown)
W0117 22:12:43.883749  3502 init.cc:224] *** SIGABRT (@0x3e800000dae) received by PID 3502 (TID 0x7fbaab9fa700) from PID 3502; stack trace: ***
W0117 22:12:43.885046  3502 init.cc:224]     @     0x7fbaab5e5390 (unknown)
W0117 22:12:43.886260  3502 init.cc:224]     @     0x7fbaab23f428 gsignal
W0117 22:12:43.887430  3502 init.cc:224]     @     0x7fbaab24102a abort
W0117 22:12:43.888377  3502 init.cc:224]     @     0x7fba6bfb384a __gnu_cxx::__verbose_terminate_handler()
W0117 22:12:43.889358  3502 init.cc:224]     @     0x7fba6bfb1f47 __cxxabiv1::__terminate()
W0117 22:12:43.890491  3502 init.cc:224]     @     0x7fba6bfb13a5 __cxa_call_terminate
W0117 22:12:43.891356  3502 init.cc:224]     @     0x7fba6bfb1bd8 __gxx_personality_v0
W0117 22:12:43.892112  3502 init.cc:224]     @     0x7fba6c2a4aab _Unwind_RaiseException_Phase2
W0117 22:12:43.892907  3502 init.cc:224]     @     0x7fba6c2a4f49 _Unwind_Resume
W0117 22:12:43.896039  3502 init.cc:224]     @     0x7fba3663c6c8 paddle::framework::details::OpHandleBase::~OpHandleBase()
W0117 22:12:43.898217  3502 init.cc:224]     @     0x7fba365bede1 paddle::framework::details::FetchOpHandle::~FetchOpHandle()
W0117 22:12:43.900923  3502 init.cc:224]     @     0x7fba3351cae9 paddle::framework::ir::Node::~Node()
W0117 22:12:43.904723  3502 init.cc:224]     @     0x7fba3351cc91 paddle::framework::ir::Node::~Node()
W0117 22:12:43.906497  3502 init.cc:224]     @     0x7fba365c2076 paddle::framework::details::ClearFetchOp()
W0117 22:12:43.907918  3502 init.cc:224]     @     0x7fba365bd27a paddle::framework::details::FastThreadedSSAGraphExecutor::ExecutionFinal()
W0117 22:12:43.910740  3502 init.cc:224]     @     0x7fba365bc1d2 paddle::framework::details::FastThreadedSSAGraphExecutor::Run()
W0117 22:12:43.911839  3502 init.cc:224]     @     0x7fba36513bdc _ZZN6paddle9framework7details29ScopeBufferedSSAGraphExecutor3RunERKSt6vectorISsSaISsEEbENKUlvE_clEv
W0117 22:12:43.915313  3502 init.cc:224]     @     0x7fba365180bf paddle::framework::details::ScopeBufferedMonitor::Apply()
W0117 22:12:43.919101  3502 init.cc:224]     @     0x7fba36514814 paddle::framework::details::ScopeBufferedSSAGraphExecutor::Run()
W0117 22:12:43.921908  3502 init.cc:224]     @     0x7fba3362f95d paddle::framework::ParallelExecutor::Run()
W0117 22:12:43.922355  3502 init.cc:224]     @     0x7fba332abcc7 _ZZN8pybind1112cpp_function10initializeIZN6paddle6pybindL22pybind11_init_core_avxERNS_6moduleEEUlRNS2_9framework16ParallelExecutorERKSt6vectorISsSaISsEEbE210_NS_6objectEIS8_SD_bEINS_4nameENS_9is_methodENS_7siblingEEEEvOT_PFT0_DpT1_EDpRKT2_ENUlRNS_6detail13function_callEE1_4_FUNESW_
W0117 22:12:43.923537  3502 init.cc:224]     @     0x7fba332f8139 pybind11::cpp_function::dispatcher()
W0117 22:12:43.923882  3502 init.cc:224]     @     0x5651b4dfa744 _PyMethodDef_RawFastCallKeywords
W0117 22:12:43.924118  3502 init.cc:224]     @     0x5651b4dfa861 _PyCFunction_FastCallKeywords
W0117 22:12:43.924343  3502 init.cc:224]     @     0x5651b4e666e8 _PyEval_EvalFrameDefault
W0117 22:12:43.924566  3502 init.cc:224]     @     0x5651b4daa539 _PyEval_EvalCodeWithName
W0117 22:12:43.924804  3502 init.cc:224]     @     0x5651b4df9f57 _PyFunction_FastCallKeywords
W0117 22:12:43.925050  3502 init.cc:224]     @     0x5651b4e628cc _PyEval_EvalFrameDefault
W0117 22:12:43.925257  3502 init.cc:224]     @     0x5651b4daa539 _PyEval_EvalCodeWithName
W0117 22:12:43.925449  3502 init.cc:224]     @     0x5651b4df9f57 _PyFunction_FastCallKeywords
W0117 22:12:43.925684  3502 init.cc:224]     @     0x5651b4e628cc _PyEval_EvalFrameDefault
W0117 22:12:43.925891  3502 init.cc:224]     @     0x5651b4daa539 _PyEval_EvalCodeWithName
Aborted (core dumped)

[Screenshot of the resource monitor.] Looking at the monitoring, resource consumption is actually not that high.
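A note on the error itself: since the illegal memory access is raised asynchronously, the C++ stack above only shows the stream synchronization point (CUDAStream::Wait), not the operator that actually faulted. A general CUDA debugging step, not specific to Paddle and offered here only as a sketch, is to force synchronous kernel launches before paddle is imported so the failure surfaces at the offending operator:

import os

# Assumption for illustration: CUDA_LAUNCH_BLOCKING is a standard CUDA runtime
# switch that makes every kernel launch synchronous, so the "illegal memory
# access" is reported at the faulting op rather than at a later stream wait.
# It must be set before paddle is imported.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import paddle.fluid as F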

Aurelius84 commented 3 years ago

Could you try training with version 1.8.5?

waaaaaaater commented 3 years ago

No luck. I tried it, but PGL errors out immediately on 1.8.5; they seem to be incompatible.

Aurelius84 commented 3 years ago

Could you check whether the labels passed to softmax_with_cross_entropy contain any values outside the valid range?

Also, could you try training on CPU to see whether it runs normally there?

waaaaaaater commented 3 years ago
loss, pred = L.softmax_with_cross_entropy(logits=output, label=labels, return_softmax=True)

In static-graph mode, how do I check whether the values going into softmax_with_cross_entropy are out of range? The error is thrown before the network finishes executing, so there is no way to inspect them via feed_list.

And it doesn't run on CPU at all.
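On why the label range matters: an out-of-range label would make the softmax_with_cross_entropy kernel index the logits out of bounds on the GPU, which can surface exactly as Cuda error(77). Since tensors cannot be fetched once a kernel has already faulted, the labels can instead be validated on the host side, in the numpy batch generator, before they are fed. A minimal sketch, assuming a generator shaped like the train_batch_reader pasted later in this thread (the helper check_labels is illustrative, not an existing Paddle API):

import numpy as np

def check_labels(batch_gen, num_classes=2):
    """Illustrative helper: walk the host-side generator and assert that every
    label lies in [0, num_classes) before anything is fed to the GPU."""
    for step, batch in enumerate(batch_gen()):
        labels = np.asarray(batch[-1])  # labels are yielded last by train_batch_reader
        assert labels.dtype == np.int64, \
            "step %d: labels should be int64, got %s" % (step, labels.dtype)
        assert 0 <= labels.min() and labels.max() < num_classes, \
            "step %d: label out of range [0, %d): min=%d max=%d" % (
                step, num_classes, labels.min(), labels.max())

# usage: check_labels(only_loader.train_batch_reader, num_classes=2)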

Aurelius84 commented 3 years ago

Could you provide the complete reproduction code and scripts? I can take a look on my side.

waaaaaaater commented 3 years ago

I can't upload attachments, so I can only paste it here. It can be run directly; you just need to save the files under the corresponding names.

The part that runs the network:


from  build_model import ERModel
from build_gatmodel import GATModel as gat
from easydict import EasyDict as edict
import paddle.fluid as F
import paddle.fluid.layers as L
from pre_loadmodel import PretrainedModelLoader # for loading pretrained parameters; can be examined in more detail later
import pgl
import paddle
import yaml
import numpy as np
from pgl.graph_wrapper import BatchGraphWrapper
import pandas  as pd 
import  re
import  os 

MAX_SEQUENCE = 64
HIDDEN_SIZE = 32

conf = 'erniesage_link_predict.yaml'
config = edict(yaml.load(open(conf), Loader=yaml.FullLoader))
#config.cls_id = 1
use_cuda = config.use_cuda
import re

import  numpy  as np
change_flag =1
# from test_file.test_token import DataSet  as  tokenC
import datetime   
class Loader(object):
    def __init__(self,config):
        pass
    def  train_batch_reader(self):
        start = 0 
        while start <int(50):
            if 1==1:
                # build 10 nodes with random features
                this_feat = np.array(np.random.random((10,32)))
                # build the edges (all 10x10 node pairs)
                this_edges = []
                for i in range(10):
                    for  j in range(10):
                        this_edges.append([i,j])

                this_feat = np.array(this_feat, dtype="float32")
                g = pgl.graph.Graph(
                    num_nodes=  this_feat.shape[0],
                    edges = this_edges,
                    node_feat={
                        'node_feat':this_feat
                    }
                )
                # variables that need to be returned
                final_positive = np.array([1,2,4,5,6],dtype= 'int64').reshape(-1,1)
                final_label = np.array([1,0,1,1,0],dtype= 'int64').reshape(-1,1)
                indegree = g.indegree()  # in-degree of every node in the graph
                # build the D^-0.5 normalization
                norm = np.maximum(indegree.astype("float32"), 1)
                norm = np.power(norm, -0.5)
                g.node_feat["norm"] = np.expand_dims(norm, 1)  # attach the degree norm, i.e. d^-0.5
            start += 10
            yield g.num_nodes,g.num_edges,g.edges,g.node_feat['node_feat'],g.node_feat['norm'],final_positive,final_label

only_loader = Loader('')

# backup code (feed placeholders)
train_num_nodes = F.data(name = "num_nodes", shape = [1],  dtype='int64')
train_num_edges = F.data(name = "num_edges",shape =  [1],  dtype = 'int64')
train_edges = F.data(name = "edges", shape = [None, 2],dtype =  'int64')
if config.train_name =="gat":
    train_node_feat = F.data(name="term_ids", shape= [None, HIDDEN_SIZE], dtype =  'float32')
else:
    train_node_feat = F.data(name="term_ids", shape= [None, 64], dtype =  'int64')

train_inputs = F.data(name="inputs", shape=[None,1],  dtype = 'int64')
train_norm = F.data(name = 'norm',shape= [None,1],dtype = 'float32')
train_labels = F.data(name = "fin",shape=[None, 1],dtype="int64")

test_num_nodes = F.data(name = "num_nodes", shape = [1],  dtype='int64')
test_num_edges = F.data(name = "num_edges",shape =  [1],  dtype = 'int64')
test_edges = F.data(name = "edges", shape = [None, 2],dtype =  'int64')
if config.train_name =="gat":
    test_node_feat = F.data(name="term_ids", shape= [None, HIDDEN_SIZE], dtype =  'float32')
else:
    test_node_feat = F.data(name="term_ids", shape= [None, 64], dtype =  'int64')

test_inputs = F.data(name="inputs", shape=[None,1],  dtype = 'int64')
test_norm = F.data(name = 'norm',shape= [None,1],dtype = 'float32')
test_labels = F.data(name = "fin",shape=[None, 1],dtype="int64")

if config.iterable:
    # if the DataLoader is iterable, the places argument must be set
    if config.use_data_parallel:
        # multi-GPU training: use all CUDAPlaces
        # multi-core CPU training: use multiple CPUPlaces (8 here)
        places = F.cuda_places() if config.use_cuda else F.cpu_places(8)
    else:
        # single-GPU training: use a single CUDAPlace (0 means GPU card 0)
        # single-core CPU training: use a single CPUPlace
        places = F.cuda_places(0) if config.use_cuda else F.cpu_places(1)
else:

    places = None

# train & test must use different programs
train_data_loader = F.io.DataLoader.from_generator(feed_list=[train_num_nodes,train_num_edges,train_edges,
                                                        train_node_feat,train_norm,train_inputs,train_labels],
                                             capacity=64,
                                             use_double_buffer=True,
                                             iterable=config.iterable)
test_data_loader = F.io.DataLoader.from_generator(feed_list=[test_num_nodes,test_num_edges,test_edges,
                                                        test_node_feat,test_norm,test_inputs,test_labels],
                                             capacity=64,
                                             use_double_buffer=True,
                                             iterable=config.iterable)

main_prog =  F.Program()
start_prog =F.Program()
# switch between models
if config.train_name =="gat":
    ermodel = gat(config,0.1)
else:
    ermodel = ERModel(config,0.1)
with F.program_guard(main_prog, start_prog):
    with F.unique_name.guard():

        train_num_nodes = L.data("num_nodes", [1], False, 'int64')
        train_num_edges = L.data("num_edges", [1], False, 'int64')
        train_edges = L.data("edges", [-1, 2], False, 'int64')
        if config.train_name =="gat":
            train_node_feat = L.data("term_ids", [None, HIDDEN_SIZE], False, 'float32')
        else:
            train_node_feat = L.data("term_ids", [-1, 64], False, 'int64')

        inputs = L.data("inputs", [-1], False, 'int64')
        norm = L.data('norm',[-1,1],False,'float32')
        labels = L.data("fin",
            shape=[None, 1],
            dtype="int64",
            append_batch_size=False)

        gw = BatchGraphWrapper(train_num_nodes, train_num_edges, train_edges, {"term_ids": train_node_feat,"norm":norm})
        # feature,f2 = ermodel([gw], [inputs],[labels],phase = 'train')

        loss,pred,acc = ermodel([gw], [train_inputs],[train_labels])
        epoch_step = int((gw.node_feat['term_ids'].shape[0]) / config.batch_size) + 1
        boundaries = [
            i
            for i in range(50 * epoch_step, config.epochs * epoch_step,
                        epoch_step * 50)
        ]
        values = [config.lr * 0.5 ** i for i in range(0, len(boundaries) + 1)]
        lr = L.piecewise_decay(boundaries=boundaries, values=values)
        F.optimizer.Adam(lr).minimize(loss)  # to run, this has to be placed under an F.Program()
test_prog = F.Program()
test_startup = F.Program()

import os
from visualdl import LogWriter
place= F.CUDAPlace(0)
exe = F.Executor(place)
exe.run(start_prog)
exe.run(test_startup)
best_acc = 0.0
global_step = 0

train_prog = F.CompiledProgram(main_prog).with_data_parallel(loss_name=loss.name)
test_prog_com = F.CompiledProgram(test_prog).with_data_parallel(share_vars_from=train_prog)

places = F.cuda_places() if config.iterable else None

train_data_loader.set_batch_generator(only_loader.train_batch_reader,places= places)

import datetime 
def run_iterable(program, exe, this_loss, this_ped, this_acc, data_loader, epoch, target='train'):

    count = 0
    ave_acc = []
    for data in data_loader():
        # pdb.set_trace()
        t1 = datetime.datetime.now()
        turn_loss,turn_pred,turn_acc = exe.run(program=program, feed=data, fetch_list=[this_loss, this_ped,this_acc])
        t2 = datetime.datetime.now()
        # print('time for one training step: {}'.format(t2-t1))
        if count% 10 ==0:
        # if 1==1:
            print('this turn is {},epoch is {},idx is {},loss is {},acc is {}'.format(target,epoch,count,turn_loss[0],turn_acc[0]))
        count += 1

for epoch_id in range(config.epochs):
    t1 = datetime.datetime.now()
    run_iterable(train_prog, exe, loss,pred,acc, train_data_loader,epoch = epoch_id,target = 'train')
    t2 = datetime.datetime.now()

The part that builds the network:

'''
GATModel (GIN) definition
'''

import paddle.fluid as F
import paddle.fluid.layers as L
from ernie import ErnieModel
import pgl.layers.conv as conv
import pgl
import  numpy as np
from pgl.layers.conv import  gin
class   GATModel(object):
    def __init__(self,config,drop_po):
        self.config = config
        self.num_tasks = 2
        self.dropout_prob = drop_po
        self.name = "ermodel"
        self.edge_dropout = config.edge_dropout
        self.num_layers = config.gat_num_layers
        self.feat_droupout = config.feat_droupout
        self.attn_drop  = config.attn_drop 
        self.hidden_size = config.gat_hidden_size

    def __call__(self, graph_wrappers, inputs,labelsssss,phase = None):
        '''gin'''
        print('gin')
        self.gw = graph_wrappers[0]
        self.features_list = [self.gw.node_feat["term_ids"]] # node feature

        self.feature_sape = self.features_list[0]

        for i in range(self.num_layers):

            h = gin(self.gw,
                    self.features_list[i],
                    hidden_size=self.hidden_size,
                    activation="relu",
                    name="gin_%s" % (i),
                    init_eps=0.0,
                    train_eps=False)
            h = L.layer_norm(
                h,
                begin_norm_axis=1,
                param_attr=F.ParamAttr(
                    name="norm_scale_%s" % (i),
                    initializer=F.initializer.Constant(1.0)),
                bias_attr=F.ParamAttr(
                    name="norm_bias_%s" % (i),
                    initializer=F.initializer.Constant(0.0)), )

            h = L.relu(h)

            self.features_list.append(h)

        output = 0
        for i, h in enumerate(self.features_list):
            pooled_h = pgl.layers.graph_pooling(self.gw, h, 'sum')

            drop_h = L.dropout(
                pooled_h,
                self.dropout_prob,
                dropout_implementation="upscale_in_train")

            drop_h = L.fc(drop_h,
                                self.config.hidden_size,
                                 bias_attr=False,
                                 param_attr=F.ParamAttr(name=self.name + '_final_weight'))
        #[-1, hidden_size ]

            final_feats = self.take_final_feature(drop_h, inputs, "v2_final_fc")

            output += L.fc(final_feats,
                            size=self.num_tasks,
                            act=None,
                            param_attr=F.ParamAttr(name="final_fc_%s" %
                                                       (i)))

        # calculate loss

        # print(output.shape)

        labels = L.data("fin",
            shape=[None, 1],
            dtype="int64",
            append_batch_size=False)
        # # loss for the final layers and the rest of the network
        loss, pred = L.softmax_with_cross_entropy(logits=output, label=labels, return_softmax=True)
        # pred = L.softmax(output)
        # loss = L.cross_entropy(pred,labels)  
        acc = L.accuracy(input=pred, label=labels, k=1)
        pred = L.argmax(pred, -1)
        loss = L.mean(loss)

        return loss,pred,acc#, self.term_ids

    def take_final_feature(self, feature, index, name):
        """take final feature"""
        feat = L.gather(feature, index, overwrite=False)
        feat = linear(feat, self.config.hidden_size, name)
        feat = L.l2_normalize(feat, axis=1)
        return feat

def get_norm(indegree):
    float_degree = L.cast(indegree, dtype="float32")
    float_degree = L.clamp(float_degree, min=1.0)
    norm = L.pow(float_degree, factor=-0.5)
    return norm

def graphsage_sum(feature, gw, hidden_size, name, act):
    # copy_send
    msg = gw.send(lambda src, dst, edge: src["h"], nfeat_list=[("h", feature)])
    # sum_recv
    neigh_feature = gw.recv(msg, lambda feat: L.sequence_pool(feat, pool_type="sum"))

    self_feature = linear(feature, hidden_size, name+"_l", act)
    neigh_feature = linear(neigh_feature, hidden_size, name+"_r", act)
    output = L.concat([self_feature, neigh_feature], axis=1) # [B, 2H]
    output = L.l2_normalize(output, axis=1)
    return output
def linear(feat, hidden_size, name, act=None):
    return L.fc(feat,
                hidden_size,
                act=act,
                param_attr=F.ParamAttr(name=name + '_w'),
                bias_attr=F.ParamAttr(name=name + '_b'))

The configuration file:

# Global Environment Settings
#
# trainer config ------
train_name: 'gat'
task: "node_classification"
batch_size: 4
gatbatch_size: 3400

epochs: 300
# data loader
data_path: 'file/'
use_cuda: True
iterable: True
use_data_parallel: True
# data config ------
graph_data: "./example_data/link_predict/graph_data.txt"
train_data: "./example_data/link_predict/train_data.txt"

graph_work_path: "./workdir"
sample_workers: 1
use_pyreader: true
input_type: "text"
cls_id: 1
lr: 0.01

# gat model setting
edge_dropout: 0.08
gat_num_layers: 2
feat_droupout: 0.09
attn_drop: 0.0
gat_hidden_size: 32
# model config ------
samples: [10]
model_type: "ERNIESageV1"

max_seqlen: 40

num_layers: 1
hidden_size: 128
final_fc: true
final_l2_norm: true
loss_type: "global_hinge"
margin: 0.1
neg_type: "batch_neg"

# infer config ------
#infer_model: "./output/last"
infer_batch_size: 128

# ernie config ------
encoding: "utf8"
# ernie_name: "ernie-1.0"
ernie_name: "ernie-tiny" #tiny is 4x faster than ernie1.0
#ernie_name: "./ernie1.0_model" # when there is no external network access, you can also pull the model from the ERNIE GitHub repo and warm-start from a local copy

# runconfig 
model_dir: "./output"
max_steps: 10000
save_steps: 100
log_steps: 1
max_ckpt: 1
skip_steps: 0
eval_steps: 10000
eval_max_steps: 1000000
run_steps: 0

# hparam
warmup_proportion:  0.1
weight_decay: 0.01
learning_rate: 0.00005
log_prefix: "training"
suntao2015005848 commented 2 years ago

Has this problem been solved?