PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (the 『飞桨』 core framework: high-performance single-machine and distributed training and cross-platform deployment for deep learning & machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

Error during MPI training: the output of the gradient_clip clip_by_norm operator contains NaN #21229

Closed · maosengshulei closed this 1 year ago

maosengshulei commented 4 years ago

The model code is as follows:

```python
#!/usr/bin/python
# coding=utf-8

import math
from functools import reduce
from itertools import starmap

import numpy as np
import paddle
import paddle.fluid as fluid
```

def DNN(args, feat_list):
    dense_feature_dim = reduce(lambda x, y: x + y, [1 for _, feat in feat_list.items() if feat.value_type == 0])

    dense_feature = fluid.layers.data(name="dense_input", shape=[dense_feature_dim], dtype='float32')

    sparse_feature = [fluid.layers.data(name=feat.prefix, shape=[1], lod_level=1, dtype='int64') for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_weight_feature = [fluid.layers.data(name=feat.prefix + '@index', shape=[1], lod_level=1, dtype='int64') for _, feat in feat_list.items() if feat.value_type == 2]
    sparse_weight_value = [fluid.layers.data(name=feat.prefix + '@value', shape=[1], lod_level=1, dtype='float32') for _, feat in feat_list.items() if feat.value_type == 2]

    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    duration = fluid.layers.data(name='duration', shape=[1], dtype='float32')

    data_list = [dense_feature] + sparse_feature + sparse_weight_feature + sparse_weight_value + [label] + [duration]

    if args.is_infer:
        data_list = [dense_feature] + sparse_feature + sparse_weight_feature + sparse_weight_value

    sparse_feature_names = [feat.prefix for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_feature_size = [feat.dimension for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_feature_embsize = [feat.emb_size for _, feat in feat_list.items() if feat.value_type == 1]
    sparse_weight_feature_names = [feat.prefix for _, feat in feat_list.items() if feat.value_type == 2]
    sparse_weight_feature_size = [feat.dimension for _, feat in feat_list.items() if feat.value_type == 2]
    sparse_weight_feature_embsize = [feat.emb_size for _, feat in feat_list.items() if feat.value_type == 2]

    def embedding_layer(input, attr_name, input_size, emb_size):
        """Sparse embedding lookup followed by max sequence pooling."""
        param_name = attr_name + '_table'
        sparse_field_param_attr = fluid.param_attr.ParamAttr(name=param_name, initializer=fluid.initializer.Normal(scale=1 / math.sqrt(input_size)))
        emb = fluid.layers.embedding(input=input, dtype='float32', size=[input_size, emb_size], param_attr=sparse_field_param_attr, is_sparse=True)
        return fluid.layers.sequence_pool(input=emb, pool_type='Max')

    def mmoe_block(input, expert_num, task_num):
        input_dim = input.shape[1]

        experts_input_list = [input] * expert_num
        experts_input = fluid.layers.stack(experts_input_list, axis=1)
        temp_experts_input = fluid.layers.unsqueeze(input=experts_input, axes=[2])
        experts_input_matrix = fluid.layers.transpose(temp_experts_input, perm=[2, 1, 0, 3])  # [1, expert_num, batch_size, input_dim]

        w = fluid.layers.create_parameter(shape=[expert_num, input_dim, input_dim // 2], dtype='float32',
                attr=fluid.param_attr.ParamAttr(name='mmoe_first_layer', gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                default_initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (input_dim ** 0.5)))
        w1 = fluid.layers.unsqueeze(input=w, axes=[0])
        # w1 = fluid.layers.stack([w] * batch_size, axis=0)
        hidden1 = fluid.layers.matmul(experts_input_matrix, w1)
        # hidden1_activation = fluid.layers.relu(hidden1)  # [1, expert_num, batch_size, input_dim//2]

        hidden1_flatten = fluid.layers.reshape(fluid.layers.transpose(fluid.layers.squeeze(hidden1, axes=[0]), perm=[1, 0, 2]), shape=[-1, expert_num * (input_dim // 2)])

        hidden1_batchnorm = fluid.layers.batch_norm(input=hidden1_flatten, act='relu', is_test=not args.is_train, name='mmoe_hidden1_batchnorm')  # [batch_size, expert_num*input_dim//2]

        hidden1_output = fluid.layers.unsqueeze(fluid.layers.transpose(fluid.layers.reshape(hidden1_batchnorm, shape=[-1, expert_num, input_dim // 2]), perm=[1, 0, 2]), axes=[0])  # [1, expert_num, batch_size, input_dim//2]

        v = fluid.layers.create_parameter(shape=[expert_num, input_dim // 2, input_dim // 4], dtype='float32',
                attr=fluid.param_attr.ParamAttr(name='mmoe_second_layer', gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                default_initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / ((input_dim // 2) ** 0.5)))
        v1 = fluid.layers.unsqueeze(input=v, axes=[0])
        # v1 = fluid.layers.stack([v] * batch_size, axis=0)

        hidden2 = fluid.layers.matmul(hidden1_output, v1)
        # hidden2_activation = fluid.layers.relu(hidden2)  # [1, expert_num, batch_size, output_dim]
        hidden2_flatten = fluid.layers.reshape(fluid.layers.transpose(fluid.layers.squeeze(hidden2, axes=[0]), perm=[1, 0, 2]), shape=[-1, expert_num * (input_dim // 4)])

        hidden2_batchnorm = fluid.layers.batch_norm(input=hidden2_flatten, act='relu', is_test=not args.is_train, name='mmoe_hidden2_batchnorm')  # [batch_size, expert_num*input_dim//4]

        hidden2_output = fluid.layers.reshape(hidden2_batchnorm, shape=[-1, expert_num, input_dim // 4])

        # output = fluid.layers.squeeze(hidden2_transpose, axes=[2])  # [batch_size, expert_num, output_dim]
        # gated_kernels = []
        final_output = []

        for i in range(task_num):
            fc = fluid.layers.fc(name='gate_kernel_task_fc_{}'.format(i), input=input, size=expert_num)

            gated_softmax_output = fluid.layers.softmax(input=fc, name='gate_kernel_task_softmax_{}'.format(i), axis=1)  # [batch_size, expert_num]

            # fluid.layers.Print(gated_softmax_output)

            weighted_output = fluid.layers.elementwise_mul(hidden2_output, gated_softmax_output, axis=0)  # [batch_size, expert_num, output_dim]
            agregate_output = fluid.layers.reduce_sum(weighted_output, dim=1, name='gate_kernel_task_agregate_{}'.format(i))

            final_output.append(agregate_output)

        return final_output

    sparse_weight_embedding_list = []
    for raw_sparse_weight_feat, raw_sparse_weight_val, feature_name, feature_dimension, feature_embsize in zip(sparse_weight_feature, sparse_weight_value, sparse_weight_feature_names, sparse_weight_feature_size, sparse_weight_feature_embsize):
        param_name = feature_name + '_table'
        sparse_weight_param_attr = fluid.param_attr.ParamAttr(name=param_name, initializer=fluid.initializer.Normal(scale=1 / math.sqrt(feature_dimension)), gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0))
        # sparse_weight_val = fluid.layers.unsqueeze(input=raw_sparse_weight_val, axes=[2])
        emb = fluid.layers.embedding(input=raw_sparse_weight_feat, dtype='float32', size=[feature_dimension, feature_embsize], param_attr=sparse_weight_param_attr, is_sparse=True)

        weighted_sparse_weight_embedding = emb * raw_sparse_weight_val
        sparse_weight_embedding = fluid.layers.sequence_pool(input=weighted_sparse_weight_embedding, pool_type='Max')
        sparse_weight_embedding_list.append(sparse_weight_embedding)

    sparse_embedding_list = list(starmap(embedding_layer, zip(sparse_feature, sparse_feature_names, sparse_feature_size, sparse_feature_embsize)))

    concated = fluid.layers.concat(sparse_embedding_list + sparse_weight_embedding_list + [dense_feature], axis=1)

    united_fc = fluid.layers.fc(input=concated, size=512, act=None,
                                param_attr=fluid.ParamAttr(learning_rate=1.0,
                                                           initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (512 ** 0.5)),
                                                           gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                                bias_attr=fluid.ParamAttr(learning_rate=1.0,
                                                          initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (512 ** 0.5)),
                                                          gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                                name='united_fc')
    united_bn = fluid.layers.batch_norm(input=united_fc, act='relu', is_test=not args.is_train, name='united_bn')

    fluid.layers.Print(united_fc, summarize=512)
    fluid.layers.Print(united_bn, summarize=512)

    ctr_fc, duration_fc = mmoe_block(united_bn, args.expert_num, 2)
    fluid.layers.Print(ctr_fc, summarize=128)

    ctr_fc1 = fluid.layers.fc(input=ctr_fc, size=64, act=None,
                              param_attr=fluid.ParamAttr(learning_rate=1.0,
                                                         initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                                         gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                              bias_attr=fluid.ParamAttr(learning_rate=1.0,
                                                        initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                                        gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                              name='ctr_fc1')
    ctr_bn1 = fluid.layers.batch_norm(input=ctr_fc1, act='relu', is_test=not args.is_train, name='ctr_bn1')

    duration_fc1 = fluid.layers.fc(input=duration_fc, size=64, act=None,
                                   param_attr=fluid.ParamAttr(learning_rate=1.0,
                                                              initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                                              gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                                   bias_attr=fluid.ParamAttr(learning_rate=1.0,
                                                             initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=0.2 / (64 ** 0.5)),
                                                             gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                                   name='duration_fc1')
    duration_bn1 = fluid.layers.batch_norm(input=duration_fc1, act='relu', is_test=not args.is_train, name='duration_bn1')
    fluid.layers.Print(ctr_fc1, summarize=64)
    fluid.layers.Print(ctr_bn1, summarize=64)
    ctr_predict = fluid.layers.fc(input=ctr_bn1, size=2, act="softmax",
                                  param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(ctr_bn1.shape[1])),
                                                             gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                                  name='ctr_predict')
    fluid.layers.Print(ctr_predict)

    dura_predict = fluid.layers.fc(input=duration_bn1, size=1, act=None,
                                   param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(scale=1 / math.sqrt(duration_bn1.shape[1])),
                                                              gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)),
                                   name='dura_predict')
    # fluid.layers.Print(dura_predict)

    if args.is_train:
        ctr_cost = fluid.layers.cross_entropy(input=ctr_predict, label=data_list[-2])
        avg_ctr_cost = fluid.layers.reduce_sum(ctr_cost)
        dura_cost = fluid.layers.square_error_cost(input=dura_predict, label=data_list[-1])
        avg_dura_cost = fluid.layers.reduce_sum(dura_cost)
        total_cost = avg_ctr_cost + 0.05 * avg_dura_cost
        accuracy = fluid.layers.accuracy(input=ctr_predict, label=data_list[-2])
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=ctr_predict, label=data_list[-2], num_thresholds=2 ** 12, slide_steps=20)

        return total_cost, avg_ctr_cost, avg_dura_cost, auc_var, batch_auc_var, accuracy, ctr_predict, dura_predict, data_list
    elif args.is_infer:
        return ctr_predict, dura_predict, data_list
    else:
        ctr_cost = fluid.layers.cross_entropy(input=ctr_predict, label=data_list[-2])
        avg_ctr_cost = fluid.layers.reduce_sum(ctr_cost)
        dura_cost = fluid.layers.square_error_cost(input=dura_predict, label=data_list[-1])
        avg_dura_cost = fluid.layers.reduce_sum(dura_cost)
        total_cost = avg_ctr_cost + 0.05 * avg_dura_cost
        accuracy = fluid.layers.accuracy(input=ctr_predict, label=data_list[-2])
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=ctr_predict, label=data_list[-2], num_thresholds=2 ** 12, slide_steps=20)

        return total_cost, avg_ctr_cost, avg_dura_cost, auc_var, batch_auc_var, accuracy, ctr_predict, dura_predict, data_list
```

Training was tested with both the SGD and Adam optimizers, with the learning rate set to 1e-6. Under these settings it consistently fails with: `PaddleCheckError: Operator clip_by_norm output Tensor clip_by_norm_63.tmp_0 contains NAN at [/paddle/paddle/fluid/framework/operator.cc:848] [operator < clip_by_norm > error]`
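For intuition about why the check fires inside clip_by_norm rather than being fixed by it, here is a rough numpy sketch of the clip-by-norm formula (not the Paddle kernel itself): once a gradient already carries Inf/NaN, rescaling it by clip_norm / max(||g||, clip_norm) only turns it into NaN.

```python
import numpy as np

def clip_by_norm(x, clip_norm=2.0):
    # out = x * clip_norm / max(||x||, clip_norm), i.e. unchanged if the norm
    # is already <= clip_norm, otherwise rescaled down to norm clip_norm.
    norm = np.sqrt(np.sum(np.square(x)))
    return x * clip_norm / max(norm, clip_norm)

print(clip_by_norm(np.array([3.0, 4.0])))     # [1.2 1.6]  (norm 5 -> 2)
print(clip_by_norm(np.array([1.0, np.inf])))  # [ 0. nan]  clipping cannot repair Inf/NaN
```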

seiriosPlus commented 4 years ago

Try removing the clip?

maosengshulei commented 4 years ago

> Try removing the clip?

The clip was added precisely because the embedding layer was previously producing Inf, which often turned the predictions into NaN and caused errors.
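In case it helps to pin down which table goes bad first, here is a rough debugging sketch against the Paddle 1.x fluid API (the helper name `find_bad_params` is made up here; with distributed sparse tables the parameter values may live on the parameter server rather than in the trainer's local scope):

```python
import numpy as np
import paddle.fluid as fluid

def find_bad_params(program=None, scope=None):
    """Return the names of parameters whose local values contain Inf/NaN."""
    program = program or fluid.default_main_program()
    scope = scope or fluid.global_scope()
    bad = []
    for param in program.global_block().all_parameters():
        var = scope.find_var(param.name)
        if var is None:
            continue  # e.g. remote sparse tables are not materialized locally
        value = np.array(var.get_tensor())
        if not np.isfinite(value).all():
            bad.append(param.name)
    return bad

# e.g. after each exe.run(...): print(find_bad_params())
```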

seiriosPlus commented 4 years ago

Does single-machine training also produce Inf/NaN?

maosengshulei commented 4 years ago

> Does single-machine training also produce Inf/NaN?

Single-machine training is fine. With MPI, NaN appears occasionally when only part of the data (30%) is used, and always appears with the full dataset.

seiriosPlus commented 4 years ago

Does single-machine training still produce NaN after adding the clip?

maosengshulei commented 4 years ago

> Does single-machine training still produce NaN after adding the clip?

Single-machine training did not produce NaN before adding the clip either, and it is still fine after adding it, though that may simply be because the data volume is small.

maosengshulei commented 4 years ago

Additional note: in MPI training, without gradient_clip on the embedding layer the error is: `Operator adam output Tensor match&seccate#bayes_ctr_table_moment2_0 contains Inf at [/paddle/paddle/fluid/framework/operator.cc:846]`

With gradient_clip added, the error becomes: `Operator clip_by_norm output Tensor clip_by_norm_52.tmp_0 contains NAN at [/paddle/paddle/fluid/framework/operator.cc:848]`
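Just an observation on how the two messages could fit together (not a confirmed diagnosis): moment2 is Adam's running average of the squared gradient, so a gradient that is still finite in float32 can already overflow the moment2 accumulator when squared; once the gradient itself reaches Inf/NaN, the clip op is where the check fires instead. A tiny numpy illustration of the squaring overflow:

```python
import numpy as np

g = np.float32(3e19)    # large gradient value, still finite in float32
print(np.isinf(g))      # False
print(np.isinf(g * g))  # True: 9e38 exceeds the float32 maximum (~3.4e38)
```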

maosengshulei commented 4 years ago

Further note: single-node MPI training runs normally; multi-node training always reproduces the errors above.