PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (the PaddlePaddle core framework: high-performance single-machine and distributed training, and cross-platform deployment, for deep learning and machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

Cluster training of a pairwise ranking task prints pos/neg and cost as 0 #6570

Closed 20092136 closed 6 years ago

20092136 commented 6 years ago

With the same configuration, a single-machine run prints the metrics normally, but in the cluster training logs the metrics are abnormal, as follows:

I1213 14:44:38.848351 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848542 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848572 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848592 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848611 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848630 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848650 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848671 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848692 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848714 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848737 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848758 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848779 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848800 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848821 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:38.848843 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 14:44:45.373313 30749 TrainerInternal.cpp:181] Pass=2 Batch=13963 samples=1787251 AvgCost=0 Eval:

My configuration is as follows:

# Imports assumed by this snippet (not shown in the original paste):
from os.path import join as join_path
from paddle.trainer_config_helpers import *

def dssm_data(data_dir=None,
              is_test=False,
              is_predict=False,
              train_list="train.list",
              test_list="test.list",
              predict_list="predict.list",
              dict_file="75_53_27_9-11_wise_clean.txt",
              async_load_data=True):
    """
    provide data
    """
    dict_dim = len(open(join_path(data_dir, dict_file)).readlines())

    if is_predict:
        return dict_dim
    if data_dir is not None:
        train_list = join_path(data_dir, train_list)
        test_list = join_path(data_dir, test_list)
        dict_file = join_path(data_dir, dict_file)

    train_list = train_list if not is_predict else predict_list
    train_list = train_list if not is_test else None

    define_py_data_sources2(train_list, None,
            module="dataprovider", obj="process",
            args={'dictionary': dict_file})

    return dict_dim

Network configuration:

def softsign(layeroutput):
    # |x|
    with mixed_layer(act=AbsActivation()) as absl:
        absl += identity_projection(layeroutput)
    # 1 + |x|
    denominator = layer_math.add(absl, 1.0)
    # log(1 + |x|)
    with mixed_layer(act=LogActivation()) as log_denom:
        log_denom += identity_projection(denominator)
    # exp(-log(1 + |x|))
    with mixed_layer(act=ExpActivation()) as exp_log_denom:
        exp_log_denom += identity_projection(layer_math.mul(log_denom, -1.0))
    # x / (1 + |x|)
    with mixed_layer() as softsign:
        softsign += dotmul_operator(a=exp_log_denom, b=layeroutput, scale=1.0)
    return softsign
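
# Why this chain equals softsign: exp(-log(1 + |x|)) = 1 / (1 + |x|), so the
# final dotmul yields x / (1 + |x|). A standalone NumPy check of the identity
# (illustrative only; not part of the original config):
import numpy as np
_x = np.linspace(-5.0, 5.0, 11)
assert np.allclose(np.exp(-np.log(1.0 + np.abs(_x))) * _x,
                   _x / (1.0 + np.abs(_x)))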

def make_emb(data, dim, prefix):
    return embedding_layer(
        input=data, size=dim,
        param_attr=ParamAttr(name='%s_emb.w' % prefix,
                             learning_rate=5e-5, sparse_update=True))

def make_bow(emb):
    return pooling_layer(input=emb, pooling_type=SumPooling())

def dssm_net(dict_dim,
             emb_dim=128,
             is_predict=False):
    """
    querysim network
    """
    linear = LinearActivation()
    relu = ReluActivation()
    tanh = TanhActivation()
    share_semantic_generator = True    
    prefix = '_ _'.split() if share_semantic_generator else 'query title'.split()

    q_data = data_layer('query', dict_dim)
    q_emb = make_emb(q_data, emb_dim, 'pairwise')
    q_bow = make_bow(q_emb)
    q_ss = softsign(q_bow)
    q_fc = fc_layer(input=q_ss, size=128, act=linear, param_attr=ParamAttr(name='_qfc.w', learning_rate=2e-3))

    pos_data = data_layer('pos_data', dict_dim)
    pos_emb = make_emb(pos_data, emb_dim, 'pairwise')
    pos_bow = make_bow(pos_emb)
    pos_ss = softsign(pos_bow)
    pos_fc = fc_layer(input=pos_ss, size=128, act=linear, param_attr=ParamAttr(name='_tfc.w', learning_rate=2e-3))

    pos_score = fc_layer(input=[q_fc, pos_fc], size=1, param_attr=ParamAttr(name='_score_fc.w', learning_rate=2e-3))

    if is_predict:    
        outputs(pos_score)
    else:
        neg_data = data_layer('neg_data', dict_dim)
        neg_emb = make_emb(neg_data, emb_dim, 'pairwise')
        neg_bow = make_bow(neg_emb)
        neg_ss = softsign(neg_bow)
        neg_fc = fc_layer(input=neg_ss, size=128, act=linear, param_attr=ParamAttr(name='_tfc.w', learning_rate=2e-3))

        neg_score = fc_layer(input=[q_fc, neg_fc], size=1, param_attr=ParamAttr(name='_score_fc.w', learning_rate=2e-3))

        label = data_layer('label', 1)
        outputs(rank_cost(left=pos_score, right=neg_score, label=label))
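
For background on what this pairwise head optimizes: as I understand it (the Paddle documentation is authoritative here), rank_cost is the RankNet cross-entropy on the score difference o = left - right, with label P in {0, 0.5, 1}. A minimal NumPy sketch of that formula:

import numpy as np

def ranknet_cost(left, right, label):
    # RankNet cross-entropy: -P*o + log(1 + exp(o)), with o = left - right.
    # logaddexp(0, o) computes log(1 + exp(o)) without overflow.
    o = left - right
    return -label * o + np.logaddexp(0.0, o)

print(ranknet_cost(0.0, 0.0, 1.0))   # ~0.693 = log 2: an untrained model
print(ranknet_cost(2.0, -1.0, 1.0))  # small cost: pair ordered correctly
print(ranknet_cost(-1.0, 2.0, 1.0))  # large cost: pair inverted

This also explains the AvgCost ≈ 0.69 at the start of the single-machine log further down in this thread: an untrained model scores both sides roughly equally, giving about log 2 per pair.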

Training configuration:

import time
time_output = time.strftime("%Y%m%d%H%M%S",time.localtime())
cluster_config(
    fs_name="hdfs://xxx",
    fs_ugi="abc,abc",
    output_path=xxx,
    train_data_path=xxx,
    test_data_path=xxx,
    save_dir='output',
    ports_num=8,
    use_remote_sparse=1,
    port=7900,
    has_meta_data=False,
)

from dssm_net import *
from paddle.trainer_config_helpers import *

data_dir = "./"
# whether this config is used for test
is_test = False
# whether this config is used for prediction
is_predict = False

dict_dim = dssm_data(data_dir, is_test, is_predict,
                     train_list='train.list', test_list='test.list',
                     dict_file='75_53_27_9-11_wise_clean.dict')

################## Algorithm Config #####################

settings(
    batch_size=128, learning_rate=1e-5,
    learning_method=AdaGradOptimizer(),
    gradient_clipping_threshold=15,
    regularization=L2Regularization(1e-7))

#################### Network Config ######################
dssm_net(1550000)

Training command:

paddle cluster_train \
  --config=test/dssm/cluster_job_config/job_config.py \
  --use_gpu=cpu \
  --num_nodes=100 \
  --num_passes=150 \
  --log_period=999999999 \
  --dot_period=1000 \
  --trainer_count=16 \
  --saving_period=1 \
  --thirdparty=./test/dssm/thirdparty \
  --job_name=nlp_dssm_bow_sync \
  --time_limit=700:00:00 \
  --submitter=huangwenzhi \
  --config_args=is_local=0 \
  --where=xxx
20092136 commented 6 years ago

I1213 16:58:21.991075 31368 CostLayer.cpp:337] calc pos/neg: 40.1701 pos= 1.74596e+06 neg= 43464

I1213 17:47:35.715744 31368 CostLayer.cpp:337] calc pos/neg: 89566.7 pos= 1.79133e+06 neg= 20

Additional information: when each node is switched to a single trainer, the metrics are printed, but the pos/neg values look wrong.
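
For context on the metric itself: my reading of the "calc pos/neg" line from CostLayer.cpp (a hedged sketch, not the framework source) is that pos counts pairs the model orders correctly within the pass, neg counts inverted pairs, and the leading number is their ratio:

def pos_neg_ratio(pairs):
    # pairs: iterable of (pos_score, neg_score) per training pair.
    pos = sum(1 for l, r in pairs if l > r)  # correctly ordered pairs
    neg = sum(1 for l, r in pairs if l < r)  # inverted pairs
    return (float(pos) / neg if neg else float('inf')), pos, neg

print(pos_neg_ratio([(0.9, 0.1), (0.7, 0.8), (0.5, 0.2)]))  # (2.0, 2, 1)

On this reading, pos=0 together with neg=0 means either that no pairs were counted at all or that every pair received exactly equal scores; AvgCost=0 in the same log points to the former, since even equal scores would give a rank_cost of about log 2 ≈ 0.693 per pair, not 0.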

lcy-seso commented 6 years ago

Hi, I'd like to ask a few questions first.

  1. You mentioned above:

    "With the same configuration, a single-machine run prints the metrics normally." Could you provide a normal log, so we can see what the "normal" you expect looks like?

  2. The error log you provided is already at pass 2: does pos/neg=0 appear from the very start of training (the first batch), or does it gradually drop to 0?

20092136 commented 6 years ago

I1215 14:22:11.560315 24762 Util.cpp:166] commandline: /home/huangwenzhi/paddle/usr/local/bin/../opt/paddle/bin/paddle_trainer --config=./job_config.py --use_gpu=false --num_passes=20 --log_period=200 --test_period=5000 --dot_period=20 --trainer_count=4 --saving_period=1 --save_dir=./modelout --config_args=is_local=1
[INFO 2017-12-15 14:22:12,054 networks.py:1459] The input order is [query, pos_data, neg_data, label]
[INFO 2017-12-15 14:22:12,054 networks.py:1465] The output order is [rank_cost_0]
I1215 14:22:12.059198 24762 Trainer.cpp:114] ignore sparse_remote_update=true due to --local=true
I1215 14:22:12.059231 24762 Trainer.cpp:162] trainer mode: SgdSparseCpuTraining
I1215 14:22:13.091187 24762 PyDataProvider2.cpp:243] loading dataprovider dataprovider::process
[INFO 2017-12-15 14:22:16,707 dataprovider.py:53] dict len : 1549030
I1215 14:22:16.707571 24762 GradientMachine.cpp:85] Initing parameters..
I1215 14:22:33.421447 24762 GradientMachine.cpp:92] Init parameters done.
.........
I1215 14:22:48.112035 24762 TrainerInternal.cpp:165] Batch=200 samples=25600 AvgCost=0.691248 CurrentCost=0.691248 Eval: CurrentEval:
.........
I1215 14:22:59.665647 24762 TrainerInternal.cpp:165] Batch=400 samples=51200 AvgCost=0.690424 CurrentCost=0.689601 Eval: CurrentEval:
.........
I1215 14:23:11.698137 24762 TrainerInternal.cpp:165] Batch=600 samples=76800 AvgCost=0.689806 CurrentCost=0.68857 Eval: CurrentEval:
.........
I1215 14:23:23.848870 24762 TrainerInternal.cpp:165] Batch=800 samples=102400 AvgCost=0.689285 CurrentCost=0.687722 Eval: CurrentEval:
.........
I1215 14:23:35.585708 24762 TrainerInternal.cpp:165] Batch=1000 samples=128000 AvgCost=0.688825 CurrentCost=0.686986 Eval: CurrentEval:
.........
I1215 14:23:47.754395 24762 TrainerInternal.cpp:165] Batch=1200 samples=153600 AvgCost=0.688416 CurrentCost=0.686369 Eval: CurrentEval:
..
I1215 14:23:50.455447 24762 CostLayer.cpp:337] calc pos/neg: 4.46556 pos= 32612 neg= 7303
I1215 14:23:50.455499 24762 CostLayer.cpp:337] calc pos/neg: 4.4762 pos= 32627 neg= 7289
I1215 14:23:50.455509 24762 CostLayer.cpp:337] calc pos/neg: 4.56321 pos= 32741 neg= 7175
I1215 14:23:50.455518 24762 CostLayer.cpp:337] calc pos/neg: 4.56011 pos= 32737 neg= 7179
I1215 14:23:50.920789 24762 TrainerInternal.cpp:181] Pass=0 Batch=1248 samples=159663 AvgCost=0.68833 Eval:
I1215 14:23:50.921814 24762 GradientMachine.cpp:63] Saving parameters to ./modelout/pass-00000
.........
I1215 14:24:05.570590 24762 TrainerInternal.cpp:165] Batch=200 samples=25600 AvgCost=0.685831 CurrentCost=0.685831 Eval: CurrentEval:
.........
I1215 14:24:17.453933 24762 TrainerInternal.cpp:165] Batch=400 samples=51200 AvgCost=0.685475 CurrentCost=0.685119 Eval: CurrentEval:
.........
I1215 14:24:29.983072 24762 TrainerInternal.cpp:165] Batch=600 samples=76800 AvgCost=0.685187 CurrentCost=0.684611 Eval: CurrentEval:
.........
I1215 14:24:41.913007 24762 TrainerInternal.cpp:165] Batch=800 samples=102400 AvgCost=0.684927 CurrentCost=0.684146 Eval: CurrentEval:
.........
I1215 14:24:53.798105 24762 TrainerInternal.cpp:165] Batch=1000 samples=128000 AvgCost=0.68468 CurrentCost=0.683694 Eval: CurrentEval:
.........
I1215 14:25:06.505194 24762 TrainerInternal.cpp:165] Batch=1200 samples=153600 AvgCost=0.684446 CurrentCost=0.683273 Eval: CurrentEval:
..
I1215 14:25:08.867113 24762 CostLayer.cpp:337] calc pos/neg: 18.9514 pos= 37846 neg= 1997
I1215 14:25:08.867177 24762 CostLayer.cpp:337] calc pos/neg: 17.8924 pos= 37735 neg= 2109
I1215 14:25:08.867188 24762 CostLayer.cpp:337] calc pos/neg: 17.9278 pos= 37738 neg= 2105
I1215 14:25:08.867197 24762 CostLayer.cpp:337] calc pos/neg: 18.541 pos= 37805 neg= 2039
I1215 14:25:09.204318 24762 TrainerInternal.cpp:181] Pass=1 Batch=1246 samples=159374 AvgCost=0.684394 Eval:
I1215 14:25:09.205173 24762 GradientMachine.cpp:63] Saving parameters to ./modelout/pass-00001

  1. The above is the output of single-machine CPU training with 4 trainers; by "normal" I mean that pos/neg and cost all take non-zero values.
  2. For the cluster training, I configured each pass to print the metrics only once, when the pass finishes; starting from pass 0, pos/neg=0.
20092136 commented 6 years ago

One more piece of information: the full dataset has 10000 parts, and I have previously trained this model on the same data with another in-house platform: same batch_size, same lr, an almost identical network structure, and the same sample construction and sampling method, except that:

  1. Paddle's fc + rank_cost took the place of the other platform's cosine + hinge loss.
  2. The softsign pieced together with layer_math (this version of Paddle has no native one) took the place of the other platform's softsign.

In the model trained there, pos/neg grew gradually from 1.5 in the first pass to 3.0 after 150 passes; yet when Paddle trains on a single machine with one thousandth of the full data (10 parts), pos/neg already reaches 4.x and 17.x in the first two passes.

I am not sure whether the reduced data volume is the key to the problem; I just feel this growth rate is a bit unreasonable, so I wanted to share this information.

lcy-seso commented 6 years ago

May I ask:

    For the cluster training, I configured each pass to print the metrics only once, when the pass finishes; starting from pass 0, pos/neg=0.

In that case, is the cost decreasing normally? Does training converge normally?

20092136 commented 6 years ago

I1213 13:14:55.372691 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.372946 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.372975 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.372998 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373028 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373044 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373073 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373095 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373116 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373144 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373172 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373212 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373232 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373253 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373273 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:55.373294 30749 CostLayer.cpp:337] calc pos/neg: 0 pos= 0 neg= 0
I1213 13:14:57.311453 30749 TrainerInternal.cpp:181] Pass=0 Batch=13994 samples=1791175 AvgCost=0 Eval:

The output at pass 0 is shown above.

lcy-seso commented 6 years ago

Hi, your description reminds me of an earlier experience of mine: for work I once had to align Paddle with another in-house training platform on a pairwise ranking task. At the time, Paddle implemented neither softsign (it does now) nor hinge loss.

https://github.com/lcy-seso/paddle_confs_v1/blob/master/ltr/pairwise_ltr.conf is the configuration and the hyperparameters left over from that task (its results were nearly identical to the softsign + hinge loss model). The hyperparameters in that config are for a single machine with 4 GPUs; for cluster training, the batch size and learning rate may need to be re-tuned. For reference.
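
For readers comparing the two formulations, here is a minimal NumPy sketch of the cosine-similarity + pairwise hinge loss that the other setup used (illustrative only; the function names and the margin value are my assumptions, not taken from the linked config):

import numpy as np

def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def pairwise_hinge(q, pos, neg, margin=0.1):
    # Zero loss once the positive beats the negative by at least
    # `margin` in cosine similarity to the query; linear penalty otherwise.
    return max(0.0, margin - cosine(q, pos) + cosine(q, neg))

q, p, n = np.random.rand(3, 128)  # toy 128-d embeddings
print(pairwise_hinge(q, p, n))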

lcy-seso commented 6 years ago

Both positives and negatives being 0 is indeed very strange; let us think about it some more.

lcy-seso commented 6 years ago

    In the model trained there, pos/neg grew gradually from 1.5 in the first pass to 3.0 after 150 passes; yet when Paddle trains on a single machine with one thousandth of the full data (10 parts), pos/neg already reaches 4.x and 17.x in the first two passes.

I am not sure whether this refers to the metrics directly output by the two platforms? I believe the gap comes from differences in how each platform computes the metric; I have run into this before. What we used in the end was to evaluate offline on the same test set, with the same metric.
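
A minimal sketch of that offline protocol (illustrative only; the file names and the tab-separated "pos_score, neg_score" format are assumptions, not anything from this thread): score the same test pairs with each platform's model, then compute one shared metric, e.g. pairwise accuracy.

def pairwise_accuracy(score_file):
    # Fraction of test pairs where the positive outscores the negative.
    # Assumed (hypothetical) format: one "pos_score\tneg_score" line per pair.
    correct = total = 0
    with open(score_file) as f:
        for line in f:
            pos, neg = map(float, line.split('\t'))
            correct += pos > neg
            total += 1
    return float(correct) / total

# Same test set scored by both platforms, compared on the same metric:
# print(pairwise_accuracy('paddle_scores.tsv'))
# print(pairwise_accuracy('other_platform_scores.tsv'))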

20092136 commented 6 years ago

@lcy-seso The information you provided is very helpful to me; thank you very much. I will try the evaluation approach you described.