PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (the 『飞桨』 (PaddlePaddle) core framework: high-performance single-machine and distributed training and cross-platform deployment for deep learning & machine learning)
http://www.paddlepaddle.org/
Apache License 2.0

TypeError: reduce() of empty sequence with no initial value #8734

Closed JessieMeng closed 6 years ago

JessieMeng commented 6 years ago
import os, sys
import gzip
import paddle.v2 as paddle
import numpy as np
import functools
import argparse
from data_input import *

def lambda_rank(title_input_dim, label_input_dim):
    """
    lambda_rank is a listwise ranking model; the input data and label must be sequences.
    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf

    Parameters:
      title_input_dim: dictionary size of the title id feature
      label_input_dim: dictionary size of the label id feature

    Format of the dense_vector_sequence:
    [[f, ...], [f, ...], ...], where f is a float or an int number.
    """
    label = paddle.layer.data("label",
                              paddle.data_type.dense_vector_sequence(1))

    title_data = paddle.layer.data("title_data",
                                   paddle.data_type.integer_value(title_input_dim))
    title_emb = paddle.layer.embedding(input=title_data, size=256)

    label_data = paddle.layer.data("label_data",
                                   paddle.data_type.integer_value(label_input_dim))
    label_emb = paddle.layer.embedding(input=label_data, size=256)

    combine_data = paddle.layer.fc(
        input=[title_emb, label_emb],
        size=128,
        act=paddle.activation.Relu())
    # hidden layer
    title_hd1 = paddle.layer.fc(
        input=combine_data,
        size=10,
        act=paddle.activation.Tanh())
    output = paddle.layer.fc(
        input=title_hd1,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    # evaluator
    evaluator = paddle.evaluator.auc(input=output, label=label)
    # cost layer
    cost = paddle.layer.lambda_cost(
        input=output, score=label, NDCG_num=6, max_sort_size=-1)
    return cost, output

def train_lambda_rank(num_passes):
    # listwise input sequence
    fill_default_train = functools.partial(train)
    fill_default_test = functools.partial(test)
    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=1000), batch_size=32)
    test_reader = paddle.batch(fill_default_test, batch_size=32)
    # dictionary sizes of the integer id features
    # (the original mq2007 demo used input_dim = 46, dense format)
    title_input_dim = 358561
    label_input_dim = 6116
    cost, output = lambda_rank(title_input_dim, label_input_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    #  Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
                                                  event.cost)
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
            with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    feeding = {"label": 0, "title_data": 1, "label_data": 2}
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(train(), buf_size=100), batch_size=32),
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)

def lambda_rank_infer(pass_id):
    """
  lambda_rank model inference interface
  parameters:
    pass_id : inference model in pass_id
  """
    print "Begin to Infer..."
    input_dim = 46
    output = lambda_rank(input_dim)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1)))

    infer_query_id = None
    infer_data = []
    infer_data_num = 1
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    for label, querylist in fill_default_test():
        infer_data.append(querylist)
        if len(infer_data) == infer_data_num:
            break

    # Predict a score for every document in infer_data, then re-sort the documents
    # by predicted score in descending order to build the final ranking.
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print i, score
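
    # Sketch only (not part of the original script): the comment above describes
    # re-sorting by predicted score in descending order; assuming each entry of
    # `predictions` is the score of one document, the ranking could be built as:
    ranked = sorted(enumerate(predictions), key=lambda pair: pair[1], reverse=True)
    print "ranking (doc index, score):", ranked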

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='LambdaRank demo')
    parser.add_argument("--run_type", type=str, help="run type is train|infer")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="number of passes for training, or the pass number of the model to use for inference")
    args = parser.parse_args()
    paddle.init(use_gpu=False, trainer_count=1)
    if args.run_type == "train":
        train_lambda_rank(args.num_passes)
    elif args.run_type == "infer":
        lambda_rank_infer(pass_id=args.num_passes - 1)
# data_input.py (presumably the module imported above via "from data_input import *")
# -*- coding: utf-8 -*-
import sys
import random
import functools
__all__=['train','test']

def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
    rand = random.Random(x=rand_seed)
    with open("data_seg_id_result") as f:
        for line in f:
            if (rand.random() < test_ratio) == is_test:
                tokens = line.strip().split("\t")
                print "all:",len(tokens)
                score = int(tokens[0])
                words = tokens[1].split(" ")
                new_word_list=[]
                print "word:",len(words)
                for word in words:
                    if word=='':
                        continue
                    new_word_list.append(int(word))
                label = int(tokens[2])
                # print [[score]] + [new_word_list] + [label]
                yield [[score]] + [new_word_list] + [label]

def __reader_creator__(**kwargs):
    return lambda: __reader__(**kwargs)
train = functools.partial(__reader_creator__, is_test=False)
test = functools.partial(__reader_creator__, is_test=True)
JessieMeng commented 6 years ago

Error message:

Traceback (most recent call last):
  File "lambda_rank.py", line 130, in <module>
    train_lambda_rank(args.num_passes)
  File "lambda_rank.py", line 87, in train_lambda_rank
    num_passes=num_passes)
  File "/home/disk3/mengziyang/paddle/python27/lib/python2.7/site-packages/paddle/v2/trainer.py", line 153, in train
    in_args = feeder(data_batch)
  File "/home/disk3/mengziyang/paddle/python27/lib/python2.7/site-packages/py_paddle/dataprovider_converter.py", line 278, in __call__
    return self.convert(dat, argument)
  File "/home/disk3/mengziyang/paddle/python27/lib/python2.7/site-packages/paddle/v2/data_feeder.py", line 134, in convert
    return DataProviderConverter.convert(self, reorder_data(dat), argument)
  File "/home/disk3/mengziyang/paddle/python27/lib/python2.7/site-packages/py_paddle/dataprovider_converter.py", line 263, in convert
    scanner.pre_scan(each_step)
  File "/home/disk3/mengziyang/paddle/python27/lib/python2.7/site-packages/py_paddle/dataprovider_converter.py", line 221, in pre_scan
    self.__inner_scanner.pre_scan(each)
  File "/home/disk3/mengziyang/paddle/python27/lib/python2.7/site-packages/py_paddle/dataprovider_converter.py", line 112, in pre_scan
    self.__dim__ = reduce(lambda x, y: x * y, self.__shape__)
TypeError: reduce() of empty sequence with no initial value
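
For reference, the TypeError itself is simply what Python 2's built-in reduce() raises when it is handed an empty sequence and no initial value; a minimal reproduction, independent of Paddle, is:

# Minimal sketch reproducing the error outside PaddlePaddle (Python 2, where reduce is a builtin).
shape = ()                                # an empty shape tuple
dim = reduce(lambda x, y: x * y, shape)   # TypeError: reduce() of empty sequence with no initial value

In the failing frame the sequence being reduced is the shape that the data scanner computed for one fed field, so an empty shape suggests a plain scalar was fed where the declared paddle.data_type expects a vector or a sequence of vectors.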