PaddlePaddle / models

Officially maintained models supported by PaddlePaddle, covering CV, NLP, Speech, Rec, TS, large models, and more.
Apache License 2.0

deep_fm errors out when training on our own data #1352

Open · Ericzhuu opened this issue 6 years ago

Ericzhuu commented 6 years ago

Our data has already been preprocessed with /deep_fm/preprocess.py, but calling the train function fails; the error log is attached as a screenshot. The source code is as follows:

import paddle.v2 as paddle

factor_size = 5
batch_size = 128
num_passes = 10
dense_feature_dim = 30
sparse_feature_dim = 269

# feeding maps each data layer name to its index in the samples the reader yields.
feeding = {
    'dense_input': 0,
    'sparse_input': 1,
    'C1': 2,
    'C2': 3,
    'C3': 4,
    'C4': 5,
    'C5': 6,
    'C6': 7,
    'C7': 8,
    'C8': 9,
    'C9': 10,
    'C10': 11,
    'C11': 12,
    'C12': 13,
    'C13': 14,
    'C14': 15,
    'C15': 16,
    'C16': 17,
    'C17': 18,
    'C18': 19,
    'C19': 20,
    'C20': 21,
    'C21': 22,
    'C22': 23,
    'C23': 24,
    'C24': 25,
    'C25': 26,
    'C26': 27,
    'C27': 28,
    'C28': 29,
    'C29': 30,
    'label': 31
}

class Dataset:
    # Reader over the preprocessed file: each line is tab-separated into the
    # dense features, the sparse feature indices and (for training) the label.
    def _reader_creator(self, path, is_infer):
        def reader():
            with open(path, 'r') as f:
                for line in f:
                    features = line.rstrip('\n').split('\t')
                    dense_feature = map(float, features[0].split(','))
                    sparse_feature = map(int, features[1].split(','))
                    if not is_infer:
                        label = [int(features[2])]
                        yield [dense_feature, sparse_feature] + sparse_feature + [label]
                    else:
                        yield [dense_feature, sparse_feature] + sparse_feature

        return reader

    def train(self, path):
        return self._reader_creator(path, False)

    def test(self, path):
        return self._reader_creator(path, False)

# FM block: first-order linear term plus second-order factorization machine term.
def fm_layer(input, factor_size, fm_param_attr):
    first_order = paddle.layer.fc(input=input,
                        size=1,
                        act=paddle.activation.Linear())
    second_order = paddle.layer.factorization_machine(
        input=input,
        factor_size=factor_size,
        act=paddle.activation.Linear(),
        param_attr=fm_param_attr)
    out = paddle.layer.addto(
        input=[first_order, second_order],
        act=paddle.activation.Linear(),
        bias_attr=False)
    return out

# DeepFM: FM terms over the dense and sparse inputs plus a DNN over the
# embeddings of the 29 categorical id inputs, combined into a sigmoid output.
def DeepFM(factor_size, infer=False):
    dense_input = paddle.layer.data(
        name="dense_input",
        type=paddle.data_type.dense_vector(dense_feature_dim))
    sparse_input = paddle.layer.data(
        name="sparse_input",
        type=paddle.data_type.sparse_binary_vector(sparse_feature_dim))
    sparse_input_ids = [
        paddle.layer.data(
            name="C" + str(i),
            type=paddle.data_type.integer_value(sparse_feature_dim))
        for i in range(1, 30)
    ]
    dense_fm = fm_layer(
        dense_input,
        factor_size,
        fm_param_attr=paddle.attr.Param(name="DenseFeatFactors"))
    sparse_fm = fm_layer(
        sparse_input,
        factor_size,
        fm_param_attr=paddle.attr.Param(name="SparseFeatFactors"))
    def embedding_layer(input):
        return paddle.layer.embedding(
            input=input,
            size=factor_size,
            param_attr=paddle.attr.Param(name="SparseFeatFactors"))

    sparse_embed_seq = map(embedding_layer, sparse_input_ids)
    sparse_embed = paddle.layer.concat(sparse_embed_seq)
    fc1 = paddle.layer.fc(input=[sparse_embed, dense_input],
                          size=400,
                          act=paddle.activation.Relu())
    fc2 = paddle.layer.fc(input=fc1, size=400, act=paddle.activation.Relu())
    fc3 = paddle.layer.fc(input=fc2, size=400, act=paddle.activation.Relu())

    predict = paddle.layer.fc(input=[dense_fm, sparse_fm, fc3],
                              size=1,
                              act=paddle.activation.Sigmoid())
    if not infer:
        label = paddle.layer.data(
            name="label", type=paddle.data_type.integer_value(1))
        cost = paddle.layer.multi_binary_label_cross_entropy_cost(
            input=predict, label=label)
        paddle.evaluator.classification_error(
            name="classification_error", input=predict, label=label)
        paddle.evaluator.auc(name="auc", input=predict, label=label)
        return cost
    else:
        return predict

def train():

    paddle.init(use_gpu=False, trainer_count=1)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
    model = DeepFM(factor_size)
    params = paddle.parameters.create(model)
    trainer = paddle.trainer.SGD(cost=model, parameters=params, update_equation=optimizer)

    dataset = Dataset()

    def __event_handler__(event):
        if isinstance(event, paddle.event.EndIteration):
            num_samples = event.batch_id * batch_size
            if event.batch_id % 10 == 0:
                print "Pass %d, Batch %d, Samples %d, Cost %f, %s" % (event.pass_id, event.batch_id, num_samples,
                                event.cost, event.metrics)

            if event.batch_id % 100 == 0:
                result = trainer.test(
                    reader=paddle.batch(
                        dataset.test('data/valid.txt'),
                        batch_size=batch_size),
                    feeding=feeding)
                print "Test %d-%d, Cost %f, %s" % (event.pass_id, event.batch_id, result.cost,
                                    result.metrics)

                #path = "{}/model-pass-{}-batch-{}.tar.gz".format(
                #    model_output_dir, event.pass_id, event.batch_id)
                #with gzip.open(path, 'w') as f:
                #    trainer.save_parameter_to_tar(f)

    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                dataset.train('data/train.txt'),
                buf_size=batch_size * 10000),
            batch_size=batch_size),
        feeding=feeding,
        event_handler=__event_handler__, 
        num_passes=num_passes)

if __name__ == '__main__':
    train()
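
For reference, a quick sanity check of what the reader yields against the feeding dict above (a sketch only: it reuses Dataset, feeding and the dimension constants from the script, and assumes the same data/train.txt path):

# Sketch: pull one sample from the training reader and confirm that the number
# and shape of its slots line up with the feeding dict.
sample = Dataset().train('data/train.txt')().next()
assert len(sample) == len(feeding), \
    "reader yields %d slots but feeding expects %d" % (len(sample), len(feeding))
assert len(sample[feeding['dense_input']]) == dense_feature_dim
assert max(sample[feeding['sparse_input']]) < sparse_feature_dim
assert sample[feeding['label']][0] in (0, 1)
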
NHZlX commented 6 years ago

My initial judgment is that there is a problem with the training data input; I will take a look at this.

Ericzhuu commented 6 years ago

The format of our preprocessed data is as follows:

0,0.380042,0.110649,0.173158,0,0.166888,0,0.254955,0.042526,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.257143,0.166888,0.110649,0.257143   2,4,10,12,15,18,21,33,36,39,42,46,54,64,69,74,82,87,90,172,246,248,251,255,257,259,261,264,266  1
0,0.653928,0.110649,0.173158,0,0.166888,0,0.254955,0.056367,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.257143,0.166888,0.110649,0.257143   2,4,10,12,15,19,23,33,36,39,42,46,54,64,69,74,82,87,90,198,246,248,251,255,257,259,261,264,266  1
0,0.630573,0.097855,0.152105,0,0.147749,0,0.235603,0.034602,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.257143,0.147749,0.097855,0.257143   2,4,9,12,15,18,21,33,36,39,42,46,54,64,69,74,82,87,90,168,246,248,251,255,257,259,261,264,266   1
0.625,0.626327,0.094657,0.146842,0,0.142964,0,0.230766,0.027932,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.257143,0.142964,0.094657,0.257143   1,4,9,13,15,18,21,34,36,39,42,46,54,64,69,74,82,87,90,176,246,248,251,255,257,259,261,264,266   1
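
Each line is tab-separated into three fields: the comma-separated dense features (30 values, matching dense_feature_dim), the comma-separated sparse feature indices (29 values, matching C1 through C29, all below sparse_feature_dim = 269), and a 0/1 label. A minimal sketch for checking that every line of the preprocessed file matches these dimensions (the data/train.txt path is an assumption):

# Sketch: validate every line of the preprocessed file against the expected layout.
dense_dim, sparse_dim, num_ids = 30, 269, 29
with open('data/train.txt') as f:
    for lineno, line in enumerate(f, 1):
        fields = line.rstrip('\n').split('\t')
        if len(fields) != 3:
            print "line %d: expected 3 tab-separated fields, got %d" % (lineno, len(fields))
            continue
        dense = fields[0].split(',')
        sparse = [int(x) for x in fields[1].split(',')]
        if len(dense) != dense_dim:
            print "line %d: %d dense values, expected %d" % (lineno, len(dense), dense_dim)
        if len(sparse) != num_ids:
            print "line %d: %d sparse ids, expected %d" % (lineno, len(sparse), num_ids)
        if sparse and max(sparse) >= sparse_dim:
            print "line %d: sparse id %d out of range (>= %d)" % (lineno, max(sparse), sparse_dim)
        if fields[2].strip() not in ('0', '1'):
            print "line %d: label %r is not 0 or 1" % (lineno, fields[2])
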
Ericzhuu commented 6 years ago

@NHZlX Could you share the original training data for deep_fm? The download link has expired.