PaddlePaddle / Paddle

PArallel Distributed Deep LEarning: Machine Learning Framework from Industrial Practice (『飞桨』 core framework: high-performance single-machine and distributed deep-learning and machine-learning training, with cross-platform deployment)
http://www.paddlepaddle.org/
Apache License 2.0

Error at prediction time: size != 0 #4307

Closed JessieMeng closed 6 years ago

JessieMeng commented 7 years ago
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Movielens 1-M dataset.

Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
movies, which was collected by GroupLens Research. This module will download
Movielens 1-M dataset from 
http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
set and test set into paddle reader creators.

"""
import time
import zipfile
import paddle.v2.dataset.common
import re
import random
import functools
import numpy as np
__all__ = [
    'train', 'test', 'movie_size', 'max_movie_id', 'user_size', 'MOVIE_INFO',
    'Category_dict', 'USER_DICT', 'convert', 'predict_test', 'predict_label',
    'id_size', 'USER_FEATURE_DICT'
]

class MOVIEinfo(object):
    def __init__(self,movie_id,category):
        self.movie_id = movie_id
        if category =="":
            self.category = "null"
        else:
            self.category = category
        '''
        self.titles=[]
        if len(titles)==1:
            self.titles=[]
        else:
            for title in titles:
                title1=int(title)
                self.titles.append(title1)
        self.pic_words = pic_words
        self.articals = articals
        '''
    def value(self):
        '''
        pic_list=[]
        for pic_key in self.pic_words:
            if PIC_ATTENT_DICT.has_key(pic_key):
                pic_list.append(PIC_ATTENT_DICT[pic_key])
        artical_attent_list = []

        for artical_attent_key in self.articals:
            if ARTICAL_ATTENT_DICT.has_key(artical_attent_key):
                artical_attent_list.append(ARTICAL_ATTENT_DICT[artical_attent_key])
        '''
        return [self.movie_id, Category_dict[self.category]]

MOVIE_INFO=None
USER_DICT=None
Category_dict=None
ID_FEA_dict=None
PIC_ATTENT_DICT=None
ARTICAL_ATTENT_DICT=None
USER_FEATURE_DICT=None

def __initialize_meta_info__():
        global Category_dict
        global MOVIE_INFO
        global USER_DICT
        global ID_FEA_dict
        global PIC_ATTENT_DICT
        global ARTICAL_ATTENT_DICT
        global USER_FEATURE_DICT
        if MOVIE_INFO is None:
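            # Lazily load metadata once per process: movie ids and categories,
            # user ids, and article-attention word ids from local train_2/ files.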
            pattern = re.compile(r'\d+')
            MOVIE_INFO = dict()
            movie_set = set()
            user_set = set()
            Category_dict = dict()
            user_feature_set = set()
            with open('train_2/data_2.set') as movie_file:
                for i, line in enumerate(movie_file):
                    movie_ids= line.strip().split('\t')
                    movie_id = movie_ids[0]
                    category = movie_ids[1]
                    '''
                    #titles = movie_ids[3].split("\001")
                    #picture_num = int(movie_ids[4])
                    #picture_clear = float(movie_ids[5])
                    #picture_beauty = float(movie_ids[6])
                    pic_words=[]
                    if movie_ids[7]=="":
                        pic_words = []
                    else:
                        pic_attents = movie_ids[7].split(";")
                        for pic_key in pic_attents:
                            pic_word,pic_score = pic_key.split(":")
                            pic_words.append(pic_word)
                    if movie_ids[8]=="":
                        articals = []
                    else:
                        articals = movie_ids[8].split(",")
                    '''
                    if category=="":
                        category="null"
                    movie = int(pattern.search(movie_id).group())
                    movie_set.add(movie)
                    MOVIE_INFO[movie]= MOVIEinfo(movie_id=movie,category=category)
                    Category_dict[category]=0
            for i,w in enumerate(Category_dict):
                Category_dict[w] = i
            with open('train_2/user.set') as user_file:
                for line in user_file:
                    uids= line.strip().split("\t")
                    user_set.add(uids[0])
            USER_DICT = dict()
            for i,w in enumerate(user_set):
                USER_DICT[w] = i
            ID_FEA_dict=dict()
            '''
            with open("train/id_fea_map") as id_file:
                for line in id_file:
                    ids=line.strip().split("\t")
                    ID_FEA_dict[ids[1]]=ids[0]
            PIC_ATTENT_DICT=dict()
            with open("train/pic_attent.txt") as pic_file:
                for line in pic_file:
                    picid,picword = line.strip().split("\t")
                    PIC_ATTENT_DICT[picword]=int(picid)
            '''
            ARTICAL_ATTENT_DICT=dict()
            with open("train_2/artical_attent.txt") as artical_file:
                for line in artical_file:
                    artical_id,artical_word = line.strip().split("\t")
                    ARTICAL_ATTENT_DICT[artical_word] = int(artical_id)

def movie_size():
    __initialize_meta_info__()
    return len(MOVIE_INFO)

def user_size():
    __initialize_meta_info__()
    return len(USER_DICT)

def __max_index_info__(a, b):
    if a.movie_id > b.movie_id:
        return a
    else:
        return b

def max_movie_id():
    __initialize_meta_info__()
    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).movie_id

def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
    __initialize_meta_info__()
    rand = random.Random(x=rand_seed)
    with open('train_2/train_user_feature_2.set') as rating:
        for line in rating:
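            # (rand.random() < test_ratio) == is_test routes ~test_ratio of the
            # lines to the test reader and the rest to training; the shared
            # rand_seed keeps the two splits disjoint and reproducible.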
            if (rand.random() < test_ratio) == is_test:
                #t1 = time.time()
                pattern = re.compile(r'\d+')
                #print "movielens start time:"+str(t1)
                tokens = line.strip().split("\t")
                uid = tokens[0]
                mov_id = tokens[1]
                rate = tokens[3]
                word_list = []
                for word_feature in tokens[4:]:
                    if word_feature=="":
                        continue
                    word_list.append(ARTICAL_ATTENT_DICT[word_feature])
                #uid,mov_id,times,rate = line.strip().split("\t")
                mov_id = int(pattern.search(mov_id).group())
                rate = int(rate)
                #print [USER_DICT[uid]]+[word_list]+MOVIE_INFO[mov_id].value()+[[rate]]             
                yield [USER_DICT[uid]]+[word_list]+MOVIE_INFO[mov_id].value()+[[rate]]
                #t2 = time.time()

def __reader_creator__(**kwargs):
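    # Return a zero-argument callable so each pass re-opens and re-scans the
    # data file (paddle.v2's reader-creator convention).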
    return lambda: __reader__(**kwargs)

def predict_test():
    __initialize_meta_info__()
    test_data=[]
    pattern = re.compile(r'\d+')
    with open('test_2.set') as test:
        for line in test:
            tokens =line.strip().split("\t")
            userid = tokens[0]
            movieid = tokens[1]
            word_list = []
            for word_feature in tokens[2:]:
                if word_feature == "":
                    continue
                word_list.append(ARTICAL_ATTENT_DICT[word_feature])
            if not word_list:
                # integer_value_sequence input must not be empty; pad with id 0
                word_list.append(0)
            mov_id=int(pattern.search(movieid).group())
            test_data.append([USER_DICT[userid]]+[word_list]+MOVIE_INFO[mov_id].value())
    return test_data

def predict_label():
    __initialize_meta_info__()
    test_data=[]
    with open('test_2.set') as test:
        for line in test:
            tokens =line.strip().split("\t")
            user_id = tokens[0]
            movie_id = tokens[1]
            test_data.append(user_id + "\t" + movie_id)
    print "len test_data:"+str(len(test_data))
    return test_data    

train = functools.partial(__reader_creator__,is_test=False)
test = functools.partial(__reader_creator__,is_test=True)

def id_size():
    __initialize_meta_info__()
    return len(ID_FEA_dict)

def convert(path):
    paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")

def unittest():
    for train_count, _ in enumerate(train()()):
        pass
    print train_count

if __name__ == '__main__':
    unittest()
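
# ---- Second script: network definition, training, and inference; it imports
# ---- the reader module above (presumably saved as movielens.py). ----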
import paddle.v2 as paddle
import cPickle
import copy
from movielens import *
import time
import numpy as np
import random
fout = open("all.test.result","w")
def AUC(records):
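    # records are (label, score) pairs; sort by predicted score descending and
    # accumulate trapezoid areas under the ROC curve, normalized by
    # (positives * negatives); e.g. AUC([(1, 0.9), (0, 0.8), (1, 0.7), (0, 0.3)]) == 0.75.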
    records = sorted(records, key = lambda x: x[1], reverse = True)
    auc = 0.0
    fp1, tp1, fp2, tp2 = 0.0, 0.0, 0.0, 0.0
    for record in records:
        fp2 += (1-record[0])   #noclick
        tp2 += record[0]   #click
        auc += (fp2 - fp1) * (tp2 + tp1)
        fp1, tp1 = fp2, tp2
    return auc / (2.0 * tp2 * fp2) if tp2 * fp2 > 0.0 else 0.5  

def get_mov_combined_features():
    #movie_title_dict = get_movie_title_dict()

    uid = paddle.layer.data(
        name='user_id',
        type=paddle.data_type.integer_value(
            user_size() + 1))
    usr_emb = paddle.layer.embedding(input=uid, size=32)

    mov_id = paddle.layer.data(
            name='movie_id',
            type=paddle.data_type.integer_value(
            max_movie_id()+ 1))
    mov_emb = paddle.layer.embedding(input=mov_id, size=32)
    #mov_fc = paddle.layer.fc(input=mov_emb, size=32)

    mov_category = paddle.layer.data(
        name='category',
        type=paddle.data_type.integer_value(3))
    mov_cate_emb = paddle.layer.embedding(input=mov_category,size=32)
    #mov_categories_hidden = paddle.layer.fc(input=mov_cate_emb, size=32)
    '''
    mov_title_id = paddle.layer.data(
        name='movie_title',
        type=paddle.data_type.integer_value_sequence(id_size()+1))
    mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
    #mov_title_conv = paddle.networks.sequence_conv_pool(
    #   input=mov_title_emb, hidden_size=32, context_len=3)
    mov_seq_pool = paddle.layer.pooling(input=mov_title_emb,pooling_type=paddle.pooling.Sum())

    mov_pic_attent_id = paddle.layer.data(
        name='movie_pic_attent',
        type=paddle.data_type.integer_value_sequence(73621))
    mov_pic_attent_emb = paddle.layer.embedding(input=mov_pic_attent_id, size=32)
    mov_pic_attent_pool = paddle.layer.pooling(input=mov_pic_attent_emb,pooling_type=paddle.pooling.Sum())

    mov_artical_attent_id = paddle.layer.data(
        name='movie_artical_attent',
        type=paddle.data_type.integer_value_sequence(186671))
    mov_artical_attent_emb = paddle.layer.embedding(input=mov_artical_attent_id, size=32)   
    mov_artical_attent_pool = paddle.layer.pooling(input=mov_artical_attent_emb,pooling_type=paddle.pooling.Sum())
    '''
    user_word_feature_id = paddle.layer.data(
        name='user_word_feature',
        type=paddle.data_type.integer_value_sequence(3233513))
    user_word_feature_emb = paddle.layer.embedding(input=user_word_feature_id, size=32) 
    user_word_feature_pool = paddle.layer.pooling(input=user_word_feature_emb,pooling_type=paddle.pooling.Sum())

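    # Concatenate the user embedding, pooled user-word features, movie
    # embedding, and category embedding; project to a 200-d ReLU layer.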
    mov_combined_features = paddle.layer.fc(
        input=[usr_emb,user_word_feature_pool,mov_emb,mov_cate_emb],
        size=200,
        act=paddle.activation.Relu())
    return mov_combined_features
    #return [mov_emb,mov_cate_emb,mov_title_emb]

def main():
    paddle.init(use_gpu=False)
    #usr_combined_features = get_usr_combined_features()
    mov_combined_features = get_mov_combined_features()
    layer1 = paddle.layer.fc(input=mov_combined_features,size=200, act=paddle.activation.Relu())
    #layer2 = paddle.layer.fc(input=layer1,size=200, act=paddle.activation.Relu())  
    #layer3 = paddle.layer.fc(input=layer2,size=200, act=paddle.activation.Relu())
    #layer4 = paddle.layer.fc(input=layer3,size=200, act=paddle.activation.Relu())
    #layer5 = paddle.layer.fc(input=layer4,size=200, act=paddle.activation.Relu())
    #layer6 = paddle.layer.fc(input=layer5,size=200, act=paddle.activation.Relu())
    #layer7 = paddle.layer.fc(input=layer6,size=200, act=paddle.activation.Relu())
    output = paddle.layer.fc(input=layer1,size=1,act=paddle.activation.Sigmoid())

    #output = paddle.layer.fc(input=[usr_combined_features,mov_combined_features],size=1,act=paddle.activation.Sigmoid())
    #label_data = paddle.layer.data(name="score", type=paddle.data_type.integer_value(2))
    label_data = paddle.layer.data(name="score", type=paddle.data_type.dense_vector(1))
    print "haha"
    #cost = paddle.layer.cross_entropy_cost(
    #   input=output,
    #   label=label_data)
    cost = paddle.layer.multi_binary_label_cross_entropy_cost(
        input=output,
        label=label_data)

    #eval = paddle.evaluator.auc(
    #       input=output,
    #       label=label_data)

    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
    #   extra_layers=eval,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))
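    # feeding maps each data-layer name to its index in the sample tuple
    # yielded by the reader: [user_id, word_list, movie_id, category, [rate]].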
    feeding = {
        'user_id': 0,
        'user_word_feature':1,
        'movie_id': 2,
        'category': 3
        #'score': 4
    }

    infer_reader=paddle.batch(
            paddle.reader.buffered(
                test(), size=8192),
            batch_size=25600)

    ''' 
    with open('params_pass_0_1500.tar', 'r') as f:
        parameters2 = paddle.parameters.Parameters.from_tar(f)
    records=[]
    for id,batch in enumerate(infer_reader()):
        if id > 10 :
            break
        labels=[]
        probs = paddle.infer(output_layer=output,
                parameters=parameters2,
                feeding=feeding,
                input=batch)
        for index,record in enumerate(batch):
            labels.append(record[-1])
        for index in xrange(len(probs)):
            records.append([labels[index][0],probs[index][0]])
    print "Test auc:%s" % (str(AUC(records)))

    pass_num=0
    while(pass_num<2):
        batch_num=0
        while(batch_num<7600):
            with open('params_pass_%d_%d.tar' % (pass_num,batch_num), 'r') as f:
                parameters2 = paddle.parameters.Parameters.from_tar(f)
            records=[]
            for id,batch in enumerate(infer_reader()):
                if id > 10 :
                    break
                labels=[]
                probs = paddle.infer(output_layer=output,
                        parameters=parameters2,
                        feeding=feeding,
                        input=batch)
                for index,record in enumerate(batch):
                    labels.append(record[-1])
                for index in xrange(len(probs)):
                    records.append([labels[index][0],probs[index][0]])
            print "params_pass_%d_%d,Test auc:%s" % (pass_num,batch_num,str(AUC(records)))
            batch_num+=500
        pass_num+=1
    '''
    #load parameter
    with open('params_pass_0_1500.tar', 'r') as f:
        parameters = paddle.parameters.Parameters.from_tar(f)
    probs = paddle.infer(output_layer=output,
            parameters=parameters,
            feeding=feeding,
            input=predict_test())
    print probs
    test_data = predict_label()
    for index in xrange(len(probs)):
        fout.write(test_data[index]+"\t"+str(probs[index][0])+"\n")
    fout.close()
if __name__ == '__main__':
    main()
JessieMeng commented 7 years ago

When I compute AUC it works, but prediction fails. I also added a check for empty values and it still errors; the only difference between the AUC run and the prediction run is the input.
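
The fatal check fires in IVector::create when it tries to allocate zero bytes,
which is consistent with an empty integer_value_sequence slot reaching the C++
side. A minimal pre-flight check over the inference input (a sketch, not part
of the scripts above; slot 1 follows the feeding dict in main()):

# Sketch: scan the inference samples for empty sequence slots before calling
# paddle.infer; an empty 'user_word_feature' list would trigger the
# "allocate 0 bytes" check in IVector::create.
for i, sample in enumerate(predict_test()):
    if len(sample[1]) == 0:
        print "sample %d has an empty user_word_feature list" % i

The full stack trace: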

F0921 20:23:53.614132 22409 MemoryHandle.cpp:49] Check failed: size != 0  allocate 0 bytes
*** Check failure stack trace: ***
    @     0x7f8b2e4e3e6d  google::LogMessage::Fail()
    @     0x7f8b2e4e791c  google::LogMessage::SendToLog()
    @     0x7f8b2e4e3993  google::LogMessage::Flush()
    @     0x7f8b2e4e8e2e  google::LogMessageFatal::~LogMessageFatal()
    @     0x7f8b2e44fb39  paddle::CpuMemoryHandle::CpuMemoryHandle()
    @     0x7f8b2e4256be  paddle::CpuVectorT<>::CpuVectorT()
    @     0x7f8b2e42639a  paddle::VectorT<>::create()
    @     0x7f8b2e4cce02  IVector::create()
    @     0x7f8b2e1cef87  _wrap_IVector_create
    @           0x4a9e33  PyEval_EvalFrameEx
    @           0x4ad70d  PyEval_EvalCodeEx
    @           0x4aa88c  PyEval_EvalFrameEx
    @           0x4aa9a7  PyEval_EvalFrameEx
    @           0x4aa9a7  PyEval_EvalFrameEx
    @           0x4ad70d  PyEval_EvalCodeEx
    @           0x51c1f0  function_call
    @           0x4243c3  PyObject_Call
    @           0x427b3d  instancemethod_call
    @           0x4243c3  PyObject_Call
    @           0x4a79f6  PyEval_EvalFrameEx
    @           0x4ad70d  PyEval_EvalCodeEx
    @           0x4aa88c  PyEval_EvalFrameEx
    @           0x4ad70d  PyEval_EvalCodeEx
    @           0x51c1f0  function_call
    @           0x4243c3  PyObject_Call
    @           0x427b3d  instancemethod_call
    @           0x4243c3  PyObject_Call
    @           0x479685  slot_tp_call
    @           0x4243c3  PyObject_Call
    @           0x4a79f6  PyEval_EvalFrameEx
    @           0x519db3  gen_iternext
    @           0x4a69dc  PyEval_EvalFrameEx
Aborted (core dumped)
luotao1 commented 7 years ago

@JessieMeng Please post just the relevant code or the part you are unsure about. The full configuration pasted here is too much information to locate the problem.

Yancey1989 commented 6 years ago

Closing this inactive issue.