Vespa314 / bilibili-api

B站API收集整理及开发,不再维护
MIT License
1.34k stars 193 forks source link

评论抓取中:IndexError: list index out of range #17

Open TTstone opened 7 years ago

TTstone commented 7 years ago

本来以为是评论的多少问题,但是试了很少的评论好像也有问题诶。

报错信息: 请输入要拽取评论视频的av号 8037147 Traceback (most recent call last): File "/Users/XieHong/Google Drive/Python/Bili_Com_Grab/spitcomments.py", line 212, in allComment = GetAllComment(videoaid) File "/Users/XieHong/Google Drive/Python/Bili_Com_Grab/spitcomments.py", line 202, in GetAllComment while cl.comments[len(cl.comments) - 1].lv == None: IndexError: list index out of range

源代码在这里:

-- coding: utf-8 --

SPIT_LV = True SPIT_NAME = False SPIT_MID = True SPIT_MSG = True

-- coding: utf-8 --

""" Created on Mon May 26 23:59:09 2014

@author: Vespa """ import urllib2 import urllib import re import json import zlib import gzip import xml.dom.minidom import hashlib import time import sys import os import codecs import workerpool

class User(): def init(self,m_mid=None,m_name=None): if m_mid: self.mid = m_mid if m_name: if isinstance(m_name,unicode): m_name = m_name.encode('utf8') self.name = m_name

获取空间地址

def GetSpace(self):
    return 'http://space.bilibili.tv/'+str(self.mid)
mid = None
name = None
isApprove = None#是否是认证账号
spaceName = None
sex = None
rank = None
avatar = None
follow = None#关注好友数目
fans = None#粉丝数目
article = None#投稿数
place = None#所在地
description = None#认证用户为认证信息 普通用户为交友宣言
followlist = None#关注的好友列表
friend = None
DisplayRank = None

class Comment(): def init(self): self.post_user = User() lv = None#楼层 fbid = None#评论id msg = None ad_check = None#状态 (0: 正常 1: UP主隐藏 2: 管理员删除 3: 因举报删除) post_user = None

class CommentList(): def init(self): pass comments = None commentLen = None page = None

class JsonInfo(): def init(self,url): self.info = json.loads(getURLContent(url)) while self.info.has_key('code') and self.info['code'] != 0: time.sleep(0.01) self.info = json.loads(getURLContent(url)) print 'Entered!' if self.info.has_key('message'): print "【Error】code=%d, msg=%s, url=%s"%(self.info['code'],self.Getvalue('message'),url) elif self.info.has_key('error'): print "【Error】code=%d, msg=%s, url=%s"%(self.info['code'],self.Getvalue('error'),url) error = True error = False def Getvalue(self,*keys): if len(keys) == 0: return None if self.info.has_key(keys[0]): temp = self.info[keys[0]] else: return None if len(keys) > 1: for key in keys[1:]: if temp.has_key(key): temp = temp[key] else: return None if isinstance(temp,unicode): temp = temp.encode('utf8') return temp info = None error = False

def getURLContent(url): while True: flag = 1 try: headers = {'User-Agent':'Mozilla/5.0 (Windows U Windows NT 6.1 en-US rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} req = urllib2.Request(url = url,headers = headers) content = urllib2.urlopen(req).read() except: flag = 0 if flag == 1: break return content

def GetString(t): if type(t) == int: return str(t) return t

def GetComment(aid, page = None, pagesize = None, order = None): """ 输入: aid:AV号 page:页码 pagesize:单页返回的记录条数,最大不超过300,默认为10。 order:排序方式 默认按发布时间倒序 可选:good 按点赞人数排序 hot 按热门回复排序 返回: 评论列表 """ url = 'http://api.bilibili.cn/feedback?aid='+GetString(aid) if page: url += '&page='+GetString(page) if pagesize: url += '&pagesize='+GetString(pagesize) if order: url += '&order='+GetString(order) jsoninfo = JsonInfo(url) commentList = CommentList() commentList.comments = [Comment()] * pagesize commentList.commentLen = jsoninfo.Getvalue('totalResult') commentList.page = jsoninfo.Getvalue('pages') idx = 0 while jsoninfo.Getvalue(str(idx)): liuyan = Comment() liuyan.lv = jsoninfo.Getvalue(str(idx),'lv') liuyan.fbid = jsoninfo.Getvalue(str(idx),'fbid') liuyan.msg = jsoninfo.Getvalue(str(idx),'msg') liuyan.ad_check = jsoninfo.Getvalue(str(idx),'ad_check')

liuyan.post_user = GetUserInfoBymid(jsoninfo.Getvalue(str(idx),'mid'))

    liuyan.post_user.mid = jsoninfo.Getvalue(str(idx),'mid')
    '''
    liuyan.post_user.avatar = jsoninfo.Getvalue(str(idx),'face')
    liuyan.post_user.rank = jsoninfo.Getvalue(str(idx),'rank')
    liuyan.post_user.name = jsoninfo.Getvalue(str(idx),'nick')'''
    commentList.comments[idx] = liuyan
    idx += 1
return commentList

def GetAllComment(aid, order = None): """ 获取一个视频全部评论,有可能需要多次爬取,所以会有较大耗时 输入: aid:AV号 order:排序方式 默认按发布时间倒序 可选:good 按点赞人数排序 hot 按热门回复排序 返回: 评论列表 """ MaxPageSize = 300 commentLists = [GetComment(aid = aid, page = 1, pagesize = MaxPageSize, order = order)] totalPage = commentLists[0].page directory = 'av' + str(aid) + 'Comments' if not os.path.exists(directory): os.makedirs(directory) if totalPage > 1:

urls = ['http://api.bilibili.cn/feedback?aid=' + str(aid) + '&page=' + str(p) + '&pagesize=' + str(MaxPageSize) for p in range(2, commentList.page + 1)]

    # Make a pool
    #pool = workerpool.WorkerPool(size = 10)
    pool = workerpool.WorkerPool(size = totalPage - 1)
    # Build our `map` parameters
    #saveto = [directory + '/' + str(x) for x in range(2, commentList.page + 1)]
    # Perform the mapping
    #pool.map(urllib.urlretrieve, urls, saveto)
    commentLists2 = pool.map(GetComment, [aid] * (totalPage - 1), range(2, totalPage + 1), [MaxPageSize] *  (totalPage - 1))
    # Send shutdown jobs to all threads, and wait until all the jobs have been completed
    pool.shutdown()
    pool.wait()
    for cl in commentLists2:
        commentLists.append(cl)
    '''
    commentList.comments += [Comment()] * (commentList.page - 1)
    for p in range(2, commentList.page + 1):
        commentPath = directory + '/' + str(p)
        commentList.comments[(p - 1) * MaxPageSize :] = GetCommentLocal(commentPath, MaxPageSize).comments
        os.remove(commentPath)
    os.rmdir(directory)
#经测试发现,如果视频评论涨幅过快(av2816940)
#那么JSON第一层的totalResult和pages可能不准
#即使每抓一页都重新读取totalResult也无济于事
while commentList.comments[len(commentList.comments) - 1].lv == None:
    commentList.comments.pop()
commentList.commentLen = len(commentList.comments)'''
for cl in commentLists:
    while cl.comments[len(cl.comments) - 1].lv == None:
        cl.comments.pop()
commentLists = sorted(commentLists, key = GetCommentListKey, reverse = True)
return commentLists

def GetCommentListKey(commentList): return commentList.comments[0].lv if name == "main": print u'请输入要拽取评论视频的av号' videoaid = input() commentTxt = codecs.open('av' + str(videoaid) + 'comments.txt', encoding = 'utf-8', mode = 'w') allComment = GetAllComment(videoaid) x = 0

commentTxtList = [u''] * len(allComment.comments)

tempStrListSize = SPIT_LV + SPIT_NAME + SPIT_MID
for cl in allComment:
    for aComment in cl.comments:
#while x < len(allComment.comments):
    #aComment = allComment.comments[x]
        tempStrList = [u''] * tempStrListSize
        tempStr = u''
        i = 0
        if SPIT_LV:
            tempStrList[i] = unicode(str(aComment.lv)) + u'楼'
            i += 1
        if SPIT_NAME:
            tempStrList[i] = u'昵称: ' + unicode(str(aComment.post_user.name))
            i += 1
        if SPIT_MID:
            tempStrList[i] = u'UID: ' + unicode(str(aComment.post_user.mid))
            i += 1
        y = 0
        if tempStrListSize == 1:
            tempStr += tempStrList[0] + u'\n'
        if tempStrListSize > 1:
            while y < tempStrListSize - 1:
                tempStr += tempStrList[y] + u'\t'
                y += 1
            tempStr += tempStrList[y] + u'\n'
        if SPIT_MSG:
            tempStr += aComment.msg.decode('utf8')
            tempStr += u'\n'
        commentTxt.write(tempStr)
commentTxt.close()