报错信息:
请输入要拽取评论视频的av号
8037147
Traceback (most recent call last):
File "/Users/XieHong/Google Drive/Python/Bili_Com_Grab/spitcomments.py", line 212, in <module>
allComment = GetAllComment(videoaid)
File "/Users/XieHong/Google Drive/Python/Bili_Com_Grab/spitcomments.py", line 202, in GetAllComment
while cl.comments[len(cl.comments) - 1].lv == None:
IndexError: list index out of range
@author: Vespa
"""
import urllib2
import urllib
import re
import json
import zlib
import gzip
import xml.dom.minidom
import hashlib
import time
import sys
import os
import codecs
import workerpool
class User():
    """The account that posted a comment.

    Attributes:
        mid: the user's ``mid`` identifier, or None when not supplied.
        name: the user's name as a UTF-8 byte string, or None when not
            supplied.
    """

    # Class-level defaults so both attributes always exist, even on a User
    # created without arguments.
    mid = None
    name = None

    def __init__(self, m_mid=None, m_name=None):
        """Store the given id and name; arguments left as None are skipped."""
        # Compare against None rather than truthiness so an id of 0 is kept.
        if m_mid is not None:
            self.mid = m_mid
        if m_name is not None:
            # type(u'') is `unicode` on Python 2: text names are stored as
            # UTF-8 bytes, byte-string names are stored unchanged.
            if isinstance(m_name, type(u'')):
                m_name = m_name.encode('utf8')
            self.name = m_name

    # The paste had lost the double underscores and spelled the constructor
    # `init`; keep that name callable for anything written against it.
    init = __init__
本来以为是评论数量多少的问题,但是换了评论很少的视频来试,好像也会出现同样的报错。
报错信息:
请输入要拽取评论视频的av号
8037147
Traceback (most recent call last):
File "/Users/XieHong/Google Drive/Python/Bili_Com_Grab/spitcomments.py", line 212, in <module>
allComment = GetAllComment(videoaid)
File "/Users/XieHong/Google Drive/Python/Bili_Com_Grab/spitcomments.py", line 202, in GetAllComment
while cl.comments[len(cl.comments) - 1].lv == None:
IndexError: list index out of range
源代码在这里:
# -*- coding: utf-8 -*-
# Output switches, one per comment field (lv, name, mid, msg).
# NOTE(review): the code that reads these flags is not visible in this
# excerpt; presumably each one decides whether that field is written to the
# output file -- confirm against the writer loop.
SPIT_LV = True
SPIT_NAME = False
SPIT_MID = True
SPIT_MSG = True
# -*- coding: utf-8 -*-
""" Created on Mon May 26 23:59:09 2014
@author: Vespa """ import urllib2 import urllib import re import json import zlib import gzip import xml.dom.minidom import hashlib import time import sys import os import codecs import workerpool
class User():
    """The account that posted a comment.

    Attributes:
        mid: the user's ``mid`` identifier, or None when not supplied.
        name: the user's name as a UTF-8 byte string, or None when not
            supplied.
    """

    # Class-level defaults so both attributes always exist, even on a User
    # created without arguments.
    mid = None
    name = None

    def __init__(self, m_mid=None, m_name=None):
        """Store the given id and name; arguments left as None are skipped."""
        # Compare against None rather than truthiness so an id of 0 is kept.
        if m_mid is not None:
            self.mid = m_mid
        if m_name is not None:
            # type(u'') is `unicode` on Python 2: text names are stored as
            # UTF-8 bytes, byte-string names are stored unchanged.
            if isinstance(m_name, type(u'')):
                m_name = m_name.encode('utf8')
            self.name = m_name

    # The paste had lost the double underscores and spelled the constructor
    # `init`; keep that name callable for anything written against it.
    init = __init__
# 获取空间地址
class Comment():
    """A single comment on a video, as returned by the feedback API."""

    lv = None  # floor number (position of the comment in the thread)
    fbid = None  # comment id
    msg = None  # value of the reply's 'msg' field
    # status (0: normal, 1: hidden by the uploader, 2: deleted by an
    # administrator, 3: deleted after being reported)
    ad_check = None
    post_user = None

    def __init__(self):
        """Give the comment its own empty User as the poster."""
        self.post_user = User()

    # The paste had lost the double underscores and spelled the constructor
    # `init`; keep that name callable for anything written against it.
    init = __init__
class CommentList():
    """One page of comments together with the totals reported alongside it."""

    comments = None  # list of Comment objects for this page
    commentLen = None  # filled from the reply's 'totalResult' field
    page = None  # filled from the reply's 'pages' field

    def __init__(self):
        """Nothing to set up; the fields are filled in by the caller."""
        pass

    # The paste had lost the double underscores and spelled the constructor
    # `init`; keep that name callable for anything written against it.
    init = __init__
class JsonInfo():
    """Fetch a URL, decode the JSON reply and give safe access to its fields.

    Attributes:
        info: the decoded JSON reply.
        error: True while a failing reply is being retried; False once the
            constructor has obtained a successful reply.
    """

    info = None
    error = False

    def __init__(self, url):
        """Download and decode ``url``, retrying until the API reports success.

        A reply whose ``code`` field is present and non-zero counts as a
        failure: it is reported on stdout and the request is repeated after
        a short pause.

        NOTE(review): there is no retry limit, so a request that always
        fails keeps this constructor looping forever.
        """
        self.info = json.loads(getURLContent(url))
        while 'code' in self.info and self.info['code'] != 0:
            # Record the failure on the instance; a bare `error = ...` would
            # only bind a local name and leave the attribute untouched.
            self.error = True
            # print(...) with one argument prints the same on Python 2 and 3.
            print('Entered!')
            # Report the reply that actually failed, before it is replaced.
            if 'message' in self.info:
                print("【Error】code=%d, msg=%s, url=%s" % (self.info['code'], self.Getvalue('message'), url))
            elif 'error' in self.info:
                print("【Error】code=%d, msg=%s, url=%s" % (self.info['code'], self.Getvalue('error'), url))
            time.sleep(0.01)
            self.info = json.loads(getURLContent(url))
        self.error = False

    def Getvalue(self, *keys):
        """Follow ``keys`` down through the nested reply and return the value.

        Returns None when no keys are given, when a key along the path is
        missing, or when the path runs into a value that is not a dict.
        Text values are returned encoded as UTF-8.
        """
        if not keys:
            return None
        node = self.info
        for key in keys:
            # A scalar or list part-way down the path cannot hold more keys.
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        # type(u'') is `unicode` on Python 2.
        if isinstance(node, type(u'')):
            node = node.encode('utf8')
        return node

    # The paste had lost the double underscores and spelled the constructor
    # `init`; keep that name callable for anything written against it.
    init = __init__
def getURLContent(url):
    """Download ``url`` and return the raw response body.

    The request carries a desktop-browser User-Agent header.  Any error
    raised while opening or reading the URL is retried after a short pause,
    so the function only returns once a download has succeeded.

    NOTE(review): there is no retry limit; a URL that can never succeed
    (for example a malformed one) keeps this function looping.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows U Windows NT 6.1 en-US rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    while True:
        try:
            response = urllib2.urlopen(urllib2.Request(url=url, headers=headers))
            try:
                return response.read()
            finally:
                # Release the connection whether or not the read succeeded.
                response.close()
        except Exception:
            # Deliberate best-effort retry.  `Exception` (not a bare except)
            # lets KeyboardInterrupt and SystemExit through, so Ctrl-C can
            # still stop the script.  The pause keeps a dead network from
            # turning this into a busy loop.
            time.sleep(0.01)
def GetString(t):
    """Return integers as their decimal string; pass other values through.

    Lets callers concatenate a numeric id or page number into a URL without
    checking its type first.
    """
    # Imported here so the file's import block does not need to change.
    import numbers

    # numbers.Integral covers int, Python 2 `long` and int subclasses, which
    # an exact `type(t) == int` comparison misses.  bool is Integral as well
    # but was never converted before, so it is still passed through.
    if isinstance(t, numbers.Integral) and not isinstance(t, bool):
        return str(t)
    return t
def GetComment(aid, page = None, pagesize = None, order = None): """ 输入: aid:AV号 page:页码 pagesize:单页返回的记录条数,最大不超过300,默认为10。 order:排序方式 默认按发布时间倒序 可选:good 按点赞人数排序 hot 按热门回复排序 返回: 评论列表 """ url = 'http://api.bilibili.cn/feedback?aid='+GetString(aid) if page: url += '&page='+GetString(page) if pagesize: url += '&pagesize='+GetString(pagesize) if order: url += '&order='+GetString(order) jsoninfo = JsonInfo(url) commentList = CommentList() commentList.comments = [Comment()] * pagesize commentList.commentLen = jsoninfo.Getvalue('totalResult') commentList.page = jsoninfo.Getvalue('pages') idx = 0 while jsoninfo.Getvalue(str(idx)): liuyan = Comment() liuyan.lv = jsoninfo.Getvalue(str(idx),'lv') liuyan.fbid = jsoninfo.Getvalue(str(idx),'fbid') liuyan.msg = jsoninfo.Getvalue(str(idx),'msg') liuyan.ad_check = jsoninfo.Getvalue(str(idx),'ad_check')
liuyan.post_user = GetUserInfoBymid(jsoninfo.Getvalue(str(idx),'mid'))
def GetAllComment(aid, order = None): """ 获取一个视频全部评论,有可能需要多次爬取,所以会有较大耗时 输入: aid:AV号 order:排序方式 默认按发布时间倒序 可选:good 按点赞人数排序 hot 按热门回复排序 返回: 评论列表 """ MaxPageSize = 300 commentLists = [GetComment(aid = aid, page = 1, pagesize = MaxPageSize, order = order)] totalPage = commentLists[0].page directory = 'av' + str(aid) + 'Comments' if not os.path.exists(directory): os.makedirs(directory) if totalPage > 1:
urls = ['http://api.bilibili.cn/feedback?aid=' + str(aid) + '&page=' + str(p) + '&pagesize=' + str(MaxPageSize) for p in range(2, commentList.page + 1)]
def GetCommentListKey(commentList): return commentList.comments[0].lv if name == "main": print u'请输入要拽取评论视频的av号' videoaid = input() commentTxt = codecs.open('av' + str(videoaid) + 'comments.txt', encoding = 'utf-8', mode = 'w') allComment = GetAllComment(videoaid) x = 0
commentTxtList = [u''] * len(allComment.comments)