xiaoyinguo22 opened this issue 2 years ago
It worked for me before as well; this error started showing up yesterday.
Same here.
I'm running into the same problem. I searched elsewhere for the AttributeError: 'NoneType' object has no attribute 'split' case, and other people say it's just an error message that doesn't affect the crawl. It still feels like part of my data is missing, though… I don't know how to fix it and am also hoping for an answer.
I haven't been able to debug this recently, so I'm not sure whether the site changed its layout or whether we are being restricted; please keep searching in the meantime.
I just read other people's feedback: replacing every p[@class="from"] in search.py with div[@class="from"] makes it run.
That does work, but what's the reason behind it?
Also, changing all of them to div raises an error whenever a retweeted Weibo is encountered: the retweet element is a p, not a div, so the p occurrences in the retweet-related code must not be changed to div.
Yes, I tried it. With the retweet part left unchanged it runs, but I can't figure out why.
Same question here.
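The likely explanation, judging from the comments above: the search result pages now wrap the outer card's timestamp/source block in div[@class="from"], while the quoted (retweeted) block still uses p[@class="from"], so only the outer XPaths need the change. A minimal defensive sketch of the idea (the extract_bid helper is mine, not part of the repo, and the tag names are taken from this thread rather than verified against the live page):

```python
def extract_bid(card_sel, retweet=False):
    """Return the weibo bid from a search-result card, or '' if the node is missing."""
    # Outer cards now use <div class="from">; retweet blocks still use <p class="from">.
    tag = 'p' if retweet else 'div'
    href = card_sel.xpath('.//{}[@class="from"]/a[1]/@href'.format(tag)).extract_first()
    if not href:
        return ''  # layout changed again, or a different page version was served
    return href.split('/')[-1].split('?')[0]
```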
File "/opt/conda/envs/python35-paddle120-env/bin/scrapy", line 8, in <module>
sys.exit(execute())
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/cmdline.py", line 123, in execute
settings = get_project_settings()
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/utils/project.py", line 68, in get_project_settings
settings.setmodule(settings_module_path, priority='project')
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 290, in setmodule
self.set(key, getattr(module, key), priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 265, in set
self.attributes[name].set(value, priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 50, in set
value = BaseSettings(value, priority=priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 86, in __init__
self.update(values, priority)
File "/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/scrapy/settings/__init__.py", line 322, in update
for name, value in values.items():
AttributeError: 'set' object has no attribute 'items'
This is the new problem I get after making the change.
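Side note on this traceback: it is raised while Scrapy is still loading settings.py (note the setmodule/BaseSettings frames), not from search.py, so it usually means one of the dict-type settings was accidentally turned into a Python set by dropping the value after a colon. A sketch of the wrong vs. right form (the pipeline name here is only an example; check your own settings.py):

```python
# settings.py

# Wrong: braces without "name: priority" pairs make this a set,
# which triggers "AttributeError: 'set' object has no attribute 'items'".
# ITEM_PIPELINES = {'weibo.pipelines.CsvPipeline'}

# Right: keep the priority values so it stays a dict.
ITEM_PIPELINES = {
    'weibo.pipelines.CsvPipeline': 300,
}
```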
I applied the changes the two posters above described and it now runs. Thanks, everyone!
import os
import re
import sys
from datetime import datetime, timedelta
from urllib.parse import unquote

import scrapy
import weibo.utils.util as util
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings
from weibo.items import WeiboItem


class SearchSpider(scrapy.Spider):
    name = 'search'
    allowed_domains = ['weibo.com']
    settings = get_project_settings()
    keyword_list = settings.get('KEYWORD_LIST')
    if not isinstance(keyword_list, list):
        if not os.path.isabs(keyword_list):
            keyword_list = os.getcwd() + os.sep + keyword_list
        if not os.path.isfile(keyword_list):
            sys.exit('不存在%s文件' % keyword_list)
        keyword_list = util.get_keyword_list(keyword_list)
for i, keyword in enumerate(keyword_list):
if len(keyword) > 2 and keyword[0] == '#' and keyword[-1] == '#':
keyword_list[i] = '%23' + keyword[1:-1] + '%23'
weibo_type = util.convert_weibo_type(settings.get('WEIBO_TYPE'))
contain_type = util.convert_contain_type(settings.get('CONTAIN_TYPE'))
regions = util.get_regions(settings.get('REGION'))
base_url = 'https://s.weibo.com'
start_date = settings.get('START_DATE',
datetime.now().strftime('%Y-%m-%d'))
end_date = settings.get('END_DATE', datetime.now().strftime('%Y-%m-%d'))
if util.str_to_time(start_date) > util.str_to_time(end_date):
sys.exit('settings.py配置错误,START_DATE值应早于或等于END_DATE值,请重新配置settings.py')
further_threshold = settings.get('FURTHER_THRESHOLD', 46)
mongo_error = False
pymongo_error = False
mysql_error = False
pymysql_error = False
def start_requests(self):
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date,
'%Y-%m-%d') + timedelta(days=1)
start_str = start_date.strftime('%Y-%m-%d') + '-0'
end_str = end_date.strftime('%Y-%m-%d') + '-0'
for keyword in self.keyword_list:
if not self.settings.get('REGION') or '全部' in self.settings.get(
'REGION'):
base_url = 'https://s.weibo.com/weibo?q=%s' % keyword
url = base_url + self.weibo_type
url += self.contain_type
                url += '&timescope=custom:{}:{}'.format(start_str, end_str)
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword
})
else:
for region in self.regions.values():
base_url = (
                        'https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url = base_url + self.weibo_type
url += self.contain_type
                    url += '&timescope=custom:{}:{}'.format(start_str, end_str)
                    # Get search results for one province
yield scrapy.Request(url=url,
callback=self.parse,
meta={
'base_url': base_url,
'keyword': keyword,
'province': region
})
def check_environment(self):
"""判断配置要求的软件是否已安装"""
if self.pymongo_error:
print('系统中可能没有安装pymongo库,请先运行 pip install pymongo ,再运行程序')
raise CloseSpider()
if self.mongo_error:
print('系统中可能没有安装或启动MongoDB数据库,请先根据系统环境安装或启动MongoDB,再运行程序')
raise CloseSpider()
if self.pymysql_error:
print('系统中可能没有安装pymysql库,请先运行 pip install pymysql ,再运行程序')
raise CloseSpider()
if self.mysql_error:
print('系统中可能没有安装或正确配置MySQL数据库,请先根据系统环境安装或配置MySQL,再运行程序')
raise CloseSpider()
def parse(self, response):
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
            # Parse the current page
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
start_date = datetime.strptime(self.start_date, '%Y-%m-%d')
end_date = datetime.strptime(self.end_date, '%Y-%m-%d')
while start_date <= end_date:
start_str = start_date.strftime('%Y-%m-%d') + '-0'
start_date = start_date + timedelta(days=1)
end_str = start_date.strftime('%Y-%m-%d') + '-0'
url = base_url + self.weibo_type
url += self.contain_type
                url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
                # Get search results for one day
yield scrapy.Request(url=url,
callback=self.parse_by_day,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'date': start_str[:-2]
})
def parse_by_day(self, response):
"""以天为单位筛选"""
base_url = response.meta.get('base_url')
keyword = response.meta.get('keyword')
province = response.meta.get('province')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
date = response.meta.get('date')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
            # Parse the current page
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
start_date_str = date + '-0'
start_date = datetime.strptime(start_date_str, '%Y-%m-%d-%H')
for i in range(1, 25):
start_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
start_date = start_date + timedelta(hours=1)
end_str = start_date.strftime('%Y-%m-%d-X%H').replace(
'X0', 'X').replace('X', '')
url = base_url + self.weibo_type
url += self.contain_type
                url += '&timescope=custom:{}:{}&page=1'.format(
start_str, end_str)
                # Get search results for one hour
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province
if province else self.parse_by_hour,
meta={
'base_url': base_url,
'keyword': keyword,
'province': province,
'start_time': start_str,
'end_time': end_str
})
def parse_by_hour(self, response):
"""以小时为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
            # Parse the current page
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
for region in self.regions.values():
                url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:1000'
).format(keyword, region['code'])
url += self.weibo_type
url += self.contain_type
                url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
                # Get search results for one hour in one province
yield scrapy.Request(url=url,
callback=self.parse_by_hour_province,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': region
})
def parse_by_hour_province(self, response):
"""以小时和直辖市/省为单位筛选"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
start_time = response.meta.get('start_time')
end_time = response.meta.get('end_time')
province = response.meta.get('province')
page_count = len(response.xpath('//ul[@class="s-scroll"]/li'))
if is_empty:
print('当前页面搜索结果为空')
elif page_count < self.further_threshold:
            # Parse the current page
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
else:
for city in province['city'].values():
                url = ('https://s.weibo.com/weibo?q={}&region=custom:{}:{}'
).format(keyword, province['code'], city)
url += self.weibo_type
url += self.contain_type
                url += '&timescope=custom:{}:{}&page=1'.format(
start_time, end_time)
                # Get search results for one hour in one city
yield scrapy.Request(url=url,
callback=self.parse_page,
meta={
'keyword': keyword,
'start_time': start_time,
'end_time': end_time,
'province': province,
'city': city
})
def parse_page(self, response):
"""解析一页搜索结果的信息"""
keyword = response.meta.get('keyword')
is_empty = response.xpath(
'//div[@class="card card-no-result s-pt20b40"]')
if is_empty:
print('当前页面搜索结果为空')
else:
for weibo in self.parse_weibo(response):
self.check_environment()
yield weibo
next_url = response.xpath(
'//a[@class="next"]/@href').extract_first()
if next_url:
next_url = self.base_url + next_url
yield scrapy.Request(url=next_url,
callback=self.parse_page,
meta={'keyword': keyword})
def get_article_url(self, selector):
"""获取微博头条文章url"""
article_url = ''
text = selector.xpath('string(.)').extract_first().replace(
'\u200b', '').replace('\ue627', '').replace('\n',
'').replace(' ', '')
if text.startswith('发布了头条文章'):
urls = selector.xpath('.//a')
for url in urls:
if url.xpath(
'i[@class="wbicon"]/text()').extract_first() == 'O':
if url.xpath('@href').extract_first() and url.xpath(
'@href').extract_first().startswith('http://t.cn'):
article_url = url.xpath('@href').extract_first()
break
return article_url
def get_location(self, selector):
"""获取微博发布位置"""
a_list = selector.xpath('.//a')
location = ''
for a in a_list:
if a.xpath('./i[@class="wbicon"]') and a.xpath(
'./i[@class="wbicon"]/text()').extract_first() == '2':
location = a.xpath('string(.)').extract_first()[1:]
break
return location
def get_at_users(self, selector):
"""获取微博中@的用户昵称"""
a_list = selector.xpath('.//a')
at_users = ''
at_list = []
for a in a_list:
if len(unquote(a.xpath('@href').extract_first())) > 14 and len(
a.xpath('string(.)').extract_first()) > 1:
if unquote(a.xpath('@href').extract_first())[14:] == a.xpath(
'string(.)').extract_first()[1:]:
at_user = a.xpath('string(.)').extract_first()[1:]
if at_user not in at_list:
at_list.append(at_user)
if at_list:
at_users = ','.join(at_list)
return at_users
def get_topics(self, selector):
"""获取参与的微博话题"""
a_list = selector.xpath('.//a')
topics = ''
topic_list = []
for a in a_list:
text = a.xpath('string(.)').extract_first()
if len(text) > 2 and text[0] == '#' and text[-1] == '#':
if text[1:-1] not in topic_list:
topic_list.append(text[1:-1])
if topic_list:
topics = ','.join(topic_list)
return topics
def parse_weibo(self, response):
"""解析网页中的微博信息"""
keyword = response.meta.get('keyword')
for sel in response.xpath("//div[@class='card-wrap']"):
info = sel.xpath(
"div[@class='card']/div[@class='card-feed']/div[@class='content']/div[@class='info']"
)
if info:
weibo = WeiboItem()
weibo['id'] = sel.xpath('@mid').extract_first()
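                # This is the line the tracebacks in this thread point at:
                # extract_first() returns None when the card has no div[@class="from"]
                # node, and calling .split() on None then raises AttributeError.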
weibo['bid'] = sel.xpath(
'.//div[@class="from"]/a[1]/@href').extract_first(
).split('/')[-1].split('?')[0]
weibo['user_id'] = info[0].xpath(
'div[2]/a/@href').extract_first().split('?')[0].split(
'/')[-1]
weibo['screen_name'] = info[0].xpath(
'div[2]/a/@nick-name').extract_first()
txt_sel = sel.xpath('.//p[@class="txt"]')[0]
retweet_sel = sel.xpath('.//div[@class="card-comment"]')
retweet_txt_sel = ''
if retweet_sel and retweet_sel[0].xpath('.//div[@class="txt"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//div[@class="txt"]')[0]
content_full = sel.xpath(
'.//p[@node-type="feed_list_content_full"]')
is_long_weibo = False
is_long_retweet = False
if content_full:
if not retweet_sel:
txt_sel = content_full[0]
is_long_weibo = True
elif len(content_full) == 2:
txt_sel = content_full[0]
retweet_txt_sel = content_full[1]
is_long_weibo = True
is_long_retweet = True
elif retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]'):
retweet_txt_sel = retweet_sel[0].xpath(
'.//p[@node-type="feed_list_content_full"]')[0]
is_long_retweet = True
else:
txt_sel = content_full[0]
is_long_weibo = True
weibo['text'] = txt_sel.xpath(
'string(.)').extract_first().replace('\u200b', '').replace(
'\ue627', '')
weibo['article_url'] = self.get_article_url(txt_sel)
weibo['location'] = self.get_location(txt_sel)
if weibo['location']:
weibo['text'] = weibo['text'].replace(
'2' + weibo['location'], '')
weibo['text'] = weibo['text'][2:].replace(' ', '')
if is_long_weibo:
weibo['text'] = weibo['text'][:-4]
weibo['at_users'] = self.get_at_users(txt_sel)
weibo['topics'] = self.get_topics(txt_sel)
reposts_count = sel.xpath(
'.//a[@action-type="feed_list_forward"]/text()').extract()
reposts_count = "".join(reposts_count)
try:
reposts_count = re.findall(r'\d+.*', reposts_count)
except TypeError:
print(
"无法解析转发按钮,可能是 1) 网页布局有改动 2) cookie无效或已过期。\n"
"请在 https://github.com/dataabc/weibo-search 查看文档,以解决问题,"
)
raise CloseSpider()
weibo['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = sel.xpath(
'.//a[@action-type="feed_list_comment"]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
weibo['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = sel.xpath(
'(.//span[@class="woo-like-count"])[last()]/text()').extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
weibo['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = sel.xpath(
'.//div[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
weibo['created_at'] = util.standardize_date(created_at)
source = sel.xpath('.//div[@class="from"]/a[2]/text()'
).extract_first()
weibo['source'] = source if source else ''
pics = ''
is_exist_pic = sel.xpath(
'.//div[@class="media media-piclist"]')
if is_exist_pic:
pics = is_exist_pic[0].xpath('ul[1]/li/img/@src').extract()
pics = [pic[8:] for pic in pics]
pics = [
re.sub(r'/.*?/', '/large/', pic, 1) for pic in pics
]
pics = ['https://' + pic for pic in pics]
video_url = ''
is_exist_video = sel.xpath(
'.//div[@class="thumbnail"]//video-player').extract_first()
if is_exist_video:
video_url = re.findall(r'src:\'(.*?)\'', is_exist_video)[0]
                    video_url = video_url.replace('&amp;', '&')
video_url = 'http:' + video_url
if not retweet_sel:
weibo['pics'] = pics
weibo['video_url'] = video_url
else:
weibo['pics'] = ''
weibo['video_url'] = ''
weibo['retweet_id'] = ''
if retweet_sel and retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'):
retweet = WeiboItem()
retweet['id'] = retweet_sel[0].xpath(
'.//a[@action-type="feed_list_like"]/@action-data'
).extract_first()[4:]
retweet['bid'] = retweet_sel[0].xpath(
'.//p[@class="from"]/a/@href').extract_first().split(
'/')[-1].split('?')[0]
info = retweet_sel[0].xpath(
'.//div[@node-type="feed_list_forwardContent"]/a[1]'
)[0]
retweet['user_id'] = info.xpath(
'@href').extract_first().split('/')[-1]
retweet['screen_name'] = info.xpath(
'@nick-name').extract_first()
retweet['text'] = retweet_txt_sel.xpath(
'string(.)').extract_first().replace('\u200b',
'').replace(
'\ue627', '')
retweet['article_url'] = self.get_article_url(
retweet_txt_sel)
retweet['location'] = self.get_location(retweet_txt_sel)
if retweet['location']:
retweet['text'] = retweet['text'].replace(
'2' + retweet['location'], '')
retweet['text'] = retweet['text'][2:].replace(' ', '')
if is_long_retweet:
retweet['text'] = retweet['text'][:-4]
retweet['at_users'] = self.get_at_users(retweet_txt_sel)
retweet['topics'] = self.get_topics(retweet_txt_sel)
reposts_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[1]/a[1]/text()'
).extract_first()
reposts_count = re.findall(r'\d+.*', reposts_count)
retweet['reposts_count'] = reposts_count[
0] if reposts_count else '0'
comments_count = retweet_sel[0].xpath(
'.//ul[@class="act s-fr"]/li[2]/a[1]/text()'
).extract_first()
comments_count = re.findall(r'\d+.*', comments_count)
retweet['comments_count'] = comments_count[
0] if comments_count else '0'
attitudes_count = retweet_sel[0].xpath(
'.//a[@class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"]//span[@class="woo-like-count"]/text()'
).extract_first()
attitudes_count = re.findall(r'\d+.*', attitudes_count)
retweet['attitudes_count'] = attitudes_count[
0] if attitudes_count else '0'
created_at = retweet_sel[0].xpath(
'.//p[@class="from"]/a[1]/text()').extract_first(
).replace(' ', '').replace('\n', '').split('前')[0]
retweet['created_at'] = util.standardize_date(created_at)
source = retweet_sel[0].xpath(
'.//p[@class="from"]/a[2]/text()').extract_first()
retweet['source'] = source if source else ''
retweet['pics'] = pics
retweet['video_url'] = video_url
retweet['retweet_id'] = ''
yield {'weibo': retweet, 'keyword': keyword}
weibo['retweet_id'] = retweet['id']
print(weibo)
yield {'weibo': weibo, 'keyword': keyword}
I ran into the same problem. After the change it runs normally, but the data is incomplete. With the same settings, I used to save roughly 2-3 times as much data as I do now. Has anyone else seen something similar?
For me it just keeps looping on the same output the OP shows, and after it finally stops there is nothing, not even a results folder. No idea why.
Switching to a cookie from the old version of Weibo solves the problem; no code changes are needed. I hit the same issue: it appears whenever I use a cookie from the new web version of Weibo 😂
Hi, could you share which old-version Weibo site you used?
Using a cookie from this site works for me: https://weibo.cn/
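For reference, the cookie goes into DEFAULT_REQUEST_HEADERS in settings.py, roughly like the shipped config; a sketch (the header values are illustrative, paste the cookie copied from a logged-in https://weibo.cn/ session):

```python
# settings.py - use a cookie taken from the old-style https://weibo.cn/ site
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'cookie': 'SUB=...; SUBP=...',  # placeholder: your own weibo.cn cookie string
}
```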
My environment is all set up and it ran fine before, but when I used it again today I got this error:
Traceback (most recent call last):
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\defer.py", line 132, in iter_errback
yield next(it)
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
return next(self.data)
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\utils\python.py", line 354, in __next__
return next(self.data)
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 342, in <genexpr>
return (_set_referer(r) for r in result or ())
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 40, in <genexpr>
return (r for r in result or () if _filter(r))
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "F:\Anaconda\envs\weibo-spider\lib\site-packages\scrapy\core\spidermw.py", line 66, in _evaluate_iterable
for r in iterable:
File "F:\python_code\weibo\weibo-search\weibo\spiders\search.py", line 107, in parse
for weibo in self.parse_weibo(response):
File "F:\python_code\weibo\weibo-search\weibo\spiders\search.py", line 356, in parse_weibo
weibo['bid'] = sel.xpath(
AttributeError: 'NoneType' object has no attribute 'split'
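That last frame is the same weibo['bid'] line discussed at the top of the thread: extract_first() finds no p[@class="from"] node in the new markup, returns None, and .split() then fails. A minimal, self-contained reproduction with parsel (the selector library Scrapy uses); the HTML here is a simplified stand-in for the new layout as described above, not captured from Weibo:

```python
from parsel import Selector

# Simplified stand-in for a new-style search-result card.
html = ('<div class="card-wrap"><div class="from">'
        '<a href="/1234567890/AbCdEfGhI?refer_flag=x">3分钟前</a></div></div>')
sel = Selector(text=html)

old = sel.xpath('.//p[@class="from"]/a[1]/@href').extract_first()
print(old)  # None - there is no <p class="from"> in this markup
# old.split('/') would raise: AttributeError: 'NoneType' object has no attribute 'split'

new = sel.xpath('.//div[@class="from"]/a[1]/@href').extract_first()
print(new.split('/')[-1].split('?')[0])  # AbCdEfGhI - the div-based XPath works
```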