Closed Kiris-tingna closed 6 years ago
Here is my main Python file:
import spider
import logging
from crawler_jiandan.jd_parser import JdParser, JdSaver


def main():
    """Configure and run the jandan.net picture spider."""
    logging.basicConfig(level=logging.WARNING, format="%(asctime)s\t%(levelname)s\t%(message)s")

    # three base components
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=1)
    parser = JdParser(max_deep=20)  # crawl/parse depth limit
    saver = JdSaver(save_pipe=open("jd_spider.csv", "w", encoding='utf-8'))

    # define url_filter: capacity=None will use set and a number will use redis
    black_patterns = (spider.CONFIG_URLPATTERN_FILES,)  # blacklist
    white_patterns = (r"^http[s]{0,1}://(jandan\.){0,1}(net)",)  # whitelist
    url_filter = spider.UrlFilter(
        black_patterns=black_patterns,
        white_patterns=white_patterns,
        capacity=None)

    # initial web_spider
    web_spider = spider.Spider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # add start url and run
    web_spider.set_start_url("http://jandan.net/pic/", keys=("jiandan",))
    return web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)


# Guarding the entry point is required: the spider spawns workers, and
# without this guard the module-level code re-runs in each child process
# (the "reran twice" symptom reported in this issue).
if __name__ == '__main__':
    result = main()
from spider import Parser, Saver
import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from spider import make_fetch_images
import os
def add_default_scheme(url):
    """Return *url* with an explicit scheme, defaulting to http.

    jandan.net pages use protocol-relative links ("//host/path");
    prefixing "http:" turns them into absolute URLs. URLs that already
    carry http/https are returned unchanged.
    """
    # Parse once instead of twice (original called urlparse per comparison).
    scheme = urlparse(url).scheme
    if scheme in ('http', 'https'):
        return url
    return 'http:{url}'.format(url=url)
class JdParser(Parser):

    def htm_parse(self, priority: int, url: str, keys: object, deep: int, content: object) -> (int, list, list):
        """
        Parse one jandan.net page: queue further pages and collect image links.
        Parameters and return value follow the contract of self.working().
        """
        status_code, url_now, html_text = content
        soup = BeautifulSoup(html_text, "html.parser")

        # Follow "previous comment page" links only while within the depth limit
        # (a negative _max_deep means unlimited depth).
        new_urls = []
        if self._max_deep < 0 or deep < self._max_deep:
            for anchor in soup.find_all('a', 'previous-comment-page'):
                next_url = add_default_scheme(anchor.get('href'))
                new_urls.append((next_url, keys, priority + 1))

        # Full-size image links are the items handed to the saver,
        # each stamped with the time it was discovered.
        items = []
        for image_anchor in soup.find_all('a', 'view_img_link'):
            image_url = add_default_scheme(image_anchor.get('href'))
            items.append((image_url, datetime.datetime.now()))

        return 1, new_urls, items
class JdSaver(Saver):

    def item_save(self, url: str, keys: object, item: (list, tuple)) -> bool:
        """
        Persist one item: append a CSV row to the save pipe, then download
        the image referenced by item[0] into ./images/.
        Parameters and return value follow the contract of self.working();
        returns False when the image download or write fails.
        """
        # Record the item as a CSV row and flush so progress survives a crash.
        self._save_pipe.write(",".join([url, str(keys)] + [str(i) for i in item]) + "\n")
        self._save_pipe.flush()

        # Fetch the image (rep presumably exposes a streaming response;
        # confirm against make_fetch_images).
        name, rep = make_fetch_images(item[0])
        try:
            # BUG FIX: open() cannot create the directory, so every save
            # silently failed when ./images/ was missing; create it first.
            os.makedirs('./images/', exist_ok=True)
            with open(os.path.join('./images/', name), 'wb') as f:
                for chunk in rep.iter_content(chunk_size=1024):
                    f.write(chunk)
        except Exception:
            # Best-effort download: report failure instead of crashing the worker.
            return False
        return True
首先: web_spider = spider.Spider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5) spider模块没有这个Spider类吧? 其次: 我运行了一下你的代码,只是把JdSaver中后半部分删除了,运行正常,并没有出现你说的情况。 你再调试一下试试?
你可以把日志级别调整为DEBUG,看一下详细的日志输出
I solved this problem by putting the code under `if __name__ == '__main__':`.
It is a strange problem.
I stopped it when I found it was running twice.