jhao104 / proxy_pool

Python ProxyPool for web spider
https://jhao104.github.io/proxy_pool/
MIT License

How to use it in Scrapy #304

Open Guo-Hongfu opened 5 years ago

Guo-Hongfu commented 5 years ago

Set up a downloader middleware (the scrapy.contrib paths are long deprecated; the built-in user agent middleware now lives under scrapy.downloadermiddlewares):

DOWNLOADER_MIDDLEWARES = {
    'Article.middlewares.RandomUserAgentMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

import logging

import requests
# UserAgent is assumed to come from the fake_useragent package,
# which exposes .random plus per-browser attributes
from fake_useragent import UserAgent

class RandomUserAgentMiddleware(object):
    logger = logging.getLogger(__name__)

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = 'http://' + self.proxy()

    def proxy(self):
        proxy = requests.get("http://127.0.0.1:5010/get").text
        try:
            print('get proxy ...')
            # Verify the proxy actually works before handing it out
            ip = {"http": "http://" + proxy, "https": "https://" + proxy}
            r = requests.get("http://www.baidu.com", proxies=ip, timeout=4)
            print(r.status_code)
            if r.status_code == 200:
                return proxy
        except requests.RequestException:
            print('get proxy again ...')
        # The proxy failed the check: drop it from the pool and try another
        self.delete_proxy(proxy)
        return self.proxy()

    def process_response(self, request, response, spider):
        '''Handle the returned response.'''
        # If the response status is not 200, retry the current request
        # with a fresh proxy
        if response.status != 200:
            print("again response ip:")
            request.meta['proxy'] = 'http://' + self.proxy()
            return request
        return response

    def process_exception(self, request, exception, spider):
        self.logger.debug('Get exception')
        request.meta['proxy'] = 'http://' + self.proxy()
        return request

    def delete_proxy(self, proxy):
        # Ask the proxy_pool API to remove a dead proxy from the pool
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

YunCan-code commented 3 years ago

The API now returns JSON, so note that proxy = requests.get("http://127.0.0.1:5010/get").text needs to be changed to proxy = requests.get("http://127.0.0.1:5010/get").json().get("proxy")
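A minimal sketch of the adjusted helper, assuming the /get endpoint returns JSON shaped like {"proxy": "host:port", ...}:

import requests

def get_proxy():
    # Newer versions of the API return JSON rather than plain text,
    # so parse the body and read the "proxy" field
    return requests.get("http://127.0.0.1:5010/get").json().get("proxy")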

chinobing commented 2 years ago

How can I tell whether it is actually running? After I run it, the log only shows RandomUserAgentMiddleware, but no other proxy information. BTW, I have already changed it to proxy = requests.get("http://127.0.0.1:5010/get").json().get("proxy")
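One way to make the chosen proxy visible in the log (a sketch of the process_request method from the middleware above, an assumption rather than something confirmed in this thread) is to log through the spider logger instead of print:

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
        proxy = self.proxy()
        request.meta['proxy'] = 'http://' + proxy
        # Log at INFO so the chosen proxy shows up in Scrapy's output,
        # confirming the middleware is actually being invoked
        spider.logger.info('Using proxy http://%s for %s', proxy, request.url)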