jhao104 / proxy_pool

Python ProxyPool for web spider
https://jhao104.github.io/proxy_pool/
MIT License

How do I use the IP pool from the Scrapy framework? #300

Open LeooLeoo opened 5 years ago

LeooLeoo commented 5 years ago

I'm using the author's IP pool and it runs fine on its own, but I don't know how to call it from a Python Scrapy project. I've tried quite a few approaches and none of them seem to work; my current attempt fails with the following error:

2019-06-26 18:36:26 [scrapy.core.scraper] ERROR: Spider error processing <GET https://drugs.dxy.cn/category/1227.htm> (referer: None)
Traceback (most recent call last):
  File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
    for x in result:
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Users/leeo/Desktop/Synyi/12.药品目录爬虫/丁香园药品助手/DXY_spider/DXY_spider/spiders/DXY.py", line 58, in parse
    yield response.follow(href, self.parse_drug_instruction)
  File "/Library/Python/2.7/site-packages/scrapy/http/response/text.py", line 157, in follow
    errback=errback
  File "/Library/Python/2.7/site-packages/scrapy/http/response/__init__.py", line 124, in follow
    raise ValueError("url can't be None")
ValueError: url can't be None

Any pointers from the experts here would be hugely appreciated!

My spider code is as follows:

# -*- coding: utf-8 -*-
import scrapy
import requests


def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").content


def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))


def getHtml(href):
    ....
    retry_count = 5
    proxy = get_proxy()
    while retry_count > 0:
        try:
            html = requests.get(href, proxies={"http": "http://{}".format(proxy)})
            #request = request(url=href)
            #request.meta['proxy'] = proxy
            #yield request
            # fetch the page through the proxy
            return html
        except Exception:
            retry_count -= 1
    # failed 5 times, remove the proxy from the pool
    delete_proxy(proxy)
    return None


# your spider code
yield Request(meta={'proxy': "http://%s" % (random.choice(["IP:PORT", "IP:PORT"]))})

class DXYSpider(scrapy.Spider):
    name = 'DXY'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    #proxy = get_proxy()
    #start_urls = getHtml("https://drugs.dxy.cn/category/1227.htm")
    #allowed_domains = ['drugs.dxy.cn']
    start_urls = ['https://drugs.dxy.cn/category/1227.htm']
    #cookie_dict = {}

    #requests.get(start_urls, proxies={"http": "http://{}".format(get_proxy)})

    def parse(self, response):
        # follow links to drug instruction page
        for href in response.css('div.fl h3 a::attr(href)').extract():
            #href = "http:"+href
            href = getHtml(href)
            #print(href)
            #request(url=href)
            #request.meta['proxy'] = get_proxy()
            #yield response.follow(href, self.parse_drug_instruction)
            #yield request

        # follow pagination links
        for href in response.css('div.pagination span a::attr(href)').extract():
            href = "https://drugs.dxy.cn/category/1227.htm" + href
            href = getHtml(href)
            #print(href)
            #yield href
            #yield response.follow(href, self.parse)

    def parse_drug_instruction(self, response):
        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        yield {
            'response_url': response.url,
            'drug_title': extract_with_css('.commend::text'),
            'drug_name': extract_with_css('dt span.fl::text'),  # drug name
            'drug_name_value': response.css('dl dd::text')[0].get().strip(),
            'drug_name_value_1': response.css('dl dd::text')[1].get().strip(),
            'drug_name_value_2': response.css('dl dd::text')[2].get().strip(),

            'ingredient': extract_with_css('dt span.fl#2::text'),  # ingredients
            'ingredient_value': response.css('dl dd::text')[4].get().strip(),

            'idication': extract_with_css('dt span.fl#3::text'),  # indications
            'idication_value': response.css('dl dd::text')[5].get().strip(),

            'usage_dosage': extract_with_css('dt span.fl#4::text'),  # usage and dosage
            'usage_dosage_value': response.css('dl dd::text')[6].get().strip(),

            'ADRs': extract_with_css('dt span.fl#14::text'),  # adverse drug reactions
            'ADRs_value': response.css('dl dd::text')[7].get().strip(),

            'contraindication': extract_with_css('dt span.fl#12::text'),  # contraindications
            'contraindication_value': response.css('dl dd::text')[9].get().strip(),

            'precaution': extract_with_css('dt span.fl#13::text'),  # precautions
            'precaution_value': response.css('dl dd::text')[10].get().strip(),

            'interaction': extract_with_css('dt span.fl#6::text'),  # drug interactions
            'interaction_value': response.css('dl dd::text')[12].get().strip(),

            'approve': extract_with_css('dt span.fl#39::text'),  # approval number
            'approve_value': response.css('dl dd::text')[15].get().strip(),

            'OTC': extract_with_css('dt span.fl#47::text'),  # OTC or not
            'OTC_value': response.css('dl dd::text')[16].get().strip(),

            'company': extract_with_css('dt span.fl#1::text'),  # manufacturer
            'company_value': response.css('dl dd::text')[17].get().strip(),

            'classification': extract_with_css('dt span.fl#7::text'),  # drug classification
            'classification_value': response.css('dl dd::text')[18].get().strip(),
        }


dar1900 commented 5 years ago

Bro, add me on QQ: 735731761, so we can talk it through~


shuurik commented 5 years ago

Same problem here; newbie waiting for an answer too.

jhao104 commented 4 years ago

This project only collects and serves proxies. As for how to use them in Scrapy, just follow the interfaces the framework provides, for example a downloader middleware:

import base64 
# Start your middleware class
class ProxyMiddleware(object):
    # overwrite process request
    def process_request(self, request, spider):
        # Set the location of the proxy
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines if your proxy requires authentication
        proxy_user_pass = "USERNAME:PASSWORD"
        # set up basic authentication for the proxy
        encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass

See also: https://my.oschina.net/jhao104/blog/639745
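
Putting the pieces together, here is a minimal sketch of a downloader middleware that pulls a proxy from this pool instead of hardcoding one. It assumes the proxy_pool web API is running locally on port 5010, as in the question, and that /get/ returns a single proxy such as IP:PORT as plain text; the class name ProxyPoolMiddleware and the module path myproject.middlewares are placeholders, not part of this project.

import requests


class ProxyPoolMiddleware(object):
    """Downloader middleware that attaches a proxy from the pool to each request."""

    def process_request(self, request, spider):
        # assumption: /get/ returns one proxy such as "1.2.3.4:8080" as plain text
        proxy = requests.get("http://127.0.0.1:5010/get/").text.strip()
        if proxy:
            request.meta['proxy'] = "http://{}".format(proxy)

Then register it in settings.py so Scrapy actually calls it (the module path and the priority value 543 are placeholders):

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyPoolMiddleware': 543,
}

With the middleware registered, the spider itself can simply keep yielding Request or response.follow calls; there is no need to call requests from inside parse().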

Guo-Hongfu commented 4 years ago

import logging

import requests
from fake_useragent import UserAgent  # assumed dependency for random User-Agent strings


class RandomUserAgentMiddleware(object):
    logger = logging.getLogger(__name__)

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = 'http://' + self.proxy()

    def proxy(self):
        proxy = requests.get("http://127.0.0.1:5010/get").text
        try:
            print('get proxy ...')
            # proxy = requests.get("http://127.0.0.1:5010/get").text
            ip = {"http": "http://" + proxy, "https": "https://" + proxy}
            r = requests.get("http://www.baidu.com", proxies=ip, timeout=4)
            print(r.status_code)
            if r.status_code == 200:
                return proxy
        except:
            print('get proxy again ...')
            self.delete_proxy(proxy)
            return self.proxy()

    def process_response(self, request, response, spider):
        '''Handle the returned response.'''
        # if the response status is not 200, re-schedule the current request
        if response.status != 200:
            print("again response ip:")
            # attach a fresh proxy to the current request
            request.meta['proxy'] = 'http://' + self.proxy()
            return request
        return response

    def process_exception(self, request, exception, spider):
        self.logger.debug('Get exception')
        request.meta['proxy'] = 'http://' + self.proxy()
        return request

    def delete_proxy(self, proxy):
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))