Open LeooLeoo opened 5 years ago
I'm using the author's IP pool and it runs fine, but I can't figure out how to call it from the Python Scrapy framework. I've tried quite a few approaches and none of them seem to work; my current attempt fails with this error:
```
2019-06-26 18:36:26 [scrapy.core.scraper] ERROR: Spider error processing <GET https://drugs.dxy.cn/category/1227.htm> (referer: None)
Traceback (most recent call last):
  File "/Library/Python/2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
    yield next(it)
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
    for x in result:
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Library/Python/2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "/Users/leeo/Desktop/Synyi/12.药品目录爬虫/丁香园药品助手/DXY_spider/DXY_spider/spiders/DXY.py", line 58, in parse
    yield response.follow(href, self.parse_drug_instruction)
  File "/Library/Python/2.7/site-packages/scrapy/http/response/text.py", line 157, in follow
    errback=errback
  File "/Library/Python/2.7/site-packages/scrapy/http/response/__init__.py", line 124, in follow
    raise ValueError("url can't be None")
ValueError: url can't be None
```
Any pointers from the experts here would be hugely appreciated!!!
My spider code is as follows:

```python
# -*- coding: utf-8 -*-
import scrapy
import requests


def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").content

def delete_proxy(proxy):
    requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))

def getHtml(href):
    # ....
    retry_count = 5
    proxy = get_proxy()
    while retry_count > 0:
        try:
            html = requests.get(href, proxies={"http": "http://{}".format(proxy)})
            # request = request(url=href)    # (stray earlier attempt, left commented out)
            # request.meta['proxy'] = proxy
            # yield request
            # access the page through the proxy
            return html
        except Exception:
            retry_count -= 1
    # failed 5 times: remove the proxy from the pool
    delete_proxy(proxy)
    return None

# your spider code
# yield Request(meta={'proxy': "http://%s"%(random.choice(["IP:PORT", "IP:PORT"]))})

class DXYSpider(scrapy.Spider):
    name = 'DXY'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }
    #proxy = get_proxy()
    #start_urls = getHtml("https://drugs.dxy.cn/category/1227.htm")
    #allowed_domains = ['drugs.dxy.cn']
    start_urls = ['https://drugs.dxy.cn/category/1227.htm']
    #cookie_dict = {}
    #requests.get(start_urls, proxies={"http": "http://{}".format(get_proxy)})

    def parse(self, response):
        # follow links to drug instruction page
        for href in response.css('div.fl h3 a::attr(href)').extract():
            #href = "http:" + href
            href = getHtml(href)
            #print(href)
            #request(url=href)
            #request.meta['proxy'] = get_proxy()
            #yield response.follow(href, self.parse_drug_instruction)
            #yield request

        # follow pagination links
        for href in response.css('div.pagination span a::attr(href)').extract():
            href = "https://drugs.dxy.cn/category/1227.htm" + href
            href = getHtml(href)
            #print(href)
            #yield href
            #yield response.follow(href, self.parse)

    def parse_drug_instruction(self, response):
        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        yield {
            'response_url': response.url,
            'drug_title': extract_with_css('.commend::text'),
            'drug_name': extract_with_css('dt span.fl::text'),            # drug name
            'drug_name_value': response.css('dl dd::text')[0].get().strip(),
            'drug_name_value_1': response.css('dl dd::text')[1].get().strip(),
            'drug_name_value_2': response.css('dl dd::text')[2].get().strip(),
            'ingredient': extract_with_css('dt span.fl#2::text'),         # ingredients
            'ingredient_value': response.css('dl dd::text')[4].get().strip(),
            'idication': extract_with_css('dt span.fl#3::text'),          # indications
            'idication_value': response.css('dl dd::text')[5].get().strip(),
            'usage_dosage': extract_with_css('dt span.fl#4::text'),       # usage and dosage
            'usage_dosage_value': response.css('dl dd::text')[6].get().strip(),
            'ADRs': extract_with_css('dt span.fl#14::text'),              # adverse reactions
            'ADRs_value': response.css('dl dd::text')[7].get().strip(),
            'contraindication': extract_with_css('dt span.fl#12::text'),  # contraindications
            'contraindication_value': response.css('dl dd::text')[9].get().strip(),
            'precaution': extract_with_css('dt span.fl#13::text'),        # precautions
            'precaution_value': response.css('dl dd::text')[10].get().strip(),
            'interaction': extract_with_css('dt span.fl#6::text'),        # drug interactions
            'interaction_value': response.css('dl dd::text')[12].get().strip(),
            'approve': extract_with_css('dt span.fl#39::text'),           # approval number
            'approve_value': response.css('dl dd::text')[15].get().strip(),
            'OTC': extract_with_css('dt span.fl#47::text'),               # OTC or not
            'OTC_value': response.css('dl dd::text')[16].get().strip(),
            'company': extract_with_css('dt span.fl#1::text'),            # manufacturer
            'company_value': response.css('dl dd::text')[17].get().strip(),
            'classification': extract_with_css('dt span.fl#7::text'),     # drug classification
            'classification_value': response.css('dl dd::text')[18].get().strip(),
        }
```
Bro, add my QQ 735731761 and we can talk it over~
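Side note on the traceback above: `ValueError: url can't be None` is raised because `getHtml(href)` returns a `requests` response object (or `None` after five failures) rather than a URL string, so by the time the uncommented `yield response.follow(href, ...)` on line 58 of DXY.py runs, `href` is no longer a usable URL. A minimal sketch (not the thread author's code) of a `parse()` that keeps the hrefs as strings and hands the proxy to Scrapy via `meta` instead of fetching with `requests`; the `.decode()` is an assumption for Python 3, where the `get_proxy()` above returns bytes:

```python
def parse(self, response):
    # follow links to the drug instruction pages
    for href in response.css('div.fl h3 a::attr(href)').extract():
        if href:  # response.follow() raises ValueError when href is None
            yield response.follow(href, callback=self.parse_drug_instruction,
                                  meta={'proxy': 'http://' + get_proxy().decode().strip()})

    # follow pagination links the same way; response.follow() resolves
    # relative hrefs against the current page, so no manual joining is needed
    for href in response.css('div.pagination span a::attr(href)').extract():
        if href:
            yield response.follow(href, callback=self.parse,
                                  meta={'proxy': 'http://' + get_proxy().decode().strip()})
```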
Same problem here, total beginner waiting for an answer as well.
This project only collects proxies. As for calling them from Scrapy, just use the hooks the framework already provides, e.g. a downloader middleware:
```python
import base64

# Start your middleware class
class ProxyMiddleware(object):
    # override process_request
    def process_request(self, request, spider):
        # Set the location of the proxy
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines if your proxy requires authentication
        proxy_user_pass = "USERNAME:PASSWORD"
        # set up basic authentication for the proxy
        # (base64.b64encode instead of the deprecated base64.encodestring,
        # which appends a trailing newline that corrupts the header)
        encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
```
And a variant that pulls a proxy from the local pool on every request (validating it first) and sets a random User-Agent as well:

```python
import logging

import requests
from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    logger = logging.getLogger(__name__)

    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
        request.meta['proxy'] = 'http://' + self.proxy()

    def proxy(self):
        proxy = requests.get("http://127.0.0.1:5010/get").text
        try:
            print('get proxy ...')
            ip = {"http": "http://" + proxy, "https": "https://" + proxy}
            r = requests.get("http://www.baidu.com", proxies=ip, timeout=4)
            print(r.status_code)
            if r.status_code == 200:
                return proxy
        except Exception:
            pass
        # validation failed (request error or non-200): delete this proxy
        # from the pool and recurse to fetch another one
        print('get proxy again ...')
        self.delete_proxy(proxy)
        return self.proxy()

    def process_response(self, request, response, spider):
        """Handle the returned response."""
        # if the response status is not 200, re-schedule the current request
        if response.status != 200:
            print("again response ip:")
            # attach a fresh proxy to the current request
            request.meta['proxy'] = 'http://' + self.proxy()
            return request
        return response

    def process_exception(self, request, exception, spider):
        self.logger.debug('Get exception')
        request.meta['proxy'] = 'http://' + self.proxy()
        return request

    def delete_proxy(self, proxy):
        requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
```