Closed sun254667307 closed 1 month ago
我这里测试是没有问题的，建议你排查一下网络，看看有没有使用 VPN 等代理工具。如果访问频率过快，会触发马蜂窝的反爬机制，这种情况下需要自行实现一个代理类。
我这里测试是没有问题的，建议你排查一下网络，看看有没有使用 VPN 等代理工具。如果访问频率过快，会触发马蜂窝的反爬机制，这种情况下需要自行实现一个代理类。
class MaFengWoSpider(BaseSpider):
    """Spider that owns a synchronous and an asynchronous httpx client.

    NOTE(review): judging by the class name this targets mafengwo.cn pages;
    the crawl logic itself is not part of this excerpt.
    """

    def __init__(self, config: Optional[SpiderConfig] = None):
        """Create the HTTP clients and resolve the effective configuration.

        Args:
            config: Optional spider configuration. It is converted into httpx
                client keyword arguments via
                ``BaseSpider.convert_2_httpx_client_arg`` and merged into
                ``self._config`` via ``self._merge_config``.
        """
        # Both clients are built from the caller-supplied config, before it
        # is merged into ``self._config``.
        self.client = httpx.Client(
            **BaseSpider.convert_2_httpx_client_arg(config)
        )
        self.a_client = httpx.AsyncClient(
            **BaseSpider.convert_2_httpx_client_arg(config)
        )
        self._config = self._merge_config(config)

        if self._config.headers:
            return

        # No headers configured: fall back to a Linux desktop Chrome
        # user agent.
        # NOTE(review): this default is stored on ``self._config`` only after
        # the clients were created -- confirm that requests actually send
        # ``self._config.headers``.
        self._config.headers = {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        }
class MaFengWoSpider(BaseSpider):
    """Spider that owns a synchronous and an asynchronous httpx client.

    NOTE(review): judging by the class name this targets mafengwo.cn pages;
    the crawl logic itself is not part of this excerpt.
    """

    def __init__(self, config: Optional[SpiderConfig] = None):
        """Create the HTTP clients and resolve the effective configuration.

        Args:
            config: Optional spider configuration. It is converted into httpx
                client keyword arguments via
                ``BaseSpider.convert_2_httpx_client_arg`` and merged into
                ``self._config`` via ``self._merge_config``.
        """
        # Both clients are built from the caller-supplied config, before it
        # is merged into ``self._config``.
        self.client = httpx.Client(
            **BaseSpider.convert_2_httpx_client_arg(config)
        )
        self.a_client = httpx.AsyncClient(
            **BaseSpider.convert_2_httpx_client_arg(config)
        )
        self._config = self._merge_config(config)

        if self._config.headers:
            return

        # No headers configured: fall back to a Linux desktop Chrome
        # user agent.
        # NOTE(review): this default is stored on ``self._config`` only after
        # the clients were created -- confirm that requests actually send
        # ``self._config.headers``.
        self._config.headers = {
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        }
应该和user-agent没有关系。
Traceback (most recent call last):
File "test.py", line 6, in <module>
doc = auto_spider.crawl(url)
File "/home/dataharvest/src/dataharvest/spider/spider.py", line 25, in crawl
return spider.crawl(url, config)
File "/home/dataharvest/src/dataharvest/spider/mafengwo_spider.py", line 70, in crawl
final_content = handle_final_content(third_resp)
File "/home/dataharvest/src/dataharvest/spider/mafengwo_spider.py", line 158, in handle_final_content
third_resp.raise_for_status()
File "/opt/conda/envs/wenlv/lib/python3.8/site-packages/httpx/_models.py", line 761, in raise_for_status
raise HTTPStatusError(message, request=request, response=self)
httpx.HTTPStatusError: Server error '521 Connection Reset by Origin' for url 'https://www.mafengwo.cn/i/24424554.html'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/521