**Closed** · synodriver closed this 2 weeks ago
### Description

A spider with the following settings:

```python
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]
    custom_settings = {
        "DNS_RESOLVER": "test1.resolver.CachingAsyncDohResolver",
    }
```

does not honor the `DNS_RESOLVER` value from `custom_settings`.
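For what it's worth, the setting itself does appear to reach the merged spider settings; it is only the installed resolver that never changes. A hypothetical diagnostic (not part of the original report, names reused from the example above) that shows this:

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com"]
    custom_settings = {"DNS_RESOLVER": "test1.resolver.CachingAsyncDohResolver"}

    def parse(self, response):
        # custom_settings are merged into the crawler settings, so this logs
        # the custom value -- yet the reactor resolver never uses it.
        self.logger.info("DNS_RESOLVER = %s", self.settings.get("DNS_RESOLVER"))
```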
### Steps to Reproduce

Define the resolver (requires `aiodns`):

```python
import asyncio
import json
import re
import socket
from typing import List, Optional

import aiodns
import aiohttp
import scrapy.crawler
from scrapy.resolver import CachingThreadedResolver, dnscache
from scrapy.utils.defer import deferred_from_coro
from twisted.internet.interfaces import IResolverSimple
from zope.interface.declarations import implementer


@implementer(IResolverSimple)
class CachingAsyncResolver(CachingThreadedResolver):
    """Async caching resolver. Requires aiodns."""

    def __init__(
        self,
        reactor,
        cache_size,
        timeout,
        nameservers: Optional[List[str]] = None,
        **kwargs,
    ):
        super().__init__(reactor, cache_size, timeout)
        self._resolver = aiodns.DNSResolver(nameservers, None, **kwargs)
        # Matches dotted-quad IPv4 literals so they can be returned as-is.
        self._pattern = re.compile(
            r"((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.){3}(1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)"
        )

    @classmethod
    def from_crawler(cls, crawler: scrapy.crawler.Crawler, reactor):
        if crawler.settings.getbool("DNSCACHE_ENABLED"):
            cache_size = crawler.settings.getint("DNSCACHE_SIZE")
        else:
            cache_size = 0
        return cls(
            reactor,
            cache_size,
            crawler.settings.getfloat("DNS_TIMEOUT"),
            crawler.settings.getlist("AIODNS_NAMESERVERS", None),
            **crawler.settings.getdict("AIODNS_KW", {}),
        )

    def getHostByName(self, name, timeout=None):
        print(f"resolving name {name}")
        return deferred_from_coro(self._getHostByName(name, timeout))

    async def _getHostByName(self, name, timeout=None):
        if self._pattern.match(name) is not None:  # just an IP, return as-is
            return name
        if name in dnscache:
            return dnscache[name]
        try:
            resp = await asyncio.wait_for(
                self._resolver.gethostbyname(name, socket.AF_INET), timeout
            )
            result = resp.addresses[0]
            self._cache_result(result, name)
            return result
        except asyncio.TimeoutError:
            raise
        except aiodns.error.DNSError as exc:
            # exc.args is (errno, message), so index 1 needs len >= 2
            msg = exc.args[1] if len(exc.args) >= 2 else "DNS lookup failed"
            raise OSError(msg) from exc
```
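`from_crawler` reads two non-standard settings; for illustration, they could be configured like this (the values and the module path are hypothetical):

```python
# settings.py -- hypothetical values for the extra settings read in from_crawler
DNS_RESOLVER = "test1.resolver.CachingAsyncResolver"
AIODNS_NAMESERVERS = ["1.1.1.1", "8.8.8.8"]  # passed to aiodns.DNSResolver
AIODNS_KW = {"tries": 2}                     # forwarded as **kwargs
```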
And run the spider.

**Expected behavior:** The `resolving name ...` line printed by the resolver appears in the output.

**Actual behavior:** The crawler didn't call that resolver.

**Reproduces how often:** Always, on both Linux and Windows.

### Versions

Windows:

```
Scrapy       : 2.11.1
lxml         : 4.9.2.0
libxml2      : 2.9.12
cssselect    : 1.2.0
parsel       : 1.8.1
w3lib        : 2.1.1
Twisted      : 24.3.0
Python       : 3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:34:57) [MSC v.1936 64 bit (AMD64)]
pyOpenSSL    : 23.3.0 (OpenSSL 3.1.4 24 Oct 2023)
cryptography : 41.0.5
Platform     : Windows-10-10.0.19045-SP0
```

Linux:

```
Scrapy       : 2.11.1
lxml         : 4.9.2.0
libxml2      : 2.9.14
cssselect    : 1.2.0
parsel       : 1.8.1
w3lib        : 2.1.1
Twisted      : 24.3.0
Python       : 3.10.11 (main, Apr 20 2023, 19:02:41) [GCC 11.2.0]
pyOpenSSL    : 23.1.1 (OpenSSL 3.1.0 14 Mar 2023)
cryptography : 40.0.2
Platform     : Linux-5.10.0-13-amd64-x86_64-with-glibc2.31
```

### Additional context

It works if you put `DNS_RESOLVER` into the global `settings.py` instead.
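This is consistent with the resolver being installed from the process-level settings rather than the per-spider ones. A sketch (not from the original report) of the equivalent workaround done programmatically, setting the value on the `CrawlerProcess` settings before `start()`:

```python
# run.py -- a sketch, assuming a standard Scrapy project layout: setting
# DNS_RESOLVER on the process-level settings, which is where the reactor
# resolver is installed from (equivalent to putting it in settings.py).
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("DNS_RESOLVER", "test1.resolver.CachingAsyncDohResolver")

process = CrawlerProcess(settings)
process.crawl("example")  # spider name from the example above
process.start()
```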
Duplicate of #5988