tasos-py / Search-Engines-Scraper

Search Google, Bing, Yahoo, and other search engines with Python
MIT License

DuckDuckGo scraper fails #59

Closed sander-van-damme closed 1 year ago

sander-van-damme commented 1 year ago

Hi there!

I noticed an issue with the DuckDuckGo scraper. The response HTML doesn't contain any search results, only some JavaScript, so the selectors don't match anything.

I created a temporary fix below (based on the Google scraper), which uses html.duckduckgo.com instead of duckduckgo.com.
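
For reference, this is roughly the check I did to confirm that html.duckduckgo.com serves plain HTML that the existing result selectors can match (a standalone sketch with requests and BeautifulSoup, not part of the library):

import requests
from bs4 import BeautifulSoup

# Query the HTML-only endpoint directly and count result links.
resp = requests.get(
    'https://html.duckduckgo.com/html/',
    params={'q': 'test'},
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=10,
)
soup = BeautifulSoup(resp.text, 'html.parser')
print(len(soup.select('a.result__a')))  # non-zero here, unlike the JS-only page on duckduckgo.com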

Thanks!

output:

Searching Duckduckgo
Traceback (most recent call last):
  File "C:\Users\default\scrape.py", line 10, in <module>
    engine.search('test')
  File "C:\Users\default\search_engines\engine.py", line 162, in search
    response = self._get_page(request['url'], request['data'])
  File "C:\Users\default\search_engines\engines\duckduckgo.py", line 44, in _get_page
    response = self._http_client.get(page)
  File "C:\Users\default\search_engines\http_client.py", line 21, in get
    page = self._quote(page)
  File "C:\Users\default\search_engines\http_client.py", line 41, in _quote
    if utl.decode_bytes(utl.unquote_url(url)) == utl.decode_bytes(url):
  File "C:\Users\default\search_engines\utils.py", line 15, in unquote_url
    return decode_bytes(requests.utils.unquote(url))
  File "C:\Users\default\AppData\Local\Programs\Python\Python310\lib\urllib\parse.py", line 655, in unquote
    if '%' not in string:

fix:

from ..engine import SearchEngine
from ..config import PROXY, TIMEOUT, FAKE_USER_AGENT
from ..utils import unquote_url, quote_url

class Duckduckgo(SearchEngine):
    '''Searches duckduckgo.com'''
    def __init__(self, proxy=PROXY, timeout=TIMEOUT):
        super(Duckduckgo, self).__init__(proxy, timeout)
        self._base_url = u'https://html.duckduckgo.com'
        self._current_page = 1
        self.set_headers({'User-Agent':FAKE_USER_AGENT})

    def _selectors(self, element):
        '''Returns the appropriate CSS selector.'''
        selectors = {
            'url': 'a.result__a', 
            'title': 'a.result__a', 
            'text': 'a.result__snippet',
            'links': 'div#links div.result',
            'next': 'input[value="next"]'
        }
        return selectors[element]

    def _first_page(self):
        '''Returns the initial page and query.'''
        url = u'{}/html/?q={}'.format(self._base_url, quote_url(self._query, ''))
        return {'url':url, 'data':None}

    def _next_page(self, tags):
        '''Returns the next page URL and post data (if any)'''
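        # Carried over from the Google scraper: the 'next' selector has no {page}
        # placeholder, so the .format() call below is a no-op and paging doesn't
        # actually advance here (see the follow-up fix further down).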
        self._current_page += 1
        selector = self._selectors('next').format(page=self._current_page)
        next_page = self._get_tag_item(tags.select_one(selector), 'href')
        url = None
        if next_page:
            url = self._base_url + next_page
        return {'url':url, 'data':None}

    def _get_url(self, tag, item='href'):
        '''Returns the URL of search results item.'''
        selector = self._selectors('url')
        url = self._get_tag_item(tag.select_one(selector), item)

        if url.startswith(u'/url?q='):
            url = url.replace(u'/url?q=', u'').split(u'&sa=')[0]
        return unquote_url(url)
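
For completeness, the script in the traceback boils down to something like this. The import path comes from the traceback; the links() call is how I read the results object's usual interface, so adjust if yours differs:

from search_engines.engines.duckduckgo import Duckduckgo

engine = Duckduckgo()
results = engine.search('test')  # with the fix above this returns parsed results instead of crashing
for url in results.links():      # assumes the results object exposes links(); adjust as needed
    print(url)
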
tasos-py commented 1 year ago

Thanks for bringing this to my attention, and for taking the time to solve it. Your contribution is deeply appreciated. I hope you don't mind me using your code until I find a solution for the main domain.

sander-van-damme commented 1 year ago

Hi, I see you reopened this issue. Probably because the scraper doesn't go to the next page. I encountered this too a couple of days ago and created a quick & dirty fix.

from ..engine import SearchEngine
from ..config import PROXY, TIMEOUT, FAKE_USER_AGENT
from ..utils import unquote_url, quote_url
from urllib.parse import unquote

class Duckduckgo(SearchEngine):
    '''Searches duckduckgo.com'''
    def __init__(self, proxy=PROXY, timeout=TIMEOUT):
        super(Duckduckgo, self).__init__(proxy, timeout)
        self._base_url = u'https://html.duckduckgo.com'
        self._delay = (2, 6)
        self._current_page = 1
        self.set_headers({'User-Agent':FAKE_USER_AGENT})

    def _selectors(self, element):
        '''Returns the appropriate CSS selector.'''
        selectors = {
            'url': 'a.result__a', 
            'title': 'a.result__a', 
            'text': 'a.result__snippet',
            'links': 'div#links div.result',
            'next': {'form':'form', 'submit': 'input[value="Next"]'}
        }
        return selectors[element]

    def _first_page(self):
        '''Returns the initial page and query.'''
        url = u'{}/html/?q={}'.format(self._base_url, quote_url(self._query, ''))
        return {'url':url, 'data':None}

    def _next_page(self, tags):
        '''Returns the next page URL and post data (if any)'''
        selector = self._selectors('next')
        forms = [
            form 
            for form in tags.select(selector['form']) 
            if form.select(selector['submit'])
        ]
        url = None
        if forms:
            url = self._base_url + forms[0]['action'] + '?'
            for inp in forms[0].select('input[name]'):
                url += f'{inp["name"]}={inp.get("value", "")}&'
        return {'url': url, 'data': None}

    def _get_url(self, tag, item='href'):
        '''Returns the URL of search results item.'''
        selector = self._selectors('url')
        url = self._get_tag_item(tag.select_one(selector), item)
        if url.startswith(u'//duckduckgo.com/l/?uddg='):
            url = url.replace(u'//duckduckgo.com/l/?uddg=', u'').split(u'&rut')[0]
        return unquote(url)
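
As a side note, a slightly cleaner way to build that next-page URL would be to let urllib do the encoding. This is just a hypothetical helper sketching the same logic as the loop above (the _build_next_url name is mine, it isn't part of the fix):

from urllib.parse import urlencode

def _build_next_url(base_url, form):
    '''Builds the next-page GET URL from a form's named inputs.'''
    params = {
        inp['name']: inp.get('value', '')
        for inp in form.select('input[name]')
    }
    # urlencode quotes the values, unlike plain string concatenation
    return base_url + form['action'] + '?' + urlencode(params)
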
tasos-py commented 1 year ago

Actually, I reopened it by accident! Please don't waste any more of your time on this; you've already done more than enough. I'll take care of any remaining issues when I have some free time.