deedy5 / duckduckgo_search

Search for words, documents, images, videos, news, maps and text translation using the DuckDuckGo.com search engine. Downloading files and images to a local hard drive.
MIT License
927 stars 117 forks source link

facing SSL connection timeout OR Proxy CONNECT timeout with AsyncDDGS #191

Closed adhadse closed 3 months ago

adhadse commented 3 months ago

Hey there.

I'm facing an issue running AsyncDDGS with a proxy (HTTP endpoint). Even after multiple retries, many searches fail and only a few return results.

The errors I received most prominently were:

Could it be that too many connections are being made by the async code, and so multiple connections are failing via the proxy?

Code:

import asyncio
import itertools
from functools import wraps

import loguru
from duckduckgo_search import AsyncDDGS

# Module-wide loguru logger; the retry and exception-handling wrappers below log through it.
logger = loguru.logger

# Async retry decorator
def wait_incrementing(start, increment, max_wait):
    """Build an async wait strategy whose delay grows linearly up to a cap.

    The returned coroutine function takes the retry number and resolves to
    ``start + increment * retry_number``, limited to at most ``max_wait``.
    """
    async def wait_strategy(retry_number):
        delay = start + increment * retry_number
        # Clamp to the ceiling once the linear delay overshoots it.
        return max_wait if max_wait < delay else delay
    return wait_strategy

def stop_after_attempt(max_attempts):
    """Build an async stop strategy that gives up after a fixed retry count.

    The returned coroutine function takes the retry number and resolves to
    True once it has reached ``max_attempts``, False before that.
    """
    async def stop_strategy(retry_number):
        attempts_exhausted = retry_number >= max_attempts
        return attempts_exhausted
    return stop_strategy

def async_retry(wait=None, stop=None):
    """Decorator factory that re-runs an async callable whenever it raises.

    Args:
        wait: Async callable ``wait(retry_number)`` resolving to the number
            of seconds to sleep before the next attempt. Defaults to no
            delay.
        stop: Async callable ``stop(retry_number)`` resolving to True when
            retrying should give up. Defaults to never giving up.

    Returns:
        A decorator for coroutine functions. The decorated coroutine returns
        the wrapped function's result from the first attempt that succeeds.

    Raises:
        The exception from the last attempt, once ``stop`` resolves to True.
    """
    if wait is None:
        # The previous default (asyncio.sleep itself) resolved to None, which
        # was then handed to asyncio.sleep() and raised a TypeError.
        async def wait_strategy(retry_number):
            return 0
    else:
        wait_strategy = wait

    if stop is None:
        async def stop_strategy(retry_number):
            return False
    else:
        stop_strategy = stop

    def decorator(func):
        # Accept any call signature: the old wrapper demanded a positional
        # ``self`` and so broke plain functions called with keywords only.
        @wraps(func)
        async def wrapper(*args, **kwargs):
            retry_number = 0
            while True:
                try:
                    return await func(*args, **kwargs)
                except Exception:
                    retry_number += 1
                    logger.info(
                        f"Retrying {retry_number}",
                        # .get() so a call without ``query`` cannot raise a
                        # KeyError here and hide the original failure.
                        keyword=kwargs.get("query"),
                    )
                    if await stop_strategy(retry_number):
                        raise
                    await asyncio.sleep(await wait_strategy(retry_number))
        return wrapper
    return decorator

def exception_handling(func):
    """Decorate an async callable so that failures yield an empty list.

    Any ``Exception`` raised by ``func`` is logged with its traceback via
    ``logger.exception`` and replaced by ``[]``; successful results are
    returned unchanged.
    """
    # Accept any call signature: the old wrapper demanded a positional
    # ``self`` and so broke plain functions called with keywords only.
    @wraps(func)
    async def wrapper(*args, **kwargs):
        try:
            return await func(*args, **kwargs)
        except Exception as e:
            logger.exception(f"Exception handled {e}")
            return []
    return wrapper

# operation functions

@exception_handling
@async_retry(
    wait=wait_incrementing(start=5, increment=10, max_wait=30),
    stop=stop_after_attempt(5)
)
async def aget_results(query):
    """Fetch up to 20 news results from the past day for ``query``.

    The decorators configure retries with an incrementing wait (capped at
    30 seconds) that stop after five attempts, and ``exception_handling``
    turns a final failure into an empty list.

    Args:
        query: Search keywords passed to ``AsyncDDGS.news``.

    Returns:
        The value returned by ``AsyncDDGS.news``, or ``[]`` on failure.
    """
    # NOTE(review): PROXY is not defined anywhere in this snippet; it has to
    # be supplied at module level before this function is called.
    return await AsyncDDGS(proxies=PROXY).news(
        keywords=query,  # previously the undefined name ``word``
        region="wt-wt",
        safesearch="off",
        timelimit="d",  # day filter; the original "1d" was flagged as incorrect
        max_results=20,
    )

async def main():
    """Run one search per word concurrently and merge all hits into one list.

    Prints the total number of hits and returns the flattened list.
    """
    words = ["sun", "earth", "moon"] * 1_000  # increase the number
    per_word = await asyncio.gather(*(aget_results(query=word) for word in words))
    # Flatten the per-word result lists into a single list.
    flattened = [hit for batch in per_word for hit in batch]
    print(len(flattened))
    return flattened

# Script entry point: run main() to completion and print the number of results
# (main() prints the same count itself, so it appears twice).
if __name__ == "__main__":
    results = asyncio.run(main())
    print(len(results))

What could be the problem? If the number of connections is the problem, how can I control it? I remember that semaphores exist, but how do I use one here?

deedy5 commented 3 months ago

Your code doesn't work

deedy5 commented 3 months ago

Use asyncio.Semaphore to limit the number of concurrent requests. Increase timeout. Correct 'timelimit'.

import asyncio
import logging

from duckduckgo_search import AsyncDDGS

# Emit DEBUG-level (and above) log records from the root logger.
logging.basicConfig(level=logging.DEBUG)
# At most 10 coroutines may hold SEM at once; aget_results acquires it around each search.
SEM = asyncio.Semaphore(10)
# Proxy URL handed to every AsyncDDGS instance created in aget_results.
proxies = "socks5://localhost:9150"

async def aget_results(keywords, max_attempts=None, retry_delay=0):
    """Search DuckDuckGo news for ``keywords``, retrying on any failure.

    Concurrency is limited by the module-level ``SEM`` semaphore, which is
    held for the whole call, including any delay between attempts.

    Args:
        keywords: Search phrase passed to ``AsyncDDGS.news``.
        max_attempts: Maximum number of tries before the last exception is
            re-raised. ``None`` (the default) retries indefinitely, which is
            the original behaviour.
        retry_delay: Seconds to sleep between attempts. ``0`` (the default)
            retries immediately, which is the original behaviour.

    Returns:
        The value returned by ``AsyncDDGS.news`` on the first success.

    Raises:
        Exception: The last failure, once ``max_attempts`` is exhausted.
    """
    attempt = 0
    async with SEM:
        while True:
            attempt += 1
            try:
                return await AsyncDDGS(proxies=proxies, timeout=20).news(
                    keywords,
                    region="wt-wt",
                    safesearch="off",
                    timelimit="d",
                    max_results=20,
                )
            except Exception as ex:
                logging.warning(f"{type(ex).__name__}: {ex}")
                # Give up only when the caller asked for a bounded retry.
                if max_attempts is not None and attempt >= max_attempts:
                    raise
            if retry_delay > 0:
                # Back off before trying again instead of retrying at once.
                await asyncio.sleep(retry_delay)

async def main():
    """Search every word concurrently, print each result and its length.

    Returns the gathered results, one entry per word, in input order.
    """
    words = ["sun", "earth", "moon"]
    results = await asyncio.gather(*(aget_results(keywords=word) for word in words))
    for hits in results:
        print(hits)
        print(len(hits))
    return results

# Script entry point: run main() to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(main())