Scrape a website asynchronously using a list of tor circuits

ghost commented 2 years ago

I want to scrape a website asynchronously using a list of Tor circuits with different exit nodes, while making sure that each exit node makes at most one request every 5 seconds.

For testing purposes, I'm using the website https://books.toscrape.com/ and I'm lowering the sleep time, number of circuits and number of pages to scrape.

It works fine without Tor, but I get the following error when I use Tor:

2022-09-06 11:08:49,380 [DEBUG] Loaded 10 authorities dir
2022-09-06 11:08:49,383 [DEBUG] Loaded 141 fallbacks dir
2022-09-06 11:08:49,383 [DEBUG] Using selector: EpollSelector
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
{}

import asyncio
import logging
from typing import Dict, List, Optional, Tuple

import aiohttp
from docopt import docopt
from torpy import TorClient

# Root logging setup: every record at DEBUG or above is written both to
# the file "debug.log" and to a stream handler, using one shared format.
logging.basicConfig(
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.DEBUG,
)
# Module-level logger used by the scraper classes below.
logger = logging.getLogger(__name__)

def main():
    """
    Usage:
        scraper.py <url>... [--tor]
        scraper.py -h | --help

    Options:
        -h --help   Show this screen.
        --tor       Use tor to scrape website
    """
    # The docstring above doubles as the docopt usage specification, so it
    # is read at runtime and must stay in docopt's expected format.
    options = docopt(main.__doc__)
    # Positional URLs and the --tor flag are forwarded unchanged; the
    # value returned by scrape_website() is not used here.
    scrape_website(options['<url>'], options['--tor'])

def scrape_test_website() -> None:
    """Scrape catalogue pages 1-4 of books.toscrape.com with Tor enabled and print the result."""
    catalogue_root = "https://books.toscrape.com/catalogue/"
    page_urls = []
    for page_number in range(1, 5):
        page_urls.append(f"{catalogue_root}page-{page_number}.html")
    scraped = scrape_website(page_urls, tor=True)
    print(scraped)

def scrape_website(urls: List[str], tor: bool = False) -> Dict:
    """
    Scrape ``urls`` and return the scraper's ``master_dict``.

    ``TorWebScraper`` is used when ``tor`` is true, ``WebScraper``
    otherwise. The scraper's ``run()`` coroutine is driven to completion
    with ``asyncio.run`` before the result dictionary is returned.
    """
    scraper_cls = TorWebScraper if tor else WebScraper
    scraper = scraper_cls(urls)
    asyncio.run(scraper.run())
    return scraper.master_dict

class WebScraper:
    """
    Fetch a list of URLs concurrently and collect their raw HTML.

    After ``run()`` has completed:

    * ``all_data`` holds one entry per URL, in the order of ``urls``:
      a ``(url, text)`` tuple on success or ``None`` on failure.
    * ``master_dict`` maps each successfully fetched URL to
      ``{'raw_html': text}``.
    """

    def __init__(self, urls: List[str]):
        self.urls = urls
        self.all_data: List[Optional[Tuple[str, str]]] = []
        self.master_dict: Dict[str, Dict[str, str]] = {}

    async def fetch(self, url: str) -> Optional[Tuple[str, str]]:
        """
        Download ``url`` and return ``(url, body_text)``.

        Any exception raised while opening the session, sending the
        request or reading the body is logged together with the URL, and
        ``None`` is returned so that one failed page does not abort the
        whole ``asyncio.gather`` in ``run()``.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    text = await response.text()
        except Exception as e:
            # Include the URL: concurrent fetches otherwise produce
            # error lines that cannot be attributed to a request.
            logger.error("Failed to fetch %s: %s", url, e)
            return None
        return url, text

    async def run(self) -> None:
        """Fetch every URL concurrently and fill ``all_data`` and ``master_dict``."""
        tasks = [self.fetch(url) for url in self.urls]
        # gather() preserves the order of ``tasks``, so all_data lines up
        # with self.urls.
        self.all_data = await asyncio.gather(*tasks)
        for data in self.all_data:
            if data is None:
                # fetch() already logged the failure for this URL.
                continue
            url, text = data
            self.master_dict[url] = {'raw_html': text}

def get_circuits(n: int = 2) -> List:
    """
    Return the results of ``n`` calls to ``TorClient.create_circuit()``.

    A single ``TorClient`` is used for all calls, and its ``with`` block
    has already exited by the time the list is handed back to the caller.

    NOTE(review): this was originally described as returning one-hop
    circuits with different nodes, but no hop count or node selection is
    passed to ``create_circuit()`` here. Also confirm against torpy
    whether ``create_circuit()`` returns a ready circuit or a context
    manager that still has to be entered.
    """
    with TorClient() as client:
        return [client.create_circuit() for _ in range(n)]

class TorWebScraper(WebScraper):
    """``WebScraper`` variant whose ``fetch`` goes through the objects from ``get_circuits``."""

    def __init__(self, urls: List[str]):
        super().__init__(urls)
        # Two circuits are requested up front, when the scraper is built.
        self.circuits = get_circuits(2)

    async def fetch(self, url: str) -> str:
        """
        Request ``url`` through a stream opened on a circuit.

        Returns a ``(url, text)`` tuple on success (despite the ``str``
        annotation). Any exception is logged and the method then falls
        through, returning ``None``.

        The ``return`` sits inside the loop over ``self.circuits``, so a
        successful call ends on the first circuit, and an exception on
        any circuit leaves the loop through the ``except`` clause.

        NOTE(review): whether the items in ``self.circuits`` expose an
        async-compatible ``create_stream()`` and whether streams have a
        ``proxy`` attribute is not visible here; confirm against torpy.
        """
        pause_seconds = 0.02
        try:
            async with aiohttp.ClientSession() as http:
                for tor_circuit in self.circuits:
                    async with tor_circuit.create_stream() as tor_stream, \
                            http.get(url, proxy=tor_stream.proxy) as reply:
                        # Short pause before reading the response body.
                        await asyncio.sleep(pause_seconds)
                        body = await reply.text()
                        return url, body
        except Exception as exc:
            logger.error(exc)

if __name__ == '__main__':
    # The command-line entry point is disabled here so that running the
    # script performs the fixed test scrape instead.
    #main()
    scrape_test_website()

killerart commented 2 years ago

You need to create circuits like this:

# NOTE(review): `tor` is presumably an open TorClient and 3 the hop count; confirm against torpy.
with tor.create_circuit(3) as circuit:
     ...

ghost commented 2 years ago

So I can only use one circuit? There is no way of using multiple circuits in parallel?

For now, I solved this by using another script that calls the main script many times and puts each process in the background. I use torsocks -i, which uses 3 hops, while I only need one.

do-me commented 1 year ago

You can use more than one circuit in parallel. I invested quite some time in figuring out logic that handles parallel processing, failed requests, and new circuit creation. If you're still interested, have a look at the source code of fast-instagram-scraper.

torpyorg / torpy

Scrape a website asynchronously using a list of tor circuits #44