wkeeling / selenium-wire

Extends Selenium's Python bindings to give you the ability to inspect requests made by the browser.
MIT License
1.88k stars 246 forks source link

Threading #370

Open grinchify opened 3 years ago

grinchify commented 3 years ago
def threading_module():

    with open("proxies.txt", 'r') as file:
        proxy_list = [proxy.strip() for proxy in file]

    while True:
        print(f"There is {threading.active_count()} threads active")
        if threading.active_count() < 6
            t = ImageScraper(link, random.choice(proxy_list))
            t.start()

        else:
            time.sleep(10)
            pass

threading_module()

This is my threading my threading code. The code works and calls ImageScraper successfully and the whole loop works when using selenium without selenium wire. However If I am using seleniumwire it tends to call the class ImageScraper 5 times but then it stops. After printing I see that there are 11 threads remaining and I tend to get the following error sometimes:

127.0.0.1:49306: Traceback (most recent call last): File "C:\Users\safee\AppData\Roaming\Python\Python39\site-packages\seleniumwire\thirdparty\mitmproxy\net\tcp.py", line 196, in peek return self.o._sock.recv(length, socket.MSG_PEEK) ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

grinchify commented 3 years ago

Update - Even with a simple for loop, the thread count rises without the class being threaded. Just a simple for loop calling the class.

wkeeling commented 3 years ago

Thanks for raising this. Are you able to share the code for ImageScraper? That will help in reproducing the issue.

grinchify commented 3 years ago
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumwire import webdriver
import time, requests
import os.path
from os import path
from bs4 import BeautifulSoup
import random, json
import os
from datetime import date
import threading

current_time = str(date.today()) + f"-{int(time.time())}"
os.makedirs(f'captchas/{current_time}')
count = 0
lock = threading.Lock()

with open('config.json', 'r') as file:
    data = json.load(file)

number_of_browsers = data['Number of Browsers']
link = data['Link']
headless = data ['Headless']

class ImageScraper(threading.Thread):

    def __init__(self, link, proxy):
        threading.Thread.__init__(self)
        self.link = link
        self.proxy = proxy
        self.image_list = []

    def image_getter(self):
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument("--disable-blink-features=AutomationControlled") 
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36')
        if headless == True:
            options.add_argument('--headless')

        if self.proxy.count(":") == 3:

            host, port, user, password = self.proxy.split(":")
            self.proxy = f"{user}:{password}@{host}:{port}"

        sele_options = {
            'proxy': {
                'http': f'http://{self.proxy}',
                'https': f'https://{self.proxy}',
                'no_proxy': 'localhost,127.0.0.1'
            }
        }

        self.driver = webdriver.Chrome(options=options, seleniumwire_options=sele_options)
        self.driver.get(self.link)

        WebDriverWait(self.driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[title='widget containing checkbox for hCaptcha security challenge']")))
        print("Switching into frame")
        print("Waiting for captcha ...")
        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, "checkbox"))).click()
        print("Captcha Detected")
        self.driver.switch_to.default_content()
        WebDriverWait(self.driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[title='Main content of the hCaptcha challenge']")))
        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "icon")))
        print("Images loaded in")

        for request in self.driver.requests:  
            if "https://hcaptcha.com/getcaptcha?" in request.url:
                self.hcaptcha_obj = json.loads(request.response.body)
                break
        return 0

    def image_downloader(self):
        global count
        headers = {
            'accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            'referer': 'https://newassets.hcaptcha.com/',
            'sec-fetch-dest': 'image',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-site': 'same-site',
            'sec-gpc': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
        }

        for link in self.image_list:
            page = requests.get(link, headers = headers)
            lock.acquire()
            with open(f'captchas/{current_time}/{self.category}/image{count}.png', 'wb') as file:
                file.write(page.content)
            count += 1     
            lock.release()

        self.driver.close()  
        return 0    

    def hcaptcha_parser(self):

        self.category = self.hcaptcha_obj['requester_question']['en'].split()[-1]
        print(f"Category is {self.category}")
        lock.acquire()
        if not path.exists(f'captchas/{current_time}/{self.category}'):
            os.makedirs(f'captchas/{current_time}/{self.category}')
            print("New Category Detected. Making Folder for it now")
        else:
            print("Category already exists")
        lock.release()
        for task in self.hcaptcha_obj['tasklist']:
            self.image_list.append(task['datapoint_uri'])

        return 0

    def run(self):
        try:
            self.image_getter()
            self.hcaptcha_parser()
            self.image_downloader()
            return 0
        except:
            print("Error while finding captcha")
            self.driver.close()
            return 0

def threading_module():

    with open("proxies.txt", 'r') as file:
        proxy_list = [proxy.strip() for proxy in file]

    for i in range(0, number_of_browsers):
        print("Staring Browser")
        t = ImageScraper(link, random.choice(proxy_list))
        t.start()

    time.sleep(60)
    print(f"There are currently {threading.active_count()} threads active")
    threading_module()

threading_module()
grinchify commented 3 years ago

The threading is horrible in the code due to the bug. My real threading code if this were to work would be

def threader():

    with open("proxies.txt", 'r') as file:
        proxy_list = [proxy.strip() for proxy in file]

    while True:

        if threading.active_count() < number_of_browsers + 1:
            t = ImageScraper(link, random.choice(proxy_list))
            t.start()
        else:
            time.sleep(3)
            pass
grinchify commented 3 years ago

Issue still presists.