Open grinchify opened 3 years ago
Update - Even with a simple for loop, the thread count rises without the class being threaded. Just a simple for loop calling the class.
Thanks for raising this. Are you able to share the code for ImageScraper? That will help in reproducing the issue.
import json
import os
import os.path
import random
import threading
import time
from datetime import date
from os import path

import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire import webdriver
# Name of this run's output folder: "<YYYY-MM-DD>-<unix seconds>".
current_time = f"{date.today()}-{int(time.time())}"
os.makedirs(f'captchas/{current_time}')

# State shared by every scraper thread: the running image number and the
# lock that serialises access to it (and to category-folder creation).
count = 0
lock = threading.Lock()

# Runtime settings are read once from config.json.
with open('config.json', 'r') as file:
    data = json.load(file)

number_of_browsers, link, headless = (
    data['Number of Browsers'],
    data['Link'],
    data['Headless'],
)
class ImageScraper(threading.Thread):
    """Thread that drives one proxied Chrome session through an hCaptcha
    challenge and saves the challenge images to disk.

    The work is split into three steps that ``run`` executes in order:
    ``image_getter`` (open the browser and capture the ``getcaptcha``
    response), ``hcaptcha_parser`` (extract the category and image URLs) and
    ``image_downloader`` (fetch the images and write them out).

    The browser is always shut down with ``driver.quit()``.  ``close()`` only
    closes the current window, whereas ``quit()`` ends the whole WebDriver
    session; leaving sessions open appears to be why background threads kept
    accumulating when selenium-wire was in use.
    """

    # Seconds to wait for a single image download before giving up, so one
    # stalled request cannot keep a scraper thread alive indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, link, proxy):
        """Store the target ``link`` and the ``proxy`` string to use.

        ``proxy`` may be ``host:port`` or ``host:port:user:password``.
        """
        super().__init__()
        self.link = link
        self.proxy = proxy
        self.image_list = []
        # Populated later; initialised here so cleanup code can always
        # inspect them, even when an earlier step failed.
        self.driver = None
        self.hcaptcha_obj = None
        self.category = None

    @staticmethod
    def _format_proxy(proxy):
        """Turn ``host:port:user:password`` into ``user:password@host:port``.

        Any other shape is returned unchanged.
        """
        if proxy.count(":") == 3:
            host, port, user, password = proxy.split(":")
            return f"{user}:{password}@{host}:{port}"
        return proxy

    def _build_options(self):
        """Return the ChromeOptions used for every scraper browser."""
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36')
        # Kept as an explicit comparison: the value comes straight from
        # config.json and is not guaranteed to be a strict bool.
        if headless == True:  # noqa: E712
            options.add_argument('--headless')
        return options

    def _shutdown_driver(self):
        """Quit the browser session if one exists; safe to call repeatedly."""
        driver, self.driver = self.driver, None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as exc:
            # Best-effort cleanup: a failing quit must not mask the real
            # error or crash the thread.
            print(f"Error while shutting down browser: {exc!r}")

    def image_getter(self):
        """Open the page, trigger the captcha and capture its task payload.

        On success ``self.hcaptcha_obj`` holds the decoded JSON body of the
        first ``getcaptcha`` request that has a response.

        Returns:
            0 on success.

        Raises:
            RuntimeError: if no ``getcaptcha`` response was captured.
        """
        self.proxy = self._format_proxy(self.proxy)
        sele_options = {
            'proxy': {
                'http': f'http://{self.proxy}',
                'https': f'https://{self.proxy}',
                'no_proxy': 'localhost,127.0.0.1',
            }
        }
        self.driver = webdriver.Chrome(
            options=self._build_options(),
            seleniumwire_options=sele_options,
        )
        self.driver.get(self.link)

        wait = WebDriverWait(self.driver, 10)
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[title='widget containing checkbox for hCaptcha security challenge']")))
        print("Switching into frame")
        print("Waiting for captcha ...")
        wait.until(EC.presence_of_element_located((By.ID, "checkbox"))).click()
        print("Captcha Detected")
        self.driver.switch_to.default_content()
        wait.until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[title='Main content of the hCaptcha challenge']")))
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "icon")))
        print("Images loaded in")

        for request in self.driver.requests:
            if "https://hcaptcha.com/getcaptcha?" not in request.url:
                continue
            if request.response is None:
                # Request was captured but no response has arrived for it.
                continue
            self.hcaptcha_obj = json.loads(request.response.body)
            break
        else:
            raise RuntimeError("No getcaptcha response was captured")
        return 0

    def image_downloader(self):
        """Download every URL in ``self.image_list`` into the category folder.

        Files are numbered with the module-level ``count``, which is shared
        by all threads and therefore only touched while holding ``lock``.
        The browser is shut down once all images are written.

        Returns:
            0 on success.
        """
        global count
        headers = {
            'accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            'referer': 'https://newassets.hcaptcha.com/',
            'sec-fetch-dest': 'image',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-site': 'same-site',
            'sec-gpc': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
        }
        for link in self.image_list:
            page = requests.get(link, headers=headers, timeout=self.REQUEST_TIMEOUT)
            # ``with lock`` releases the lock even if the write fails, so one
            # failing thread cannot block all the others.
            with lock:
                with open(f'captchas/{current_time}/{self.category}/image{count}.png', 'wb') as file:
                    file.write(page.content)
                count += 1
        self._shutdown_driver()
        return 0

    def hcaptcha_parser(self):
        """Extract the category and image URLs from ``self.hcaptcha_obj``.

        The category is the last word of the English requester question.
        Its output folder is created if it does not exist yet.

        Returns:
            0 on success.
        """
        self.category = self.hcaptcha_obj['requester_question']['en'].split()[-1]
        print(f"Category is {self.category}")
        category_dir = f'captchas/{current_time}/{self.category}'
        # Check-then-create is done under the lock so two threads seeing the
        # same new category do not both try to create the folder.
        with lock:
            if not path.exists(category_dir):
                os.makedirs(category_dir)
                print("New Category Detected. Making Folder for it now")
            else:
                print("Category already exists")
        self.image_list.extend(
            task['datapoint_uri'] for task in self.hcaptcha_obj['tasklist']
        )
        return 0

    def run(self):
        """Thread entry point: run all three steps, then always clean up.

        Errors are reported and swallowed so a failing scraper never raises
        out of its thread.

        Returns:
            0 in every case.
        """
        try:
            self.image_getter()
            self.hcaptcha_parser()
            self.image_downloader()
        except Exception as exc:
            print("Error while finding captcha")
            print(f"Cause: {exc!r}")
        finally:
            # Runs on success and failure alike, and copes with the driver
            # never having been created.
            self._shutdown_driver()
        return 0
def threading_module():
    """Start ``number_of_browsers`` scraper threads per batch, forever.

    Each batch re-reads ``proxies.txt``, starts one ``ImageScraper`` per
    browser with a randomly chosen proxy, waits 60 seconds and then reports
    how many threads are still alive.

    Batches repeat in a loop rather than by the function calling itself, so
    the call stack does not grow with every batch.  This function does not
    return.
    """
    while True:
        with open("proxies.txt", 'r') as file:
            # Skip blank lines so an empty string is never picked as a proxy.
            proxy_list = [proxy.strip() for proxy in file if proxy.strip()]
        for _ in range(number_of_browsers):
            print("Staring Browser")
            t = ImageScraper(link, random.choice(proxy_list))
            t.start()
        time.sleep(60)
        print(f"There are currently {threading.active_count()} threads active")


threading_module()
The threading in the code above is horrible because of the bug. My real threading code, if this were to work, would be:
def threader():
    """Keep the number of live threads topped up to the configured limit.

    Proxies are read once from ``proxies.txt``.  The loop then runs forever:
    while ``threading.active_count()`` is below ``number_of_browsers + 1`` a
    new ``ImageScraper`` is started with a random proxy; otherwise it sleeps
    for three seconds before checking again.
    """
    with open("proxies.txt", 'r') as proxies_file:
        proxy_list = [line.strip() for line in proxies_file]

    while True:
        # At capacity: back off briefly instead of spinning.
        if not threading.active_count() < number_of_browsers + 1:
            time.sleep(3)
            continue
        worker = ImageScraper(link, random.choice(proxy_list))
        worker.start()
Issue still persists.
This is my threading code. The code works, calls ImageScraper successfully, and the whole loop works when using selenium without selenium wire. However, if I am using seleniumwire, it tends to call the class ImageScraper 5 times and then stops. After printing, I see that there are 11 threads remaining, and I sometimes get the following error:
127.0.0.1:49306: Traceback (most recent call last): File "C:\Users\safee\AppData\Roaming\Python\Python39\site-packages\seleniumwire\thirdparty\mitmproxy\net\tcp.py", line 196, in peek return self.o._sock.recv(length, socket.MSG_PEEK) ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host