Open · VReunov opened this issue 3 years ago
I'm also having this issue
same here
Did you guys find a solution?
Still same error.
Having the same issue. I think this project is not maintained anymore...
Same issue. I think it is dead. I forked it, and my fork is working now: proxyscrape
Your version works, but (I don't know why, or maybe I'm missing something) almost no free proxies work :( and I even made a multithreaded script to mass-verify them:
"""
python version >= 3.11
"""
import concurrent.futures
import datetime
import json
import os
import subprocess
import time
from enum import StrEnum, auto
from threading import Lock

import proxyscrape
import requests
from proxyscrape import Proxy
from requests.exceptions import ConnectionError as ConnectionError_
from requests.exceptions import ProxyError, ReadTimeout, SSLError
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Cache-Control': 'max-age=0'
}
sleeptime = 1
ip_location_url = "https://www.iplocation.net/"
ipify_url = "https://api.ipify.org/?format=json"
timeout = 15
try:
    base_ip_address: str = requests.get(url=ipify_url, headers=headers, timeout=timeout).json()["ip"]
except (SSLError, ConnectionError_, ReadTimeout) as e:
    # Without a baseline IP, the "did the IP change" check below is meaningless.
    raise SystemExit(f"Error getting base IP address: {e}")
class ProxyType(StrEnum):
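    # auto() on a StrEnum makes each member's value its lowercase name
    # ("http", "socks4", ...), so members format cleanly into the API URL below.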
http = auto()
https = auto()
socks4 = auto()
socks5 = auto()
def get_api_proxies(proxy_type) -> list[str]:
    """Fetch a plain-text proxy list (one "host:port" per line) from the proxyscrape API."""
    proxy_response = requests.get(
        f"https://api.proxyscrape.com/?request=displayproxies"
        f"&proxytype={proxy_type}"
        "&timeout=10000"
        "&country=all"
        "&ssl=all"
        "&anonymity=all",
        timeout=timeout)
    all_proxies: list[str] = proxy_response.text.split("\r\n")[:-1]  # drop the trailing empty entry
    for proxy in all_proxies:
        print(proxy)
    return all_proxies
def check_proxy(raw_proxy: str, proxy_type, selenium_check: bool) -> Proxy | None:
    print(f"\nProxy: {raw_proxy}")
    host, port = raw_proxy.split(":")  # e.g. "1.94.31.35:8888"
    proxy = Proxy(host=host, port=port, code='us', country='', anonymous='T', type=proxy_type, source='')
    if validate_proxies(proxy, selenium_check):
        return proxy
    return None
def validate_proxies_multithreaded(all_proxies, proxy_type, selenium_check) -> list:
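    # executor.map fans check_proxy out over a default-sized thread pool and
    # yields results in input order; the repeated lists broadcast proxy_type
    # and selenium_check to every call.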
good_proxies = []
with concurrent.futures.ThreadPoolExecutor() as executor:
for proxy in executor.map(check_proxy, all_proxies, [proxy_type] * len(all_proxies),
[selenium_check] * len(all_proxies)):
if proxy is not None:
good_proxies.append(proxy)
return good_proxies
def test_proxies(selenium_check: bool, use_api: bool) -> list[Proxy]:
"""
Quickly filter through list of proxies and find only the ones that have valid 200 response codes.
Then verify that the ip address does indeed change.
Then try with selenium.
Example Proxy from proxyscrape collector:
Proxy(host='1.2.7.9',port='32',code='us', country='iran', anonymous=T, type='https', source='sslproxies')
Parameters
----------
selenium_check : bool
To check using selenium or not.
use_api : bool
Returns
-------
"""
good_proxies: list[Proxy] = []
if use_api:
proxy_type = ProxyType.socks4 # socks5 will only yield a handful
all_proxies: list[str] = get_api_proxies(proxy_type)
print(f"Looping through {len(all_proxies)} proxies")
good_proxies = validate_proxies_multithreaded(all_proxies, proxy_type, selenium_check)
print(f"Percentage of good proxies: {len(good_proxies) / len(all_proxies)}")
else:
quality_proxy_types = (ProxyType.https, ProxyType.socks4, ProxyType.socks5)
collector = proxyscrape.create_collector('default', quality_proxy_types)
num_valid_proxies = 5
max_workers = 25
good_proxies_lock = Lock()
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = set()
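            # Keep at most max_workers checks in flight and harvest results as
            # soon as the first future finishes (FIRST_COMPLETED below).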
while len(good_proxies) < num_valid_proxies:
                while len(futures) < max_workers:
                    proxy = collector.get_proxy()
                    if proxy is None:  # collector has nothing available right now
                        time.sleep(sleeptime)
                        continue
                    print(f"\nProxy: {proxy}")
                    futures.add(executor.submit(validate_proxies, proxy, selenium_check))
done, futures = concurrent.futures.wait(
futures,
return_when=concurrent.futures.FIRST_COMPLETED
)
for future in done:
result = future.result()
if result:
with good_proxies_lock:
good_proxies.append(result)
    date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")  # e.g. '2024-06-22_17-34'
    # Note: the ':' produced by "%H:%M" is not a legal character in Windows filenames.
    filename = f"good_proxies_{date_time}.json"
with open(filename, 'w') as f:
json.dump([proxy._asdict() for proxy in good_proxies], f, indent=4)
print(f"Number of good proxies: {len(good_proxies)}")
return good_proxies
def validate_proxies(proxy: Proxy, selenium_check: bool) -> Proxy | None:
proxies = {
"http": f"{proxy.type}://{proxy.host}:{proxy.port}",
"https": f"{proxy.type}://{proxy.host}:{proxy.port}"
}
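    # requests routes both http and https traffic through the same proxy URL;
    # socks4/socks5 schemes require the requests[socks] extra (PySocks).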
# ----- Check with requests -----
try:
response: requests.Response = requests.get(ip_location_url, headers=headers, proxies=proxies, timeout=7)
except ProxyError:
print("Proxy error")
return None
    except SSLError:
        print("SSLError")
        return None
except ConnectionError_:
print("ConnectionError")
return None
except ReadTimeout:
print("ReadTimeout error")
return None
    if response.status_code != 200:
        print("Site rejected proxy")
        return None
    print("PASSED ip location check")
    try:
        response = requests.get(ipify_url, headers=headers, proxies=proxies, timeout=timeout)
        proxy_ip: str = response.json()["ip"]
    except (ProxyError, SSLError, ConnectionError_, ReadTimeout, ValueError):
        print("Failed to reach ipify through the proxy")
        return None
    if proxy_ip == base_ip_address:
        print("Didn't change ip")
        return None
    print("PASSED ip change check")
# ----- Check with selenium -----
if selenium_check: # Likely not necessary, extra check if you want it and have the setup.
from data_agg_constants import adblock_path
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import undetected_chromedriver as uc
chrome_126_path = f"C:/Users/{os.getlogin()}/Downloads/chrome-win64 (2)/chrome-win64/chrome.exe"
options = webdriver.ChromeOptions()
options.binary_location = chrome_126_path
options.add_extension(adblock_path)
options.add_argument(f'--proxy-server=http://{proxy.host}:{proxy.port}')
driver = uc.Chrome(service=Service(), options=options) # noqa
driver.set_page_load_timeout(timeout)
try:
driver.get(ip_location_url)
        except Exception:  # TimeoutException or any other load failure
print(f"Loading failed with proxy {proxy.host}:{proxy.port}, trying a new proxy...")
try:
subprocess.run(["taskkill", "/F", "/IM", "chrome.exe"], check=True)
except subprocess.CalledProcessError as err:
                if err.returncode == 128:  # taskkill exit code 128: no such process
                    print("Chrome.exe is not running.")
else:
print(f"Error occurred: {err}")
return None
time.sleep(sleeptime)
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        print("PASSED: Page loaded successfully")
        time.sleep(40)  # leave the page up briefly before closing the browser
        driver.quit()
    return proxy
def main():
good_proxies = test_proxies(selenium_check=False, use_api=True)
print(f"Good proxies: {good_proxies}")
if __name__ == '__main__':
    main()
```
Oh, and I couldn't open an issue on it for some reason, so I posted here lol
I think these sources are not working anymore. These free proxy lists are basically clickbait: the sites all share the same structure, which leads me to think they come from the same source/owner.
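If anyone wants to test that theory, here is a rough sketch that measures the overlap between two of the lists. The first URL is the proxyscrape endpoint from the script above; `SECOND_LIST_URL` is just a placeholder for any other plain-text list (one host:port per line) you want to compare against. A high Jaccard index would support the same-owner theory.

```python
"""Rough overlap check between two free proxy lists."""
import requests

PROXYSCRAPE_URL = ("https://api.proxyscrape.com/?request=displayproxies"
                   "&proxytype=socks4&timeout=10000&country=all&ssl=all&anonymity=all")
SECOND_LIST_URL = "https://example.com/other-free-list.txt"  # placeholder, not a real list


def fetch_list(url: str) -> set[str]:
    # Assumes the endpoint returns plain text, one "host:port" per line.
    text = requests.get(url, timeout=15).text
    return {line.strip() for line in text.splitlines() if line.strip()}


list_a = fetch_list(PROXYSCRAPE_URL)
list_b = fetch_list(SECOND_LIST_URL)
overlap = list_a & list_b
union = list_a | list_b
print(f"A: {len(list_a)}, B: {len(list_b)}, overlap: {len(overlap)}")
if union:
    print(f"Jaccard index: {len(overlap) / len(union):.2%}")
```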
Python 3.9.6