JaredLGillespie / proxyscrape

Python library for retrieving free proxies (HTTP, HTTPS, SOCKS4, SOCKS5).
MIT License

proxyscrape returns None #34

Open VReunov opened 3 years ago

VReunov commented 3 years ago
>>> import proxyscrape
>>> collector = proxyscrape.create_collector('default', 'http')
>>> proxy = collector.get_proxy({'country': 'united states'})
>>> print(proxy)
None

Python 3.9.6
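
A quick way to narrow this down is to ask the collector for a proxy with no filter first; if that is already None, the sources are returning nothing at all, rather than the country filter being too strict (a minimal sketch; the 'code' filter key is per the library README):

>>> import proxyscrape
>>> collector = proxyscrape.create_collector('diag', 'http')
>>> collector.get_proxy()                 # no filter: is anything collected at all?
>>> collector.get_proxy({'code': 'us'})   # two-letter code instead of the full country name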

akguthal commented 3 years ago

I'm also having this issue

Mark7888 commented 3 years ago

same here

chikko80 commented 2 years ago

Did you guys find a solution?

yoarch commented 1 year ago

Still same error.

8xu commented 1 year ago

Having the same issue. I think this project isn't maintained anymore.

ydeng11 commented 8 months ago

Same issue. I think it is dead. I forked it and my fork is working now: proxyscrape

zero-stroke commented 4 months ago

> Same issue. I think it is dead. I forked it and my fork is working now: proxyscrape

Your version works, but I don't know why (or if I'm missing something) almost no free proxies work :( I even made a multithreaded script to mass-verify them:

"""
python version >= 3.11
"""
import concurrent.futures
import datetime
import json
import os
import subprocess
import time
from enum import auto, StrEnum
from threading import Lock

import proxyscrape
import requests
from proxyscrape import Proxy
from requests.exceptions import ProxyError, ReadTimeout, SSLError, ConnectionError as ConnectionError_

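# Browser-like headers so the test sites don't reject the default requests user agent.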
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0'
}

sleeptime = 1
ip_location_url = "https://www.iplocation.net/"
ipify_url = "https://api.ipify.org/?format=json"
timeout = 15

try:
    base_ip_address: str = requests.get(url=ipify_url, headers=headers, timeout=timeout).text
except requests.RequestException:
    print("Error getting base ip; the ip-change check will not be meaningful")
    base_ip_address = ""

class ProxyType(StrEnum):
    http = auto()
    https = auto()
    socks4 = auto()
    socks5 = auto()

def get_api_proxies(proxy_type) -> list[str]:
    proxy_response = requests.get(
        f"https://api.proxyscrape.com/?request=displayproxies"
        f"&proxytype={proxy_type}"
        "&timeout=10000"
        "&country=all"
        "&ssl=all"
        "&anonymity=all",
        timeout=timeout)

    # splitlines() instead of split("\r\n")[:-1]: the latter silently drops the
    # last proxy if the response has no trailing newline.
    all_proxies: list[str] = proxy_response.text.splitlines()

    for proxy in all_proxies:
        print(proxy)
    return all_proxies

def check_proxy(raw_proxy, proxy_type, selenium_check):
    print(f"\nProxy: {raw_proxy}")
    host, port = raw_proxy.split(":")  # 1.94.31.35:8888
    proxy = Proxy(host=host, port=port, code='us', country='', anonymous='T', type=proxy_type, source='')
    if validate_proxies(proxy, selenium_check):
        return proxy
    return None

def validate_proxies_multithreaded(all_proxies, proxy_type, selenium_check) -> list:
    good_proxies = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for proxy in executor.map(check_proxy, all_proxies, [proxy_type] * len(all_proxies),
                                  [selenium_check] * len(all_proxies)):
            if proxy is not None:
                good_proxies.append(proxy)
    return good_proxies

def test_proxies(selenium_check: bool, use_api: bool) -> list[Proxy]:
    """
    Quickly filter a list of proxies down to the ones that return a 200,
    then verify that the ip address actually changes, then optionally try
    the proxy with selenium.

    Example Proxy from the proxyscrape collector:
    Proxy(host='1.2.7.9', port='32', code='us', country='iran', anonymous=True, type='https', source='sslproxies')

    Parameters
    ----------
    selenium_check : bool
        Whether to additionally verify each proxy with selenium.
    use_api : bool
        Fetch proxies from the proxyscrape.com API instead of the collector.

    Returns
    -------
    list[Proxy]
        The proxies that passed every check.
    """
    good_proxies: list[Proxy] = []

    if use_api:
        proxy_type = ProxyType.socks4  # socks5 will only yield a handful
        all_proxies: list[str] = get_api_proxies(proxy_type)
        print(f"Looping through {len(all_proxies)} proxies")

        good_proxies = validate_proxies_multithreaded(all_proxies, proxy_type, selenium_check)

        print(f"Percentage of good proxies: {len(good_proxies) / len(all_proxies)}")
    else:
        quality_proxy_types = (ProxyType.https, ProxyType.socks4, ProxyType.socks5)
        collector = proxyscrape.create_collector('default', quality_proxy_types)
        num_valid_proxies = 5
        max_workers = 25

        good_proxies_lock = Lock()

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = set()

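            # Keep up to max_workers validation jobs in flight and harvest whichever
            # finishes first, so one slow proxy never stalls the whole batch.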
            while len(good_proxies) < num_valid_proxies:
                while len(futures) < max_workers:
                    proxy = collector.get_proxy()
                    if proxy is None:  # the bug in this issue: nothing collected from the sources
                        print("Collector returned None")
                        break
                    print(f"\nProxy: {proxy}")
                    futures.add(executor.submit(validate_proxies, proxy, selenium_check))

                if not futures:  # nothing in flight and nothing left to collect
                    break

                done, futures = concurrent.futures.wait(
                    futures,
                    return_when=concurrent.futures.FIRST_COMPLETED
                )

                for future in done:
                    result = future.result()
                    if result:
                        with good_proxies_lock:
                            good_proxies.append(result)

    # Colons are not allowed in Windows filenames, so keep the timestamp to '-' and '_'.
    date_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")  # '2024-06-22_17-34'
    filename = f"good_proxies_{date_time}.json"
    with open(filename, 'w') as f:
        json.dump([proxy._asdict() for proxy in good_proxies], f, indent=4)

    print(f"Number of good proxies: {len(good_proxies)}")
    return good_proxies

def validate_proxies(proxy: Proxy, selenium_check: bool) -> Proxy | None:
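    # requests routes http:// and https:// traffic separately, so point both schemes at the same proxy.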
    proxies = {
        "http": f"{proxy.type}://{proxy.host}:{proxy.port}",
        "https": f"{proxy.type}://{proxy.host}:{proxy.port}"
    }

    # ----- Check with requests -----
    try:
        response: requests.Response = requests.get(ip_location_url, headers=headers, proxies=proxies, timeout=7)
    except ProxyError:
        print("Proxy error")
        return None
    except SSLError:
        print("SSLError")
        return None
    except ConnectionError_:
        print("ConnectionError")
        return None
    except ReadTimeout:
        print("ReadTimeout error")
        return None
    if response.status_code != 200:
        print("Site rejected proxy")
        return None
    else:
        print("PASSED ip location check")

    try:
        response = requests.get(ipify_url, headers=headers, proxies=proxies, timeout=7)
    except requests.RequestException:
        print("ip change check request failed")
        return None
    if response.text == base_ip_address:
        print("Didn't change ip")
        return None
    else:
        print("PASSED ip change check")

    # ----- Check with selenium -----
    if selenium_check:  # Likely not necessary; an extra check if you want it and have the setup.
        from data_agg_constants import adblock_path
        from selenium import webdriver
        from selenium.webdriver.chrome.service import Service
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.support.wait import WebDriverWait

        import undetected_chromedriver as uc

        chrome_126_path = f"C:/Users/{os.getlogin()}/Downloads/chrome-win64 (2)/chrome-win64/chrome.exe"
        options = webdriver.ChromeOptions()
        options.binary_location = chrome_126_path
        options.add_extension(adblock_path)

        options.add_argument(f'--proxy-server=http://{proxy.host}:{proxy.port}')

        driver = uc.Chrome(service=Service(), options=options)  # noqa
        driver.set_page_load_timeout(timeout)

        try:
            driver.get(ip_location_url)
        except Exception:  # includes selenium's TimeoutException
            print(f"Loading failed with proxy {proxy.host}:{proxy.port}, trying a new proxy...")
            try:
                subprocess.run(["taskkill", "/F", "/IM", "chrome.exe"], check=True)
            except subprocess.CalledProcessError as err:
                if err.returncode == 128:  # ERROR_INVALID_HANDLE (0x80000003L)
                    print("Chrome.exe is not running.")
                else:
                    print(f"Error occurred: {err}")
            time.sleep(sleeptime)
            return None  # don't fall through to the wait below after a failed load

        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        print("PASSED: Page loaded successfully")
        time.sleep(40)  # leave the browser open long enough to eyeball the page

    return proxy  # all enabled checks passed

def main():
    good_proxies = test_proxies(selenium_check=False, use_api=True)
    print(f"Good proxies: {good_proxies}")

if __name__ == '__main__':
    main()

Oh, and I couldn't open an issue on the fork for some reason, so I posted here lol

ydeng11 commented 4 months ago

I think these sources are not working any more. These free proxy lists are little more than clickbait. The websites all have the same structure, which leads me to think they come from the same source/owner.
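
For what it's worth, you can check whether a source still returns anything by hitting the same API endpoint the script above uses; an empty body supports the dead-sources theory (a minimal sketch, nothing assumed beyond that URL):

import requests

url = ("https://api.proxyscrape.com/?request=displayproxies&proxytype=http"
       "&timeout=10000&country=all&ssl=all&anonymity=all")
r = requests.get(url, timeout=15)
# An empty or near-empty body means this free source has dried up.
print(r.status_code, len(r.text.splitlines()), "proxies returned")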