parzival-au20 / sefa


beautiful #54

Open parzival-au20 opened 6 months ago

parzival-au20 commented 6 months ago

BeautifulSoap23-05.zip

parzival-au20 commented 6 months ago

```python
import requests
from bs4 import BeautifulSoup
import itertools

# Proxy list
proxy_list = ['http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port']

# Create a cycle to rotate through the proxy list
proxy_cycle = itertools.cycle(proxy_list)

# Set the target URL
url = 'http://example.com'

def fetch_with_proxy(url, proxy):
    # Route both HTTP and HTTPS traffic through the given proxy
    proxies = {
        'http': proxy,
        'https': proxy,
    }
    response = requests.get(url, proxies=proxies)
    return response

# Send 5 requests as an example
for _ in range(5):
    current_proxy = next(proxy_cycle)
    try:
        response = fetch_with_proxy(url, current_proxy)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            print(f"Title (proxy: {current_proxy}): {soup.title.string}")
        else:
            print(f"Request failed (proxy: {current_proxy}): {response.status_code}")
    except Exception as e:
        print(f"Proxy connection error ({current_proxy}): {e}")
```
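One caveat with the snippet above: `requests.get` has no default timeout, so a single dead proxy can hang the loop indefinitely. A minimal variant of `fetch_with_proxy` with an explicit timeout (the 5-second value is an assumption, tune it for your network):

```python
import requests

def fetch_with_proxy(url, proxy, timeout=5):
    # Route both HTTP and HTTPS traffic through the given proxy
    proxies = {'http': proxy, 'https': proxy}
    # An explicit timeout keeps a dead proxy from blocking forever;
    # 5 seconds is an assumed default, not a tested value
    return requests.get(url, proxies=proxies, timeout=timeout)
```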

parzival-au20 commented 6 months ago

```python
import time
import requests
from bs4 import BeautifulSoup
from convert_csv import save_to_csv, Content_Text_Control, text_to_date
import logging
import itertools

# Proxy list
proxy_list = ['http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port']

# Create a cycle to rotate through the proxy list
proxy_cycle = itertools.cycle(proxy_list)

def fetch_DefenceWeb_news():
    category = "Heli"
    web_site_name = "DefenceWeb"
    news_array = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }

    maxPage = 1
    for page_number in range(maxPage):
        url = "https://www.defenceweb.co.za/category/aerospace/aerospace/"
        current_proxy = next(proxy_cycle)
        proxies = {
            'http': current_proxy,
            'https': current_proxy,
        }
        try:
            for i in range(20):
                next(proxy_cycle)
                response = requests.get(url, headers=headers, verify="./certificate/TUSAS_DefenceWeb.crt", proxies=proxies)
                if response.status_code != 200:
                    logging.error(url + " " + response.reason + " " + str(response.status_code))
                    continue
                else:
                    break
            if response.status_code != 200:
                logging.error(response.reason + " " + str(response.status_code) + "  DefenceWeb scraping FAILED")
                return
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logging.error(f"Error fetching page {url}: {e}")
            continue

        news_list = soup.find_all(class_="td-image-wrap")
        news_links = [link['href'] for link in news_list]

        for link in news_links:
            try:
                for i in range(10):
                    next(proxy_cycle)
                    response = requests.get(link, headers=headers, verify="./certificate/TUSAS_DefenceWeb.crt", proxies=proxies)
                    if response.status_code != 200:
                        logging.error(response.reason + " " + str(response.status_code))
                        continue
                    else:
                        break
                if response.status_code != 200:
                    logging.error(response.reason + " " + str(response.status_code) + link)
                    continue
                news_soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logging.error(f"Error fetching news page {link}: {e}")
                continue

            title = news_soup.select_one("h1.entry-title").text.strip()
            news_text = news_soup.select_one(".td-post-content").text.strip()
            date = news_soup.select_one(".td-post-date > time").text.strip().replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
            img_url = news_soup.select_one(".td-modal-image")['src'] if news_soup.select_one(".td-modal-image") else None

            if Content_Text_Control(date, news_text, web_site_name):
                news_array.append([link, category, img_url, news_text, text_to_date(date, web_site_name), title, web_site_name])
            else:
                continue

    save_to_csv(news_array, web_site_name)
```
parzival-au20 commented 6 months ago

```python
import time
import requests
from bs4 import BeautifulSoup
from convert_csv import save_to_csv, Content_Text_Control, text_to_date
import logging
import itertools

# Proxy list
proxy_list = ['http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port']

# Create a cycle to rotate through the proxy list
proxy_cycle = itertools.cycle(proxy_list)

def fetch_DefenceWeb_news():
    category = "Heli"
    web_site_name = "DefenceWeb"
    news_array = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }

    maxPage = 1
    for page_number in range(maxPage):
        url = "https://www.defenceweb.co.za/category/aerospace/aerospace/"
        response = None

        for _ in range(20):  # up to 20 attempts
            current_proxy = next(proxy_cycle)
            proxies = {
                'http': current_proxy,
                'https': current_proxy,
            }
            try:
                response = requests.get(url, headers=headers, verify="./certificate/TUSAS_DefenceWeb.crt", proxies=proxies)
                if response.status_code == 200:
                    break
                else:
                    logging.error(url + " " + response.reason + " " + str(response.status_code))
            except Exception as e:
                logging.error(f"Error fetching page {url} with proxy {current_proxy}: {e}")

        if response is None or response.status_code != 200:
            # Guard against response being None: every attempt may have raised
            status = f"{response.reason} {response.status_code}" if response is not None else "no response"
            logging.error(status + "  DefenceWeb scraping FAILED")
            return

        soup = BeautifulSoup(response.text, 'html.parser')
        news_list = soup.find_all(class_="td-image-wrap")
        news_links = [link['href'] for link in news_list]

        for link in news_links:
            response = None
            for _ in range(10):  # up to 10 attempts
                current_proxy = next(proxy_cycle)
                proxies = {
                    'http': current_proxy,
                    'https': current_proxy,
                }
                try:
                    response = requests.get(link, headers=headers, verify="./certificate/TUSAS_DefenceWeb.crt", proxies=proxies)
                    if response.status_code == 200:
                        break
                    else:
                        logging.error(link + " " + response.reason + " " + str(response.status_code))
                except Exception as e:
                    logging.error(f"Error fetching news page {link} with proxy {current_proxy}: {e}")

            if response is None or response.status_code != 200:
                status = f"{response.reason} {response.status_code}" if response is not None else "no response"
                logging.error(status + " " + link)
                continue

            news_soup = BeautifulSoup(response.text, 'html.parser')
            try:
                title = news_soup.select_one("h1.entry-title").text.strip()
                news_text = news_soup.select_one(".td-post-content").text.strip()
                date = news_soup.select_one(".td-post-date > time").text.strip().replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
                img_url = news_soup.select_one(".td-modal-image")['src'] if news_soup.select_one(".td-modal-image") else None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([link, category, img_url, news_text, text_to_date(date, web_site_name), title, web_site_name])
            except Exception as e:
                logging.error(f"Error parsing news page {link}: {e}")

    save_to_csv(news_array, web_site_name)
```
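As a side note, the manual retry loops above could also be replaced with requests' built-in retry support via `urllib3`. A minimal sketch; the retry count, backoff factor, and status list are assumptions, not values tested against this site:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Build a session that retries transient failures automatically
session = requests.Session()
retry = Retry(
    total=5,                                   # assumed retry budget
    backoff_factor=1,                          # waits 1s, 2s, 4s, ... between attempts
    status_forcelist=[429, 500, 502, 503, 504],
)
session.mount('http://', HTTPAdapter(max_retries=retry))
session.mount('https://', HTTPAdapter(max_retries=retry))

# The session is then used like requests.get; note that Retry does not
# rotate proxies, so the per-call proxy rotation is still needed:
# response = session.get(url, headers=headers, proxies=proxies, timeout=10)
```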

parzival-au20 commented 6 months ago

Error fetching page https://www.defenceweb.co.za/category/aerospace/aerospace/: HTTPSConnectionPool(host='www.defenceweb.co.za', port=443): Max retries exceeded with url: /category/aerospace/aerospace/ (Caused by ProxyError('Unable to connect to proxy', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000205B95B75C0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it')))
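That `ProxyError` means the proxy itself refused the connection (WinError 10061), not that DefenceWeb rejected the request, so the placeholder `http://proxy1:port` entries (or dead proxies) are the likely cause. One way to rule out dead entries is to check each proxy before putting it into the cycle. A minimal sketch, assuming `https://httpbin.org/ip` as a test endpoint and a 5-second timeout (both arbitrary choices):

```python
import itertools
import requests

def is_proxy_alive(proxy, test_url="https://httpbin.org/ip", timeout=5):
    """Return True if the proxy can complete a simple request."""
    try:
        response = requests.get(
            test_url,
            proxies={'http': proxy, 'https': proxy},
            timeout=timeout,
        )
        return response.status_code == 200
    except requests.RequestException:
        return False

# Keep only the proxies that respond, then cycle over the survivors
alive = [p for p in proxy_list if is_proxy_alive(p)]
proxy_cycle = itertools.cycle(alive) if alive else None
```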