Issue (open) — reported by parzival-au20, 6 months ago
import requests from bs4 import BeautifulSoup import itertools
# Placeholder proxy endpoints — replace with real host:port pairs before running.
proxy_list = [ 'http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port' ]
# Infinite round-robin iterator over the pool; each next(proxy_cycle) yields the following proxy.
proxy_cycle = itertools.cycle(proxy_list)
# Demo target URL fetched through the rotating proxies below.
url = 'http://example.com'
def fetch_with_proxy(url, proxy, timeout=10):
    """Fetch *url* through a single HTTP/HTTPS *proxy*.

    Parameters
    ----------
    url : str
        Target URL to request.
    proxy : str
        Proxy address, e.g. ``'http://host:port'``; used for both the
        ``http`` and ``https`` schemes.
    timeout : float, optional
        Seconds before the request is aborted (default 10).

    Returns
    -------
    requests.Response
        The raw response; the caller inspects ``status_code``.

    Raises
    ------
    requests.RequestException
        On connection/proxy failures or when the timeout elapses.
    """
    proxies = {
        'http': proxy,
        'https': proxy,
    }
    # Fix: the original had no timeout, so one unresponsive proxy could
    # block the caller indefinitely.
    return requests.get(url, proxies=proxies, timeout=timeout)
# Make five attempts, pulling the next proxy from the pool on each pass.
for _ in range(5):
    proxy = next(proxy_cycle)
    try:
        resp = fetch_with_proxy(url, proxy)
        if resp.status_code != 200:
            print(f"Başarısız istek (Proxy: {proxy}): {resp.status_code}")
        else:
            page = BeautifulSoup(resp.content, 'html.parser')
            print(f"Başlık (Proxy: {proxy}): {page.title.string}")
    except Exception as err:
        # Connection/proxy errors are reported but do not stop the loop.
        print(f"Proxy ile bağlantı hatası ({proxy}): {err}")
`import time import requests from bs4 import BeautifulSoup from convert_csv import save_to_csv, Content_Text_Control, text_to_date import logging import itertools
# Placeholder proxy endpoints — substitute real host:port values in deployment.
proxy_list = [ 'http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port' ]
# Round-robin iterator shared by all retry loops below; never exhausts.
proxy_cycle = itertools.cycle(proxy_list)
def _get_with_retry(target_url, headers, attempts):
    """GET *target_url*, rotating to a fresh proxy on every attempt.

    Fix over the original: the original advanced the cycle with a bare
    ``next(proxy_cycle)`` but never rebuilt the ``proxies`` mapping, so
    every retry silently reused the same proxy.  Here the drawn proxy is
    actually used for the request.

    Returns the first Response with status 200, or ``None`` after all
    *attempts* fail (each failure is logged).
    """
    for _ in range(attempts):
        current_proxy = next(proxy_cycle)
        proxies = {
            'http': current_proxy,
            'https': current_proxy,
        }
        try:
            # timeout keeps one dead proxy from hanging the whole scrape
            response = requests.get(target_url, headers=headers,
                                    verify="./certificate/TUSAS_DefenceWeb.crt",
                                    proxies=proxies, timeout=15)
        except Exception as e:
            logging.error(f"Error fetching page {target_url} with proxy {current_proxy}: {e}")
            continue
        if response.status_code == 200:
            return response
        logging.error(target_url + " " + response.reason + " " + str(response.status_code))
    return None


def fetch_DefenceWeb_news():
    """Scrape the DefenceWeb aerospace category and save articles to CSV.

    The category page gets up to 20 proxy-rotated attempts and each
    article page up to 10.  Parsed rows (link, category, image, text,
    date, title, site) are passed to ``save_to_csv``.  Returns nothing.
    """
    category = "Heli"
    web_site_name = "DefenceWeb"
    news_array = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"}
    maxPage = 1
    for page_number in range(maxPage):
        url = "https://www.defenceweb.co.za/category/aerospace/aerospace/"
        response = _get_with_retry(url, headers, attempts=20)
        if response is None:
            # Fix: the original dereferenced response.reason here even when
            # the request had never succeeded, which could raise instead of log.
            logging.error(url + " DefenceWeb scraping islemi BASARİSİZ")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        news_list = soup.find_all(class_="td-image-wrap")
        news_links = [link['href'] for link in news_list]
        for link in news_links:
            response = _get_with_retry(link, headers, attempts=10)
            if response is None:
                logging.error("Skipping article after failed retries: " + link)
                continue
            news_soup = BeautifulSoup(response.text, 'html.parser')
            try:
                title = news_soup.select_one("h1.entry-title").text.strip()
                news_text = news_soup.select_one(".td-post-content").text.strip()
                # Strip English ordinal suffixes (1st/2nd/3rd/4th) so the
                # downstream date parser can handle the day number.
                date = news_soup.select_one(".td-post-date > time").text.strip() \
                    .replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
                img_tag = news_soup.select_one(".td-modal-image")
                img_url = img_tag['src'] if img_tag else None
                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([link, category, img_url, news_text,
                                       text_to_date(date, web_site_name), title, web_site_name])
            except Exception as e:
                # Fix: the original parsed without a guard, so one malformed
                # article page aborted the entire run with an AttributeError.
                logging.error(f"Error parsing news page {link}: {e}")
    save_to_csv(news_array, web_site_name)
`import time import requests from bs4 import BeautifulSoup from convert_csv import save_to_csv, Content_Text_Control, text_to_date import logging import itertools
# Placeholder proxy endpoints — substitute real host:port values in deployment.
proxy_list = [ 'http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port' ]
# Round-robin iterator; every request attempt below draws a fresh proxy from it.
proxy_cycle = itertools.cycle(proxy_list)
def fetch_DefenceWeb_news():
    """Scrape the DefenceWeb aerospace category and persist articles to CSV.

    Every request attempt draws a fresh proxy from ``proxy_cycle``: the
    category page gets up to 20 attempts, each article up to 10.  Parsed
    rows are written with ``save_to_csv``.  Returns nothing.
    """
    category = "Heli"
    web_site_name = "DefenceWeb"
    news_array = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"}
    maxPage = 1
    for page_number in range(maxPage):
        url = "https://www.defenceweb.co.za/category/aerospace/aerospace/"
        response = None
        for _ in range(20):  # up to 20 attempts, rotating the proxy each time
            current_proxy = next(proxy_cycle)
            proxies = {
                'http': current_proxy,
                'https': current_proxy,
            }
            try:
                # timeout keeps a dead proxy from hanging the scraper forever
                response = requests.get(url, headers=headers,
                                        verify="./certificate/TUSAS_DefenceWeb.crt",
                                        proxies=proxies, timeout=15)
                if response.status_code == 200:
                    break
                logging.error(url + " " + response.reason + " " + str(response.status_code))
            except Exception as e:
                # Discard any stale Response from a previous attempt so the
                # post-loop check sees an honest failure state.
                response = None
                logging.error(f"Error fetching page {url} with proxy {current_proxy}: {e}")
        if response is None or response.status_code != 200:
            # Fix: the original read response.reason here even when response
            # was None (all attempts raised), crashing with AttributeError.
            detail = "no response" if response is None else response.reason + " " + str(response.status_code)
            logging.error(detail + " DefenceWeb scraping islemi BASARİSİZ")
            return
        soup = BeautifulSoup(response.text, 'html.parser')
        news_list = soup.find_all(class_="td-image-wrap")
        news_links = [link['href'] for link in news_list]
        for link in news_links:
            response = None
            for _ in range(10):  # up to 10 attempts per article page
                current_proxy = next(proxy_cycle)
                proxies = {
                    'http': current_proxy,
                    'https': current_proxy,
                }
                try:
                    response = requests.get(link, headers=headers,
                                            verify="./certificate/TUSAS_DefenceWeb.crt",
                                            proxies=proxies, timeout=15)
                    if response.status_code == 200:
                        break
                    logging.error(link + " " + response.reason + " " + str(response.status_code))
                except Exception as e:
                    response = None
                    logging.error(f"Error fetching news page {link} with proxy {current_proxy}: {e}")
            if response is None or response.status_code != 200:
                # Same None-safety fix as above: never touch .reason on None.
                detail = "no response" if response is None else response.reason + " " + str(response.status_code)
                logging.error(detail + " " + link)
                continue
            news_soup = BeautifulSoup(response.text, 'html.parser')
            try:
                title = news_soup.select_one("h1.entry-title").text.strip()
                news_text = news_soup.select_one(".td-post-content").text.strip()
                # Strip English ordinal suffixes (1st/2nd/3rd/4th) so the
                # downstream date parser can handle the day number.
                date = news_soup.select_one(".td-post-date > time").text.strip() \
                    .replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
                img_tag = news_soup.select_one(".td-modal-image")
                img_url = img_tag['src'] if img_tag else None
                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([link, category, img_url, news_text,
                                       text_to_date(date, web_site_name), title, web_site_name])
            except Exception as e:
                # One malformed article must not abort the whole run.
                logging.error(f"Error parsing news page {link}: {e}")
    save_to_csv(news_array, web_site_name)
`
Error fetching page https://www.defenceweb.co.za/category/aerospace/aerospace/: HTTPSConnectionPool(host='www.defenceweb.co.za', port=443): Max retries exceeded with url: /category/aerospace/aerospace/ (Caused by ProxyError('Unable to connect to proxy', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000205B95B75C0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it')))
BeautifulSoap23-05.zip