[Open] parzival-au20 opened this issue 4 months ago
import logging
import random
import re
import time

import requests
from bs4 import BeautifulSoup

from convert_csv import save_to_csv, Content_Text_Control, text_to_date
def fetch_DefenceWeb_news():
    """Scrape aerospace articles from DefenceWeb and persist them to CSV.

    Fetches the aerospace category listing, follows every article link found
    there, extracts title, body text, publication date and image URL, filters
    rows through ``Content_Text_Control`` and saves the survivors with
    ``save_to_csv``. Each HTTP request is retried with a randomly rotated
    User-Agent header; a request that never reaches HTTP 200 is logged and
    skipped (or, for the listing page itself, aborts the run).

    Returns:
        None. Results are written out via ``save_to_csv`` as a side effect.
    """
    category = "Heli"
    web_site_name = "DefenceWeb"
    news_array = []
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    ]

    def _get_with_retry(target_url, attempts):
        """GET ``target_url`` up to ``attempts`` times with a random User-Agent.

        Returns the response on HTTP 200, otherwise None after logging each
        failed attempt. Uses the site's pinned certificate for TLS verification.
        """
        for _ in range(attempts):
            headers = {"User-Agent": random.choice(user_agents)}
            response = requests.get(
                target_url,
                headers=headers,
                verify="./certificate/TUSAS_DefenceWeb.crt",
            )
            if response.status_code == 200:
                return response
            logging.error(
                "%s %s %s", target_url, response.reason, response.status_code
            )
        return None

    max_page = 1  # only the first listing page is scraped for now
    for _page_number in range(max_page):
        url = "https://www.defenceweb.co.za/category/aerospace/aerospace/"
        try:
            response = _get_with_retry(url, 20)
            if response is None:
                # Without the listing page there is nothing to scrape.
                logging.error("DefenceWeb scraping işlemi BAŞARISIZ")
                return
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            logging.error(f"Error fetching page {url}: {e}")
            continue

        # Guard with has_attr: a malformed anchor without href must not abort
        # the whole run with a KeyError.
        news_list = soup.find_all(class_="td-image-wrap")
        news_links = [link["href"] for link in news_list if link.has_attr("href")]

        for idx, link in enumerate(news_links):
            try:
                response = _get_with_retry(link, 10)
                if response is None:
                    continue
                news_soup = BeautifulSoup(response.text, "html.parser")
            except Exception as e:
                logging.error(f"Error fetching news page {link}: {e}")
                continue

            # select_one returns None when the element is absent; skip such
            # pages instead of crashing the scrape with an AttributeError.
            title_tag = news_soup.select_one("h1.entry-title")
            body_tag = news_soup.select_one(".td-post-content")
            date_tag = news_soup.select_one(".td-post-date > time")
            if title_tag is None or body_tag is None or date_tag is None:
                logging.error("Missing expected elements on %s, skipping", link)
                continue

            title = title_tag.text.strip()
            news_text = body_tag.text.strip()
            # Strip ordinal suffixes only when attached to a digit ("5th" ->
            # "5"). The previous blanket str.replace also mangled month names
            # (e.g. "August" -> "Augu"), breaking date parsing downstream.
            date = re.sub(r"(\d)(st|nd|rd|th)", r"\1", date_tag.text.strip())

            img_tag = news_soup.select_one(".td-modal-image")
            img_url = img_tag["src"] if img_tag else None

            if Content_Text_Control(date, news_text, web_site_name):
                news_array.append(
                    [
                        link,
                        category,
                        img_url,
                        news_text,
                        text_to_date(date, web_site_name),
                        title,
                        web_site_name,
                    ]
                )

            # Be polite to the server: pause after every 5 article requests.
            if (idx + 1) % 5 == 0:
                time.sleep(5)

    save_to_csv(news_array, web_site_name)
Traceback (most recent call last):
  File "C:\Users\s20128\Desktop\BeautifulSoap\main.py", line 5, in <module>
    from AinOnline import fetch_ainonline_news
  File "C:\Users\s20128\Desktop\BeautifulSoap\AinOnline.py", line 7, in <module>
    from requests_ntlm import HttpNtlmAuth
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\requests_ntlm\__init__.py", line 1, in <module>
    from .requests_ntlm import HttpNtlmAuth
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\requests_ntlm\requests_ntlm.py", line 5, in <module>
    from cryptography import x509
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\cryptography\x509\__init__.py", line 7, in <module>
    from cryptography.x509 import certificate_transparency, verification
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\cryptography\x509\certificate_transparency.py", line 11, in <module>
    from cryptography.hazmat.bindings._rust import x509 as rust_x509
ImportError: cannot import name 'x509' from 'cryptography.hazmat.bindings._rust' (unknown location)