Open parzival-au20 opened 5 months ago
import re

# Strip English ordinal suffixes ("1st", "2nd", "3rd", "16th") only when they
# directly follow a digit. Removing the bare letter pairs anywhere in the
# string would also damage month and weekday names (e.g. "August" -> "Augu").
# IGNORECASE additionally covers upper-case suffixes such as "16TH".
text = re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', text, flags=re.IGNORECASE)
from datetime import datetime
# Parse a date whose month, day and year are separated by newlines.
date_str = "Apr\n16\n2024"
date_format = "%b\n%d\n%Y"
date_obj = datetime.strptime(date_str, date_format)
print(date_obj)
# Excerpt from a per-article scraping loop; indentation was lost in this paste.
# NOTE(review): `driver`, `link` and the loop that `continue` targets are defined
# outside this excerpt.
# Wait up to 5 seconds for the headline link (`h1 > a`) and read its text.
try:
title = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "h1 > a"))).text
# On any failure, report the link (the message says the title was not found)
# and skip this article.
except:
print(link+" ERROR haberin title bulunamadı")
continue
# Wait up to 5 seconds for the article body (`.da_body`) and read its text.
# NOTE(review): the handler belonging to this `try` is not part of the excerpt.
try:
news_text = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".da_body"))).text
# Cut the text off at the "READ MORE ROTOR" marker when the marker is present
# (str.find returns -1 when it is absent).
READ_MORE_ROTOR_index = news_text.find("READ MORE ROTOR")
if READ_MORE_ROTOR_index !=-1:
news_text = news_text[:READ_MORE_ROTOR_index]
# Print the article text.
print(news_text)
# Convert a scraped date string `text` into a datetime using per-site settings.
# self.formatType[<site name>] is indexed with [0] for the strptime format and
# with [1] for the locale name handed to locale.setlocale.
# NOTE(review): the `try:` opened below has no `except`/`finally` in this
# excerpt; the handler is presumably cut off -- confirm against the full file.
# "lockheedmartin" special case: parse with '%b. %d, %Y' when the text contains
# a '.', otherwise with '%B %d, %Y'.
def text_to_date(self,text): try: if(self.web_site_name=="lockheedmartin"): locale.setlocale(locale.LC_ALL,self.formatType[f'{self.web_site_name}'][1]) return [datetime.datetime.strptime(date_str, '%b. %d, %Y') if '.' in date_str else datetime.datetime.strptime(date_str, '%B %d, %Y') for date_str in [text]][0]
# Remaining code path: set the site's locale, then parse with the site's
# configured format string.
locale.setlocale(locale.LC_ALL,self.formatType[f'{self.web_site_name}'][1])
return datetime.datetime.strptime(text,self.formatType[f'{self.web_site_name}'][0])
import threading import time from AinOnline import fetch_ainonline_news from AirportHaber import fetch_airporthaber_news from AIAA import fetch_AIAA_news from AirlineHaber import fetch_Airline_news from AirTurkHaber import fetch_AirTurk_news from BellFlight import fetch_BellFlight_news from DefenceNews import fetch_DefenceNews_news from DefenceWeb import fetch_DefenceWeb_news from Enstrom import fetch_Enstrom_news from GEAerospace import fetch_GEAerospace_news from HelicopterInvestor import fetch_HelicopterInvestor_news from MDHelicopters import fetch_MDHelicopters_news from Robinson import fetch_Robinson_news from TheWarzone import fetch_TheWarzone_news from TurDef import fetch_TurDef_news from VerticalMag import fetch_VerticalMag_news from airbuscorporatehelicopters import fetch_Airbus_news from DefenceTurk import fetch_DefenceTurk_news from DefenceTurkey import fetch_DefenceTurkey_news from SavunmaSanayist import fetch_SavunmaSanayist_news from RotorAndWing import fetch_RotorAndWing_news from lockheedmartin import fetch_lockheedmartin_news from HeliHub import fetch_HeliHub_news from Leonardo import fetch_Leanardo_news from JustHelicopters import fetch_JustHelicopters_news from AirNewsTimes import fetch_AirNewsTimes_news from DefenseHere import fetch_DefenseHere_news
# Run the enabled scrapers one after another, in this order, and report the
# total wall-clock time afterwards.
start_time = time.time()

for fetch_news in (
    fetch_AirTurk_news,
    fetch_VerticalMag_news,
    fetch_JustHelicopters_news,
    fetch_Leanardo_news,
    fetch_lockheedmartin_news,
    fetch_MDHelicopters_news,
    fetch_Robinson_news,
    fetch_RotorAndWing_news,
    fetch_TheWarzone_news,
    fetch_TurDef_news,
    fetch_SavunmaSanayist_news,
    fetch_DefenceTurk_news,
):
    fetch_news()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"işlem {elapsed_time} saniye sürdü")
print("Haberler başarıyla çekildi.")
import threading import time
# Scrapers to run; every entry is a zero-argument callable.
# The first two names are spelled the way the scraper modules define them
# (`fetch_ainonline_news`, `fetch_airporthaber_news`); the previous
# capitalised spellings are not defined anywhere and raise NameError.
news_fetch_functions = [
    fetch_ainonline_news,
    fetch_airporthaber_news,
    fetch_AIAA_news,
    fetch_Airline_news,
    fetch_AirTurk_news,
]

# Number of scrapers (i.e. browser instances) allowed to run at the same time.
batch_size = 5

# One list of not-yet-started threads per batch.
thread_groups = [
    [threading.Thread(target=fetch_news)
     for fetch_news in news_fetch_functions[start:start + batch_size]]
    for start in range(0, len(news_fetch_functions), batch_size)
]

start_time = time.time()

# Start a batch and wait for it to finish before starting the next one.
# Starting every thread up front (as before) made the grouping a no-op and
# launched all browsers simultaneously.
for group in thread_groups:
    for thread in group:
        thread.start()
    for thread in group:
        thread.join()

end_time = time.time()
elapsed_time = end_time - start_time
print(f"işlem {elapsed_time} saniye sürdü")
print("Haberler başarıyla çekildi.")
import time from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from convert_csv import save_to_csv, Content_Text_Control, text_to_date
def fetch_ainonline_news():
    """Scrape the latest AIN Online articles and pass them to ``save_to_csv``.

    Visits listing pages ``0 .. maxPage - 1``, opens every linked article and
    collects one row per article accepted by ``Content_Text_Control``:
    ``[link, category, img_url, news_text, date, title, web_site_name]``,
    where ``date`` is the result of ``text_to_date``.

    An article whose title, body or date cannot be located within 5 seconds
    is reported on stdout and skipped. A missing image is tolerated
    (``img_url`` is ``None``). If the link list of a page cannot be read the
    function returns without saving, as before.

    Returns:
        None. The collected rows are handed to ``save_to_csv``.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this module.
    from selenium.common.exceptions import WebDriverException

    category = "Heli"
    web_site_name = "AinOnline"
    maxPage = 4
    news_array = []

    service = Service("./chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        # Maximising once is enough; the window size survives navigation.
        driver.maximize_window()

        for pagenumber in range(maxPage):
            page_url = f"https://www.ainonline.com/aviation-news/latest?page={pagenumber}"
            driver.get(page_url)
            time.sleep(3)

            try:
                news_list = driver.find_elements(By.CSS_SELECTOR, "a.Row_link__0_lcz")
            except WebDriverException:
                print(f"{page_url} ,ERROR haberin Linki bulunamadı")
                return

            # Collect the hrefs first: navigating away invalidates the
            # elements of the listing page. Anchors without an href are
            # dropped because driver.get(None) would fail.
            hrefs = (item.get_attribute("href") for item in news_list)
            news_link = [href for href in hrefs if href]

            for link in news_link:
                driver.get(link)

                try:
                    title = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "Article_heading___vldJ"))).text
                except WebDriverException:
                    print(link + " ERROR haberin title bulunamadı")
                    continue

                try:
                    # Two classes on one element form a compound selector, so
                    # it is spelled as CSS instead of relying on how
                    # By.CLASS_NAME treats a dotted value.
                    text_elements = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located(
                            (By.CSS_SELECTOR, ".wrap.ComponentWrap_wrap__f7e25")))
                    news_text = "".join(element.text for element in text_elements)
                except WebDriverException:
                    print(link + " ERROR haberin texti bulunamadı")
                    continue

                try:
                    date = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CLASS_NAME, "BylineAndDate_date__dbokc"))).text
                except WebDriverException:
                    print(link + " ERROR haberin date bulunamadı")
                    continue

                # The image is optional.
                try:
                    img_url = driver.find_element(
                        By.CSS_SELECTOR, ".MediaWithCaption_media__FPJ_M >img"
                    ).get_attribute("src")
                except WebDriverException:
                    img_url = None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([
                        link,
                        category,
                        img_url,
                        news_text,
                        text_to_date(date, web_site_name),
                        title,
                        web_site_name,
                    ])
    finally:
        # Always shut the browser down, otherwise every run leaves a Chrome
        # and a chromedriver process behind.
        driver.quit()

    save_to_csv(news_array, web_site_name)
Traceback (most recent call last):
File "C:\Users\s20128\Desktop\Selenium\main2.py", line 85, in
Process finished with exit code 1
Exception in thread Thread-5 (fetch_AirNewsTimes_news): Traceback (most recent call last): File "C:\Python311\Lib\threading.py", line 1038, in _bootstrap_inner self.run() File "C:\Python311\Lib\threading.py", line 975, in run self._target(*self._args, **self._kwargs) File "C:\Users\s20128\Desktop\Selenium\AirNewsTimes.py", line 22, in fetch_AirNewsTimes_news WebDriverWait(driver, 15).until( File "C:\Users\s20128\AppData\Roaming\Python\Python311\site-packages\selenium\webdriver\support\wait.py", line 105, in until raise TimeoutException(message, screen, stacktrace) selenium.common.exceptions.TimeoutException: Message:
import time from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from convert_csv import save_to_csv, Content_Text_Control, text_to_date
def fetch_AirNewsTimes_news():
    """Scrape AirNewsTimes search results for "helikopter" and save them.

    Loads the search page, waits up to 15 seconds for the result links,
    opens the links at positions 5..22 of the result list and collects one
    row per article accepted by ``Content_Text_Control``:
    ``[link, category, img_url, news_text, date, title, web_site_name]``,
    where ``date`` is the result of ``text_to_date``.

    An article whose title, body or date cannot be located within 5 seconds
    is reported on stdout and skipped. A missing image is tolerated
    (``img_url`` is ``None``). A timeout while waiting for the result list
    ends the scan and whatever was collected is still saved. If the result
    list cannot be read at all the function returns without saving, as
    before.

    Returns:
        None. The collected rows are handed to ``save_to_csv``.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this module. WebDriverWait raises selenium's
    # TimeoutException, which is unrelated to the builtin TimeoutError that
    # was caught previously, so the timeout used to escape the function.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    category = "Heli"
    web_site_name = "AirNewsTimes"
    maxPage = 2
    search_url = "https://www.airnewstimes.com/?s=helikopter"
    result_link_selector = "div> div> div> div> div> div> div> div> h3> a"
    image_selector = "div > div > div > div > div > img"
    news_array = []

    service = Service("./chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        driver.maximize_window()

        # NOTE(review): the URL does not contain the page number, so every
        # iteration loads the same page; with maxPage = 2 the loop runs once.
        for page_number in range(1, maxPage):
            driver.get(search_url)

            try:
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_all_elements_located(
                        (By.CSS_SELECTOR, result_link_selector)))
            except TimeoutException:
                print(search_url + " ERROR timed out waiting for the result list")
                break

            try:
                news_list = driver.find_elements(By.CSS_SELECTOR, result_link_selector)
            except WebDriverException:
                print(search_url)
                return

            # Collect the hrefs first: navigating away invalidates the
            # elements of the result page. Anchors without an href are
            # dropped because driver.get(None) would fail.
            # NOTE(review): the 5:23 window presumably skips non-article
            # links around the results -- confirm against the live page.
            hrefs = (item.get_attribute("href") for item in news_list[5:23])
            news_link = [href for href in hrefs if href]

            for link in news_link:
                driver.get(link)

                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_all_elements_located(
                            (By.CSS_SELECTOR, image_selector)))
                except TimeoutException:
                    # Images are optional (img_url falls back to None below),
                    # so a slow or hidden image must not abort the whole run.
                    pass

                try:
                    title = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR,
                             "article > div > div > div > div > div > div > div > div > h1"))).text
                except WebDriverException:
                    print(link + " ERROR haberin title bulunamadı")
                    continue

                try:
                    text_elements = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located(
                            (By.CSS_SELECTOR,
                             "article > div > div > div > div > div > div > div > div > div > div > div > div >p")))
                    news_text = "".join(element.text for element in text_elements)
                except WebDriverException:
                    print(link + " ERROR haberin texti bulunamadı")
                    continue

                try:
                    date = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "div > time"))).text
                except WebDriverException:
                    print(link + " ERROR haberin date bulunamadı")
                    continue

                try:
                    img_url = driver.find_element(
                        By.CSS_SELECTOR, image_selector
                    ).get_attribute("src")
                except WebDriverException:
                    img_url = None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([
                        link,
                        category,
                        img_url,
                        news_text,
                        text_to_date(date, web_site_name),
                        title,
                        web_site_name,
                    ])
    finally:
        # Always shut the browser down, otherwise every run leaves a Chrome
        # and a chromedriver process behind.
        driver.quit()

    # Save on the normal path as well as after a result-list timeout.
    save_to_csv(news_array, web_site_name)
time data 'MAR 26,2024' does not match format '%b %d, %Y' time data '26 Mar, 09:07' does not match format '%d %b,%A %H:%M' time data '19 Mart 2024, 11:07' does not match format '%d %B %Y, %A %H:%M'