parzival-au20 / sefa

0 stars 0 forks source link

Selenium-2 #46

Open parzival-au20 opened 5 months ago

parzival-au20 commented 5 months ago

time data 'MAR 26,2024' does not match format '%b %d, %Y'
time data '26 Mar, 09:07' does not match format '%d %b,%A %H:%M'
time data '19 Mart 2024, 11:07' does not match format '%d %B %Y, %A %H:%M'

parzival-au20 commented 5 months ago

Selenium.zip

parzival-au20 commented 5 months ago

https://developer.microsoft.com/tr-tr/microsoft-edge/tools/webdriver/?form=MA13LH#downloads

https://chromedriver.chromium.org/downloads/version-selection

parzival-au20 commented 5 months ago

Selenium (2.1).zip

parzival-au20 commented 5 months ago

import re

# Remove English ordinal suffixes ("1st", "2nd", "3rd", "4th") only when they
# directly follow a digit. Chained str.replace calls strip those letter pairs
# everywhere and corrupt month/day names, e.g. "August" -> "Augu" and
# "Monday" -> "Moay", which then fail to parse with strptime.
text = re.sub(r"(?<=\d)(?:st|nd|rd|th)\b", "", text)

parzival-au20 commented 5 months ago

Selenium.zip

parzival-au20 commented 5 months ago

Medya_Takip_Yeni_Siteler_V2.xlsx Selenium.zip

parzival-au20 commented 5 months ago

from datetime import datetime

# The scraped date arrives with its parts separated by newlines; the format
# string mirrors that layout so strptime can parse it directly.
date_str = "Apr\n16\n2024"
date_obj = datetime.strptime(date_str, "%b\n%d\n%Y")
print(date_obj)

parzival-au20 commented 5 months ago
        try:
            title = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h1 > a"))).text    
        except:
            print(link+" ERROR haberin title bulunamadı")
            continue
        try:    
            news_text = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".da_body"))).text
            READ_MORE_ROTOR_index  = news_text.find("READ MORE ROTOR")
            if READ_MORE_ROTOR_index !=-1:
                news_text = news_text[:READ_MORE_ROTOR_index]  
            print(news_text)
parzival-au20 commented 4 months ago

def text_to_date(self,text): try: if(self.web_site_name=="lockheedmartin"): locale.setlocale(locale.LC_ALL,self.formatType[f'{self.web_site_name}'][1]) return [datetime.datetime.strptime(date_str, '%b. %d, %Y') if '.' in date_str else datetime.datetime.strptime(date_str, '%B %d, %Y') for date_str in [text]][0]

        locale.setlocale(locale.LC_ALL,self.formatType[f'{self.web_site_name}'][1])
        return datetime.datetime.strptime(text,self.formatType[f'{self.web_site_name}'][0])
parzival-au20 commented 4 months ago

import threading import time from AinOnline import fetch_ainonline_news from AirportHaber import fetch_airporthaber_news from AIAA import fetch_AIAA_news from AirlineHaber import fetch_Airline_news from AirTurkHaber import fetch_AirTurk_news from BellFlight import fetch_BellFlight_news from DefenceNews import fetch_DefenceNews_news from DefenceWeb import fetch_DefenceWeb_news from Enstrom import fetch_Enstrom_news from GEAerospace import fetch_GEAerospace_news from HelicopterInvestor import fetch_HelicopterInvestor_news from MDHelicopters import fetch_MDHelicopters_news from Robinson import fetch_Robinson_news from TheWarzone import fetch_TheWarzone_news from TurDef import fetch_TurDef_news from VerticalMag import fetch_VerticalMag_news from airbuscorporatehelicopters import fetch_Airbus_news from DefenceTurk import fetch_DefenceTurk_news from DefenceTurkey import fetch_DefenceTurkey_news from SavunmaSanayist import fetch_SavunmaSanayist_news from RotorAndWing import fetch_RotorAndWing_news from lockheedmartin import fetch_lockheedmartin_news from HeliHub import fetch_HeliHub_news from Leonardo import fetch_Leanardo_news from JustHelicopters import fetch_JustHelicopters_news from AirNewsTimes import fetch_AirNewsTimes_news from DefenseHere import fetch_DefenseHere_news

# Run every scraper one after another and report the total wall-clock time.
start_time = time.time()

# Scrapers in the order the script calls them; each one is invoked with no
# arguments.
sequential_fetchers = (
    fetch_Airbus_news,
    fetch_BellFlight_news,
    fetch_DefenceNews_news,
    fetch_Enstrom_news,
    fetch_GEAerospace_news,
    fetch_HelicopterInvestor_news,
    fetch_AirTurk_news,
    fetch_VerticalMag_news,
    fetch_JustHelicopters_news,
    fetch_Leanardo_news,
    fetch_HeliHub_news,
    fetch_lockheedmartin_news,
    fetch_MDHelicopters_news,
    fetch_Robinson_news,
    fetch_RotorAndWing_news,
    fetch_TheWarzone_news,
    fetch_TurDef_news,
    fetch_SavunmaSanayist_news,
    fetch_DefenceTurk_news,
    fetch_ainonline_news,
    fetch_Airline_news,
    fetch_airporthaber_news,
    fetch_DefenceTurkey_news,
    fetch_AirNewsTimes_news,
    fetch_DefenceWeb_news,
    fetch_AIAA_news,
    fetch_DefenseHere_news,
)

for fetch_news in sequential_fetchers:
    fetch_news()

end_time = time.time()

elapsed_time = end_time - start_time

print(f"işlem {elapsed_time} saniye sürdü")

# Run the news-fetching functions in separate threads.
# Each entry pairs the thread name with the scraper it runs; threads are
# created, started and joined in this order.
threaded_fetchers = [
    ("AinOnline", fetch_ainonline_news),
    ("AirportHaber", fetch_airporthaber_news),
    ("AIAA", fetch_AIAA_news),
    ("AirlineHaber", fetch_Airline_news),
    ("AirTurkHaber", fetch_AirTurk_news),
    ("BellFlight", fetch_BellFlight_news),
    ("DefenceNews", fetch_DefenceNews_news),
    ("DefenceWeb", fetch_DefenceWeb_news),
    ("Enstrom", fetch_Enstrom_news),
    ("GEAerospace", fetch_GEAerospace_news),
    ("HelicopterInvestor", fetch_HelicopterInvestor_news),
    ("MDHelicopters", fetch_MDHelicopters_news),
    ("Robinson", fetch_Robinson_news),
    ("TheWarzone", fetch_TheWarzone_news),
    ("TurDef", fetch_TurDef_news),
]

news_threads = [
    threading.Thread(target=fetch_news, name=thread_name)
    for thread_name, fetch_news in threaded_fetchers
]

# Start the threads.
for news_thread in news_threads:
    news_thread.start()

# Make the main program wait until every thread has finished.
for news_thread in news_threads:
    news_thread.join()

print("Haberler başarıyla çekildi.")

parzival-au20 commented 4 months ago

import threading import time

# News-fetching functions. The names follow the spelling used where the
# scrapers are defined/imported (fetch_ainonline_news, fetch_airporthaber_news).
news_fetch_functions = [
    fetch_ainonline_news,
    fetch_airporthaber_news,
    fetch_AIAA_news,
    fetch_Airline_news,
    fetch_AirTurk_news,
    # Other functions should be added here.
]

# Number of scrapers allowed to run at the same time.
GROUP_SIZE = 5

# Split the work into groups of GROUP_SIZE threads.
thread_groups = [
    [
        threading.Thread(target=fetch_news)
        for fetch_news in news_fetch_functions[i:i + GROUP_SIZE]
    ]
    for i in range(0, len(news_fetch_functions), GROUP_SIZE)
]

# Take the start time.
start_time = time.time()

# Run one group at a time: start its threads, then wait for all of them
# before moving on. Starting every group up front (as before) made the
# grouping meaningless because all threads ran concurrently anyway.
for group in thread_groups:
    for thread in group:
        thread.start()
    for thread in group:
        thread.join()

# Take the end time.
end_time = time.time()

# Compute the elapsed time.
elapsed_time = end_time - start_time

print(f"işlem {elapsed_time} saniye sürdü")
print("Haberler başarıyla çekildi.")

parzival-au20 commented 4 months ago

import time from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from convert_csv import save_to_csv, Content_Text_Control, text_to_date

def fetch_ainonline_news():
    """Scrape the latest AinOnline articles and hand them to ``save_to_csv``.

    Walks the first ``maxPage`` listing pages, opens every article link found
    there and collects link, category, image URL, text, parsed date, title and
    site name for each article accepted by ``Content_Text_Control``.

    Articles whose title, text or date cannot be located are reported on
    stdout and skipped; a missing image is stored as ``None``. If the link
    list of a page cannot be read, the function returns without saving.

    The browser is always closed on exit, including on errors.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import lines.
    from selenium.common.exceptions import WebDriverException

    category = "Heli"
    web_site_name = "AinOnline"

    maxPage = 4
    news_array = []

    service = Service("./chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        # Window size does not change between pages, so maximise once.
        driver.maximize_window()

        for pagenumber in range(maxPage):
            page_url = f"https://www.ainonline.com/aviation-news/latest?page={pagenumber}"
            driver.get(page_url)
            time.sleep(3)
            try:
                news_list = driver.find_elements(By.CSS_SELECTOR, "a.Row_link__0_lcz")
            except WebDriverException:
                print(f"{page_url} ,ERROR haberin Linki bulunamadı")
                return

            # Collect the hrefs first: navigating away invalidates the
            # listing-page elements.
            news_link = [item.get_attribute("href") for item in news_list]

            for link in news_link:
                driver.get(link)
                try:
                    title = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "Article_heading___vldJ"))).text
                except WebDriverException:
                    print(link+" ERROR haberin title bulunamadı")
                    continue
                try:
                    text_elements = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.CLASS_NAME, "wrap.ComponentWrap_wrap__f7e25")))
                    news_text = "".join(p_text.text for p_text in text_elements)
                except WebDriverException:
                    print(link+" ERROR haberin texti bulunamadı")
                    continue
                try:
                    date = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "BylineAndDate_date__dbokc"))).text
                except WebDriverException:
                    print(link+" ERROR haberin date bulunamadı")
                    continue
                try:
                    img_url = driver.find_element(
                        By.CSS_SELECTOR, ".MediaWithCaption_media__FPJ_M >img").get_attribute("src")
                except WebDriverException:
                    # The image is optional; keep the article without it.
                    img_url = None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([link, category, img_url, news_text,
                                       text_to_date(date, web_site_name), title, web_site_name])

        save_to_csv(news_array, web_site_name)
    finally:
        # Always release the browser process, even on early return or error.
        driver.quit()
parzival-au20 commented 4 months ago

Traceback (most recent call last):
  File "C:\Users\s20128\Desktop\Selenium\main2.py", line 85, in <module>
    thread.join()
  File "C:\Python311\Lib\threading.py", line 1107, in join
    raise RuntimeError("cannot join thread before it is started")
RuntimeError: cannot join thread before it is started

Process finished with exit code 1

parzival-au20 commented 4 months ago

Exception in thread Thread-5 (fetch_AirNewsTimes_news):
Traceback (most recent call last):
  File "C:\Python311\Lib\threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "C:\Python311\Lib\threading.py", line 975, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\s20128\Desktop\Selenium\AirNewsTimes.py", line 22, in fetch_AirNewsTimes_news
    WebDriverWait(driver, 15).until(
  File "C:\Users\s20128\AppData\Roaming\Python\Python311\site-packages\selenium\webdriver\support\wait.py", line 105, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:

parzival-au20 commented 4 months ago

import time from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from convert_csv import save_to_csv, Content_Text_Control, text_to_date

def fetch_AirNewsTimes_news():
    """Scrape AirNewsTimes search results for "helikopter" and save them.

    Loads the search page, takes the result links in positions 5..22, opens
    each article and collects link, category, image URL, text, parsed date,
    title and site name for every article accepted by
    ``Content_Text_Control``. The collected rows are passed to
    ``save_to_csv``.

    Selenium waits raise ``selenium.common.exceptions.TimeoutException``, not
    the builtin ``TimeoutError``, so that is what is handled here: a timeout
    on the result list skips the page, a timeout on an article skips only
    that article. Articles whose title, text or date cannot be located are
    reported on stdout and skipped; a missing image is stored as ``None``.

    The browser is always closed on exit, including on errors.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import lines.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    category = "Heli"
    web_site_name = "AirNewsTimes"

    search_url = "https://www.airnewstimes.com/?s=helikopter"
    result_link_selector = "div> div> div> div> div> div> div> div> h3> a"
    article_image_selector = "div > div > div > div > div > img"

    maxPage = 2
    news_array = []

    service = Service("./chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        driver.maximize_window()

        # NOTE(review): the URL does not contain the page number, so every
        # iteration loads the same search page (currently exactly one).
        for page_number in range(1, maxPage):
            driver.get(search_url)
            try:
                WebDriverWait(driver, 15).until(
                    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, result_link_selector)))
            except TimeoutException:
                print(search_url + " ERROR haber listesi bulunamadı")
                continue
            try:
                news_list = driver.find_elements(By.CSS_SELECTOR, result_link_selector)
            except WebDriverException:
                print(search_url)
                return

            # Collect the hrefs first: navigating away invalidates the
            # result-page elements. Only positions 5..22 are used.
            news_link = [item.get_attribute("href") for item in news_list[5:23]]

            for link in news_link:
                driver.get(link)
                try:
                    WebDriverWait(driver, 15).until(
                        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, article_image_selector)))
                except TimeoutException:
                    # One slow article must not abort the remaining ones.
                    print(link + " ERROR haber sayfası yüklenemedi")
                    continue
                try:
                    title = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "article > div > div > div > div > div > div > div > div > h1"))).text
                except WebDriverException:
                    print(link + " ERROR haberin title bulunamadı")
                    continue
                try:
                    text_elements = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article > div > div > div > div > div > div > div > div > div > div > div > div >p")))
                    news_text = "".join(p_text.text for p_text in text_elements)
                except WebDriverException:
                    print(link + " ERROR haberin texti bulunamadı")
                    continue
                try:
                    date = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div > time"))).text
                except WebDriverException:
                    print(link + " ERROR haberin date bulunamadı")
                    continue
                try:
                    img_url = driver.find_element(
                        By.CSS_SELECTOR, article_image_selector).get_attribute("src")
                except WebDriverException:
                    # The image is optional; keep the article without it.
                    img_url = None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([link, category, img_url, news_text,
                                       text_to_date(date, web_site_name), title, web_site_name])

        save_to_csv(news_array, web_site_name)
    finally:
        # Always release the browser process, even on early return or error.
        driver.quit()