parzival-au20 / sefa

0 stars 0 forks source link

selenium edge #48

Open parzival-au20 opened 7 months ago

parzival-au20 commented 7 months ago

import time from selenium import webdriver from selenium.webdriver.edge.service import Service from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from convert_csv import save_to_csv, Content_Text_Control, text_to_date

def fetch_HelicopterInvestor_news():
    """Scrape the HelicopterInvestor news listing with Edge and save the rows to CSV.

    Opens the news listing page, collects the article links matched by
    ``article>.articleExcerpt > h3 > a`` and visits at most the first 20 of
    them.  For every article the title, the concatenated paragraph text and
    the date string are required; if any of them cannot be located the
    article is skipped with a printed message.  The featured image is
    optional and stored as ``None`` when absent.  An article is kept only
    when ``Content_Text_Control(date, news_text, web_site_name)`` returns a
    truthy value.  Collected rows are passed to ``save_to_csv``.

    Returns:
        None.  If looking up the listing links raises a WebDriver error the
        function prints a message and returns without calling
        ``save_to_csv``.
    """
    # Function-scope import so this block does not depend on the file's
    # import line; selenium itself is already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    category = "Heli"
    web_site_name = "HelicopterInvestor"
    listing_url = "https://www.helicopterinvestor.com/news/"
    link_error = f"{listing_url} ,ERROR haberin Linki bulunamadı"

    max_page = 1
    news_array = []

    service = Service("./msedgedriver.exe")
    driver = webdriver.Edge(service=service)
    try:
        # NOTE: the page index is not part of the URL, so every iteration
        # loads the same listing page; with max_page == 1 this is harmless.
        for _ in range(max_page):
            driver.get(listing_url)
            driver.maximize_window()
            time.sleep(3)  # give the listing time to render
            try:
                anchors = driver.find_elements(
                    By.CSS_SELECTOR, "article>.articleExcerpt > h3 > a")
            except WebDriverException:
                print(link_error)
                return

            # find_elements returns an empty list instead of raising, so the
            # "no links" situation has to be reported explicitly.
            if not anchors:
                print(link_error)
                continue

            # Resolve the hrefs up front: the anchor elements go stale as
            # soon as the driver navigates to the first article.
            hrefs = (anchor.get_attribute("href") for anchor in anchors)
            news_link = [href for href in hrefs if href]

            for link in news_link[:20]:
                driver.get(link)
                try:
                    title = WebDriverWait(driver, 1).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "article>h1"))).text
                except WebDriverException:
                    print(link+" ERROR haberin title bulunamadı")
                    continue
                try:
                    paragraphs = WebDriverWait(driver, 1).until(
                        EC.presence_of_all_elements_located(
                            (By.CSS_SELECTOR, "article>p")))
                    news_text = "".join(p.text for p in paragraphs)
                except WebDriverException:
                    print(link+" ERROR haberin texti bulunamadı")
                    continue
                try:
                    date = WebDriverWait(driver, 1).until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, ".metaStrip > time"))).text
                except WebDriverException:
                    print(link+" ERROR haberin date bulunamadı")
                    continue
                try:
                    img_url = driver.find_element(
                        By.CSS_SELECTOR, ".featuredImage > img"
                    ).get_attribute("src")
                except WebDriverException:
                    # The featured image is optional.
                    img_url = None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([
                        link, category, img_url, news_text,
                        text_to_date(date, web_site_name), title,
                        web_site_name,
                    ])
    finally:
        # Always shut the browser down, including on the early return and
        # on unexpected errors; otherwise the Edge process is leaked.
        driver.quit()

    save_to_csv(news_array, web_site_name)
parzival-au20 commented 7 months ago

import time from selenium import webdriver from selenium.webdriver.edge.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from bs4 import BeautifulSoup from convert_csv import save_to_csv, Content_Text_Control, text_to_date

def fetch_HelicopterInvestor_news():
    """Scrape the HelicopterInvestor news listing and save the rows to CSV.

    Edge is used only to load pages; the rendered page source is parsed
    with BeautifulSoup.  Article links are taken from
    ``article > .articleExcerpt > h3 > a`` on the listing page and at most
    the first 20 are visited.  Title, paragraph text (joined with newlines)
    and the date string are required; an article missing any of them is
    skipped with a printed error.  The featured image is optional and
    stored as ``None`` when absent.  An article is kept only when
    ``Content_Text_Control(date, news_text, web_site_name)`` returns a
    truthy value.  Collected rows are passed to ``save_to_csv``.

    Returns:
        None.
    """
    # Function-scope stdlib import so this block stands on its own.
    from urllib.parse import urljoin

    category = "Heli"
    web_site_name = "HelicopterInvestor"
    listing_url = "https://www.helicopterinvestor.com/news/"

    max_page = 1
    news_array = []

    service = Service("./msedgedriver.exe")
    driver = webdriver.Edge(service=service)
    try:
        # NOTE: the page index is not part of the URL, so every iteration
        # loads the same listing page; with max_page == 1 this is harmless.
        for _ in range(max_page):
            driver.get(listing_url)
            driver.maximize_window()
            time.sleep(3)  # wait for the page to load

            # Parse the rendered page source.
            listing = BeautifulSoup(driver.page_source, 'html.parser')
            # Collect the headline links.  The page source holds the raw
            # attribute value, so skip anchors without an href and resolve
            # relative ones against the listing URL.
            anchors = listing.select("article > .articleExcerpt > h3 > a")
            news_link = [
                urljoin(listing_url, anchor['href'])
                for anchor in anchors
                if anchor.get('href')
            ]

            for link in news_link[:20]:
                driver.get(link)
                # Parse the article page source.
                article = BeautifulSoup(driver.page_source, 'html.parser')
                try:
                    # select_one returns None for a missing node, so
                    # ``.text`` raises AttributeError for required fields.
                    title = article.select_one("article > h1").text.strip()
                    news_text = "\n".join(
                        [p.text.strip() for p in article.select("article > p")])
                    date = article.select_one(".metaStrip > time").text.strip()
                except AttributeError as e:
                    print(f"Error: {e} - {link}")
                    continue

                # A missing featured image no longer discards the article.
                image = article.select_one(".featuredImage > img")
                img_url = image.get('src') if image is not None else None

                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([
                        link, category, img_url, news_text,
                        text_to_date(date, web_site_name), title,
                        web_site_name,
                    ])
    finally:
        # Always shut the browser down; otherwise the Edge process leaks.
        driver.quit()

    save_to_csv(news_array, web_site_name)

# Call the function, but only when this file is run as a script, so that
# importing the module does not launch the browser.
if __name__ == "__main__":
    fetch_HelicopterInvestor_news()

parzival-au20 commented 7 months ago

time data '23 Mar 2024, 2024 12:17' does not match format '%d %b %Y, %H:%M' time data '21 Mar 2024, 2024 11:59' does not match format '%d %b %Y, %H:%M' time data '21 Mar 2024, 2024 11:59' does not match format '%d %b %Y, %H:%M' time data '20 Mar 2024, 2024 12:21' does not match format '%d %b %Y, %H:%M' time data '20 Mar 2024, 2024 12:21' does not match format '%d %b %Y, %H:%M'