Open parzival-au20 opened 7 months ago
import time from selenium import webdriver from selenium.webdriver.edge.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from bs4 import BeautifulSoup from convert_csv import save_to_csv, Content_Text_Control, text_to_date
def fetch_HelicopterInvestor_news():
    """Scrape the HelicopterInvestor news listing and save the articles to CSV.

    Opens the news listing page in Edge (driver binary expected at
    ``./msedgedriver.exe``), collects the article links found on it, visits
    up to the first 20 of them, and extracts title, body text, date string
    and featured-image URL from each article page. Articles accepted by
    ``Content_Text_Control`` are collected as rows of the form
    ``[link, category, img_url, news_text, parsed_date, title, site_name]``
    and handed to ``save_to_csv``.

    Articles whose page is missing any of the expected elements are
    reported on stdout and skipped. The browser is always closed on exit,
    including when an unexpected exception propagates.

    Returns:
        None. Results are passed to ``save_to_csv``.
    """
    service = Service("./msedgedriver.exe")
    driver = webdriver.Edge(service=service)

    category = "Heli"
    web_site_name = "HelicopterInvestor"
    maxPage = 1
    news_array = []

    try:
        # NOTE(review): the page index is not part of the URL, so every
        # iteration loads the same listing page. Harmless while maxPage is 1;
        # confirm the site's pagination scheme before raising maxPage.
        for _ in range(maxPage):
            driver.get("https://www.helicopterinvestor.com/news/")
            driver.maximize_window()
            time.sleep(3)  # Wait for the page to load

            # Parse the rendered listing page.
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Collect the headline links.
            news_list = soup.select("article > .articleExcerpt > h3 > a")
            news_link = [item['href'] for item in news_list]

            for link in news_link[:20]:
                driver.get(link)

                # Parse the rendered article page.
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                try:
                    # select_one returns None for a missing element, so any
                    # absent field raises here and the article is skipped.
                    title = soup.select_one("article > h1").text.strip()
                    news_text = "\n".join([p.text.strip() for p in soup.select("article > p")])
                    date = soup.select_one(".metaStrip > time").text.strip()
                    img_url = soup.select_one(".featuredImage > img")['src']
                except Exception as e:
                    print(f"Error: {e} - {link}")
                    continue

                # Content_Text_Control is defined in convert_csv; presumably it
                # validates the date/text for this site -- verify there.
                if Content_Text_Control(date, news_text, web_site_name):
                    news_array.append([
                        link,
                        category,
                        img_url,
                        news_text,
                        text_to_date(date, web_site_name),
                        title,
                        web_site_name,
                    ])

        save_to_csv(news_array, web_site_name)
    finally:
        # Always shut down the browser and driver process; the original
        # never closed them, leaking an Edge instance on every run.
        driver.quit()
# Run the scraper as soon as this script is executed (top-level call).
fetch_HelicopterInvestor_news()
time data '23 Mar 2024, 2024 12:17' does not match format '%d %b %Y, %H:%M'
time data '21 Mar 2024, 2024 11:59' does not match format '%d %b %Y, %H:%M'
time data '21 Mar 2024, 2024 11:59' does not match format '%d %b %Y, %H:%M'
time data '20 Mar 2024, 2024 12:21' does not match format '%d %b %Y, %H:%M'
time data '20 Mar 2024, 2024 12:21' does not match format '%d %b %Y, %H:%M'
import time from selenium import webdriver from selenium.webdriver.edge.service import Service from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from convert_csv import save_to_csv, Content_Text_Control, text_to_date
def fetch_HelicopterInvestor_news(): service = Service("./msedgedriver.exe") driver = webdriver.Edge(service=service)