parzival-au20 / Node-JS

beautifulsoap #2

Open parzival-au20 opened 4 months ago

parzival-au20 commented 4 months ago

Traceback (most recent call last):
  File "C:\Users\s20128\Desktop\BeautifulSoap\main.py", line 5, in <module>
    from AinOnline import fetch_ainonline_news
  File "C:\Users\s20128\Desktop\BeautifulSoap\AinOnline.py", line 7, in <module>
    from requests_ntlm import HttpNtlmAuth
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\requests_ntlm\__init__.py", line 1, in <module>
    from .requests_ntlm import HttpNtlmAuth
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\requests_ntlm\requests_ntlm.py", line 5, in <module>
    from cryptography import x509
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\cryptography\x509\__init__.py", line 7, in <module>
    from cryptography.x509 import certificate_transparency, verification
  File "C:\Users\s20128\AppData\Local\Programs\Python\Python312\Lib\site-packages\cryptography\x509\certificate_transparency.py", line 11, in <module>
    from cryptography.hazmat.bindings._rust import x509 as rust_x509
ImportError: cannot import name 'x509' from 'cryptography.hazmat.bindings._rust' (unknown location)

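This ImportError is the usual symptom of a broken or mismatched cryptography install: the package's compiled Rust extension (cryptography.hazmat.bindings._rust) is missing or belongs to a different version than the installed Python files. A small diagnostic sketch, assuming it is run with the same Python 3.12 interpreter that produced the traceback:

# Diagnostic sketch (assumption: run with the same interpreter that raised the error above).
import cryptography
print("cryptography version:", cryptography.__version__)

# The exact import that fails in the traceback; if it also fails here,
# reinstalling the package usually restores the compiled extension:
#     python -m pip install --force-reinstall cryptography
from cryptography.hazmat.bindings._rust import x509 as rust_x509
print("Rust x509 bindings loaded:", rust_x509)
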
parzival-au20 commented 4 months ago

import logging
import random
import re
import time

import requests
from bs4 import BeautifulSoup

from convert_csv import save_to_csv, Content_Text_Control, text_to_date


def fetch_DefenceWeb_news():
    category = "Heli"
    web_site_name = "DefenceWeb"
    news_array = []

    # Desktop User-Agent strings; one is picked at random for every request.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    ]

    maxPage = 1
    for page_number in range(maxPage):
        url = "https://www.defenceweb.co.za/category/aerospace/aerospace/"
        try:
            # Fetch the category listing page, retrying up to 20 times with a random User-Agent.
            for i in range(20):
                headers = {"User-Agent": random.choice(user_agents)}
                response = requests.get(url, headers=headers, verify="./certificate/TUSAS_DefenceWeb.crt")
                if response.status_code != 200:
                    logging.error(url + " " + response.reason + " " + str(response.status_code))
                    continue
                else:
                    break
            if response.status_code != 200:
                logging.error(response.reason + " " + str(response.status_code) + " DefenceWeb scraping FAILED")
                return
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception as e:
            logging.error(f"Error fetching page {url}: {e}")
            continue

        # Collect the article links from the listing page.
        news_list = soup.find_all(class_="td-image-wrap")
        news_links = [link['href'] for link in news_list]

        for idx, link in enumerate(news_links):
            try:
                # Fetch each article, retrying up to 10 times with a random User-Agent.
                for i in range(10):
                    headers = {"User-Agent": random.choice(user_agents)}
                    response = requests.get(link, headers=headers, verify="./certificate/TUSAS_DefenceWeb.crt")
                    if response.status_code != 200:
                        logging.error(response.reason + " " + str(response.status_code))
                        continue
                    else:
                        break
                if response.status_code != 200:
                    logging.error(response.reason + " " + str(response.status_code) + " " + link)
                    continue
                news_soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                logging.error(f"Error fetching news page {link}: {e}")
                continue

            # Extract title, body text, publication date and (optional) image URL.
            title = news_soup.select_one("h1.entry-title").text.strip()
            news_text = news_soup.select_one(".td-post-content").text.strip()
            date_text = news_soup.select_one(".td-post-date > time").text.strip()
            # Strip the ordinal suffix from the day number (e.g. "14th" -> "14")
            # without mangling month names such as "August".
            date = re.sub(r"(\d+)(st|nd|rd|th)\b", r"\1", date_text)
            img_url = news_soup.select_one(".td-modal-image")['src'] if news_soup.select_one(".td-modal-image") else None

            if Content_Text_Control(date, news_text, web_site_name):
                news_array.append([link, category, img_url, news_text, text_to_date(date, web_site_name), title, web_site_name])
            else:
                continue

            # Add a wait after every 5 requests.
            if (idx + 1) % 5 == 0:
                time.sleep(5)

    save_to_csv(news_array, web_site_name)
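
For running the module on its own, a minimal entry point could look like the sketch below; the logging configuration and the direct call are assumptions, not part of the original script.

# Hypothetical stand-alone entry point (assumed, not from the original module).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s %(levelname)s %(message)s")
    fetch_DefenceWeb_news()
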
parzival-au20 commented 4 months ago

5-Poster-Sablonu-Proje-pazarı.pptx