def scrape_website_content(self, website_url, failed_sites=None):
    """Fetch *website_url*, extract its readable text, and return up to 5000 words.

    Args:
        website_url: URL of the page to scrape.
        failed_sites: Optional list that accumulates URLs which failed.
            Defaults to a fresh list per call (the original ``failed_sites=[]``
            was a shared mutable default that leaked entries across calls).

    Returns:
        On success: ``({"source": url, "content": text}, "N/A", True)``.
        On failure: ``({"source": url, "content": message}, failed_sites, False)``.
        NOTE(review): the second element differs by outcome ("N/A" vs. the
        list) — preserved for backward compatibility with existing callers.
    """
    if failed_sites is None:
        failed_sites = []
    # Browser-like headers to reduce the chance of being blocked as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://www.google.com/',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept-Encoding': 'gzip, deflate, br'
    }

    def is_garbled(text):
        """Heuristic: treat text as garbled when >20% of its characters
        fall outside ``string.printable`` (likely a bad encoding guess)."""
        non_ascii_chars = sum(1 for char in text if char not in string.printable)
        try:
            return non_ascii_chars / len(text) > 0.2
        except ZeroDivisionError:
            # Empty text cannot be garbled.
            return False

    try:
        # Making a GET request to the website; timeout prevents hanging forever.
        response = requests.get(website_url, headers=headers, timeout=15)
        response.raise_for_status()  # Raise for 4xx/5xx HTTP errors
        # Detect encoding with chardet; fall back to utf-8 on low confidence.
        detected_encoding = chardet.detect(response.content)
        response.encoding = detected_encoding['encoding'] if detected_encoding['confidence'] > 0.5 else 'utf-8'
        # Handling possible issues with encoding detection
        try:
            content = response.text
        except UnicodeDecodeError:
            content = response.content.decode('utf-8', errors='replace')
        # Parse the page and pull out visible text only.
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n')
        # Clean up: drop blank lines and per-line leading/trailing whitespace.
        clean_text = '\n'.join([line.strip() for line in text.splitlines() if line.strip()])
        # Cap the payload at the first 5000 whitespace-separated words.
        split_text = clean_text.split()
        first_5k_words = split_text[:5000]
        clean_text_5k = ' '.join(first_5k_words)
        if is_garbled(clean_text):
            print(f"Failed to retrieve content from {website_url} due to garbled text.")
            failed = {"source": website_url, "content": "Failed to retrieve content due to garbled text"}
            failed_sites.append(website_url)
            return failed, failed_sites, False
        return {"source": website_url, "content": clean_text_5k}, "N/A", True
    except requests.RequestException as exc:
        # BUG FIX: the original `try` had no matching `except` clause (a
        # SyntaxError as written). Mirror the failure contract used elsewhere.
        print(f"Failed to retrieve content from {website_url} due to an error: {exc}")
        failed = {"source": website_url, "content": f"Failed to retrieve content due to an error: {exc}"}
        failed_sites.append(website_url)
        return failed, failed_sites, False
# --- My modification: replacement implementation that delegates extraction to the Jina AI Reader API ---
def scrape_site_jina(self, website_url, failed_sites=None):
    """Fetch readable content for *website_url* via the Jina AI Reader proxy.

    The Reader service (https://r.jina.ai/<url>) returns the page already
    converted to clean plain text, so no local HTML parsing is needed.

    Args:
        website_url: URL of the page to scrape.
        failed_sites: Optional list that accumulates URLs which failed.
            Defaults to a fresh list per call (the original ``failed_sites=[]``
            was a shared mutable default that leaked entries across calls).

    Returns:
        On success: ``({"source": url, "content": text}, "N/A", True)``,
        with content capped at 20,000 characters.
        On failure: ``({"source": url, "content": message}, failed_sites, False)``.
    """
    if failed_sites is None:
        failed_sites = []
    prefixurl = "https://r.jina.ai/"
    try:
        # BUG FIX: added a timeout (original call could hang indefinitely)
        # and handling for connection-level errors (original crashed on them
        # instead of returning its documented failure tuple).
        response = requests.get(prefixurl + website_url, timeout=15)
    except requests.RequestException as exc:
        print('Failed to retrieve the webpage. Error:', exc)
        failed = {"source": website_url, "content": "Failed to retrieve content due to an error: "}
        failed_sites.append(website_url)
        return failed, failed_sites, False
    if response.status_code == 200:
        # NOTE(review): removed the debug print of the full response body.
        # Cap at 20k characters to bound downstream token usage.
        return {"source": website_url, "content": response.text[0:20 * 1000]}, "N/A", True
    print('Failed to retrieve the webpage. Status code:', response.status_code)
    failed = {"source": website_url, "content": "Failed to retrieve content due to an error: "}
    failed_sites.append(website_url)
    return failed, failed_sites, False
# Current code: scrape_website_content (requests + chardet + BeautifulSoup pipeline).
# My modification: scrape_site_jina, which delegates fetching and text extraction to the Jina AI Reader.
# Jina AI Reader API documentation and demo: https://jina.ai/reader/#demo