Issue closed by apavlo89, 3 years ago.
from GoogleNews import GoogleNews
from newspaper import Article
from newspaper import Config
import pandas as pd
import nltk
# Scrape Google News results for "amazon stock" (Oct 1-19, 2020), download each
# linked article with newspaper3k, and export Date/Media/Title/Article/Summary
# columns to articles.xlsx.
#
# Config lets us set a browser user agent so sites that reject the default
# python-requests agent (HTTP 403 Forbidden) still serve the article.
nltk.download('punkt')  # sentence tokenizer models required by article.nlp()

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent

googlenews = GoogleNews(start='10/01/2020', end='10/19/2020')
googlenews.search('amazon stock')
print(pd.DataFrame(googlenews.result()).head())

# GoogleNews accumulates results internally across pages, so it is enough to
# request each page and read the combined result once after the loop.
for page in range(2, 20):
    googlenews.getpage(page)
df = pd.DataFrame(googlenews.result())

records = []  # renamed from `list`/`dict` to avoid shadowing builtins
for ind in df.index:
    article = Article(df['link'][ind], config=config)
    # Some sites block or time out on the request; skip those URLs instead of
    # letting one bad link abort the whole run.
    try:
        article.download()
        article.parse()
        article.nlp()
    except Exception as exc:  # newspaper raises ArticleException on fetch/parse failure
        print(f"Skipping {df['link'][ind]}: {exc}")
        continue
    records.append({
        'Date': df['date'][ind],
        'Media': df['media'][ind],
        'Title': article.title,
        'Article': article.text,
        'Summary': article.summary,
    })

news_df = pd.DataFrame(records)
news_df.to_excel("articles.xlsx")
Fixed by changing the user agent to: user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
I can't seem to be able to download articles because of this error. How do I bypass it? It seems to happen every time the script reaches a specific URL. If the script can't access a website, I would like it to skip that URL and keep searching. Thank you for your help with this matter.