Maybe stream=True and stream_timeout could be optional params to download()?

Something like this might work; I monkey-patched download():
import logging
import time

import requests

import newspaper
from newspaper import network
from newspaper.article import ArticleDownloadState
from newspaper.configuration import Configuration
from newspaper.network import _get_html_from_response, get_request_kwargs
from newspaper.utils import extract_meta_refresh

log = logging.getLogger(__name__)


def get_html_2XX_only(url, config=None, response=None, stream=False, stream_timeout=30):
    """Consolidated logic for http requests from newspaper. We handle error cases:
    - Attempt to find the encoding of the html by using the HTTP header. Fall back
      to 'ISO-8859-1' if not provided.
    - Error out if a non-2XX HTTP response code is returned.
    """
    config = config or Configuration()
    useragent = config.browser_user_agent
    timeout = config.request_timeout
    proxies = config.proxies
    headers = config.headers

    if response is not None:
        return _get_html_from_response(response)

    if stream:
        response = requests.get(
            url=url, **get_request_kwargs(timeout, useragent, proxies, headers),
            stream=True)
        # Read the body in chunks and enforce a wall-clock cutoff, since
        # requests' own timeout does not bound total download time.
        body = []
        start = time.time()
        for chunk in response.iter_content(1024):
            body.append(chunk)
            if time.time() - start > stream_timeout:
                log.error("Stream timed out for url: %s", url)
                break
        response._content = b''.join(body)
    else:
        response = requests.get(
            url=url, **get_request_kwargs(timeout, useragent, proxies, headers))

    html = _get_html_from_response(response)

    if config.http_success_only:
        # fail if HTTP sends a non-2XX response
        response.raise_for_status()
    return html

def download(self, input_html=None, title=None, recursion_counter=0, stream=False, stream_timeout=30):
    """Downloads the link's HTML content; don't use if you are batch/async
    downloading articles.

    recursion_counter (currently 1) stops refreshes that are potentially
    infinite
    """
    if input_html is None:
        try:
            html = get_html_2XX_only(self.url, self.config, stream=stream,
                                     stream_timeout=stream_timeout)
        except requests.exceptions.RequestException as e:
            self.download_state = ArticleDownloadState.FAILED_RESPONSE
            self.download_exception_msg = str(e)
            log.debug('Download failed on URL %s because of %s' %
                      (self.url, self.download_exception_msg))
            return
    else:
        html = input_html

    if self.config.follow_meta_refresh:
        meta_refresh_url = extract_meta_refresh(html)
        if meta_refresh_url and recursion_counter < 1:
            return self.download(
                input_html=network.get_html(meta_refresh_url),
                recursion_counter=recursion_counter + 1)

    self.set_html(html)
    self.set_title(title)


newspaper.Article.download = download
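With the patch applied, usage would look something like this (the URL is just a placeholder, and 10 seconds is an arbitrary cutoff):

article = newspaper.Article('https://example.com/some-article')
article.download(stream=True, stream_timeout=10)
if article.download_state == ArticleDownloadState.SUCCESS:
    article.parse()
    print(article.title)

Note that if the stream cutoff fires, download_state will still be SUCCESS because set_html() is called with whatever partial body was received; you'd need extra bookkeeping to distinguish a truncated download from a complete one.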
The following doesn't time out, nor does it return anything. Same with:
Related issue: https://github.com/psf/requests/issues/1577
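For context on why a plain timeout= doesn't help here: requests' timeout bounds the connection attempt and each individual socket read, not the total transfer time, so a server that keeps trickling bytes never trips it. A minimal illustration (the URL is hypothetical):

import requests

# timeout=5 means: fail if connecting takes more than 5s, or if any single
# wait between bytes exceeds 5s. A server sending one byte every 4 seconds
# can keep this call alive indefinitely.
resp = requests.get('https://example.com/slow-drip', timeout=5)

That's the gap the stream=True loop above works around, by imposing a wall-clock deadline on the whole body read.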