codelucas / newspaper

newspaper3k is a news, full-text, and article metadata extraction library for Python 3. Advanced docs:
https://goo.gl/VX41yK
MIT License
13.89k stars 2.1k forks

download() halts/stuck forever with a specific URL #964

Open KeremTurgutlu opened 1 year ago

KeremTurgutlu commented 1 year ago

The following neither times out nor returns anything.

url = "http://http-live.sr.se/srextra01-mp3-192"
article = newspaper.Article(url, request_timeout=5)
article.download()
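
For what it's worth, the URL appears to be an endless live audio stream, which would explain the hang. A quick check with stream=True (which returns as soon as the headers arrive, leaving the body unread) suggests the response itself starts fine; the Content-Type below is what I'd expect, not verified here:

import requests

resp = requests.get("http://http-live.sr.se/srextra01-mp3-192",
                    stream=True, timeout=5)
print(resp.status_code, resp.headers.get("Content-Type"))  # e.g. 200 audio/mpeg
resp.close()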

The same hang occurs when calling the underlying network helper directly:

from newspaper.network import get_html_2XX_only

For reference, the article config:

article.config.__dict__
{'MIN_WORD_COUNT': 300,
 'MIN_SENT_COUNT': 7,
 'MAX_TITLE': 200,
 'MAX_TEXT': 100000,
 'MAX_KEYWORDS': 35,
 'MAX_AUTHORS': 10,
 'MAX_SUMMARY': 5000,
 'MAX_SUMMARY_SENT': 5,
 'MAX_FILE_MEMO': 20000,
 'memoize_articles': True,
 'fetch_images': True,
 'image_dimension_ration': 1.7777777777777777,
 'follow_meta_refresh': False,
 'use_meta_language': True,
 'keep_article_html': False,
 'http_success_only': True,
 '_language': 'en',
 'stopwords_class': newspaper.text.StopWords,
 'browser_user_agent': 'newspaper/0.2.8',
 'headers': {},
 'request_timeout': 5,
 'proxies': {},
 'number_threads': 10,
 'verbose': False,
 'thread_timeout_seconds': 1}

get_html_2XX_only(url, article.config)  # also hangs indefinitely

Related issue: https://github.com/psf/requests/issues/1577 (requests' timeout applies to the connection attempt and to each individual socket read, not to the total response time, so a live stream that keeps sending bytes never trips it).

Maybe stream=True and a stream_timeout could be optional params to download()?
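
The core idea as a standalone sketch with plain requests (the helper name here is just illustrative, not part of newspaper):

import time
import requests

def fetch_with_deadline(url, deadline=30):
    """Bound the *total* download time: timeout=5 still guards connect and
    per-read stalls, while the wall-clock check guards endless but active
    streams."""
    start = time.time()
    chunks = []
    with requests.get(url, stream=True, timeout=5) as resp:
        for chunk in resp.iter_content(1024):
            chunks.append(chunk)
            if time.time() - start > deadline:
                break  # give up on the rest of the body, keep what we have
    return b"".join(chunks)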

Something like this might work; I monkey-patched download():

import logging
import time

import requests

import newspaper
from newspaper import network
from newspaper.article import ArticleDownloadState
from newspaper.configuration import Configuration
from newspaper.network import _get_html_from_response, get_request_kwargs
from newspaper.utils import extract_meta_refresh

log = logging.getLogger(__name__)

def get_html_2XX_only(url, config=None, response=None, stream=False, stream_timeout=30):
    """Consolidated logic for http requests from newspaper. We handle error cases:
    - Attempt to find encoding of the html by using HTTP header. Fallback to
      'ISO-8859-1' if not provided.
    - Error out if a non 2XX HTTP response code is returned.
    """
    config = config or Configuration()
    useragent = config.browser_user_agent
    timeout = config.request_timeout
    proxies = config.proxies
    headers = config.headers

    if response is not None:
        # Caller supplied an already-fetched response; just decode it.
        return _get_html_from_response(response)

    if stream:
        # Read the body incrementally and enforce a wall-clock deadline,
        # so an endless live stream cannot stall the call forever.
        response = requests.get(
            url=url, stream=True,
            **get_request_kwargs(timeout, useragent, proxies, headers))
        body = []
        start = time.time()
        for chunk in response.iter_content(1024):
            body.append(chunk)
            if time.time() - start > stream_timeout:
                logging.error(f"Stream timed out for url: {url}")
                break
        # Stash the received bytes so _get_html_from_response() and
        # raise_for_status() below see a normal, fully-read response.
        response._content = b''.join(body)
    else:
        response = requests.get(
            url=url, **get_request_kwargs(timeout, useragent, proxies, headers))

    html = _get_html_from_response(response)

    if config.http_success_only:
        # fail if HTTP sends a non 2XX response
        response.raise_for_status()

    return html

def download(self, input_html=None, title=None, recursion_counter=0, stream=False, stream_timeout=30):
    """Downloads the link's HTML content; don't use if you are batch
    async-downloading articles.

    recursion_counter (currently 1) stops potentially infinite meta-refresh
    loops. stream/stream_timeout bound the total download time for endless
    responses.
    """
    if input_html is None:
        try:
            html = get_html_2XX_only(self.url, self.config, stream=stream, stream_timeout=stream_timeout)
        except requests.exceptions.RequestException as e:
            self.download_state = ArticleDownloadState.FAILED_RESPONSE
            self.download_exception_msg = str(e)
            log.debug('Download failed on URL %s because of %s' %
                      (self.url, self.download_exception_msg))
            return
    else:
        html = input_html

    if self.config.follow_meta_refresh:
        meta_refresh_url = extract_meta_refresh(html)
        if meta_refresh_url and recursion_counter < 1:
            return self.download(
                input_html=network.get_html(meta_refresh_url),
                recursion_counter=recursion_counter + 1)

    self.set_html(html)
    self.set_title(title)

newspaper.Article.download = download
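
With the patch applied, the stalled URL should return within the stream deadline (with whatever bytes arrived) instead of hanging:

article = newspaper.Article("http://http-live.sr.se/srextra01-mp3-192",
                            request_timeout=5)
article.download(stream=True, stream_timeout=30)
print(article.download_state)  # should be ArticleDownloadState.SUCCESS (2)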