freenetwork / investing.com.economic-calendar

A small library for parsing the economic calendar from investing.com

broken? news gives me x times the same entry #6

Open trueToastedCode opened 1 year ago

trueToastedCode commented 1 year ago
  i = Investing('http://investing.com/economic-calendar/')
  news = i.news()
{'timestamp': <bound method Arrow.timestamp of <Arrow [2022-12-01T23:30:00+00:00]>>, 'country': 'New Zealand', 'impact': 3, 'url': 'http://investing.com/economic-calendar//economic-calendar/rbnz-gov-orr-speaks-1776', 'name': 'RBNZ Gov Orr Speaks', 'bold': '', 'fore': '', 'prev': '', 'signal': <Unknow(value='?')>, 'type': 'speech'}
{'timestamp': <bound method Arrow.timestamp of <Arrow [2022-12-01T23:30:00+00:00]>>, 'country': 'New Zealand', 'impact': 3, 'url': 'http://investing.com/economic-calendar//economic-calendar/rbnz-gov-orr-speaks-1776', 'name': 'RBNZ Gov Orr Speaks', 'bold': '', 'fore': '', 'prev': '', 'signal': <Unknow(value='?')>, 'type': 'speech'}
{'timestamp': <bound method Arrow.timestamp of <Arrow [2022-12-01T23:30:00+00:00]>>, 'country': 'New Zealand', 'impact': 3, 'url': 'http://investing.com/economic-calendar//economic-calendar/rbnz-gov-orr-speaks-1776', 'name': 'RBNZ Gov Orr Speaks', 'bold': '', 'fore': '', 'prev': '', 'signal': <Unknow(value='?')>, 'type': 'speech'}

and so on...
trueToastedCode commented 1 year ago

I've written a new, working parser. I left out that Good/Bad thing, but it parses everything into Python objects and even converts the times to UTC based on the offset the site provides.

import bs4
import re
import urllib
import urllib.request
import string
from urllib.error import HTTPError
from datetime import datetime, timedelta

def parse_investing_number(number_s):
    # turn an investing.com number string (e.g. "1,234.5", "261K", "-0.5%") into a float;
    # a single trailing unit character is dropped and only the last separator is kept as the decimal point
    if not number_s:
        return None
    if number_s[-1] not in string.digits:
        number_s = number_s[:-1]
    number_s = number_s.replace(',', '.')
    i = number_s.rfind('.')
    if i == -1:
        return float(number_s)
    # everything before the last separator is thousands grouping
    number_s = number_s[:i].replace('.', '') + number_s[i:]
    return float(number_s)

def parse_time_offset(offset):
    # parse something like "GMT +5:30" into (hours, minutes),
    # with the sign applied to both components
    r = re.search(r'(\+|-)\d?\d:\d\d', offset)
    if not r:
        raise Exception(f'Could not parse time offset: {offset}')
    r = r.group()
    i = r.find(':')
    hours, minutes = int(r[:i]), int(r[i + 1:])
    if r[0] == '-':
        minutes = -minutes
    return hours, minutes

def get_time_offset(soup):
    offset_tag = soup.find('span', {'id': 'timeZoneGmtOffsetFormatted'})
    if not offset_tag:
        raise Exception('Could not find time tag')
    offset = offset_tag.text.strip()
    return parse_time_offset(offset)

def get_news(soup, time_offset):
    news_tags = soup.find_all('tr', {'id': re.compile(r'eventRowId_\d+')})
    if not news_tags:
        raise Exception('Cannot find news tags')
    return [parse_news_tag(news_tag, time_offset) for news_tag in news_tags]

def parse_news_tag(news_tag, time_offset):
    _datetime = news_tag['data-event-datetime']
    if not _datetime:
        raise Exception('Cannot find datetime')
    _datetime = datetime.strptime(_datetime, '%Y/%m/%d %H:%M:%S')
    # the page shows times in its configured timezone; subtract the GMT offset to get UTC
    _datetime += timedelta(hours=time_offset[0] * -1, minutes=time_offset[1] * -1)

    country_tag = news_tag.find('td', {'class': 'flagCur'})
    if not country_tag:
        raise Exception('Cannot find country tag')
    country = country_tag.text.strip()

    actual_tag = news_tag.find('td', {'id': re.compile(r'eventActual_\d+')})
    if not actual_tag:
        raise Exception('Cannot find actual tag')
    actual_s = actual_tag.text.strip() or None
    actual = parse_investing_number(actual_s)

    forecast_tag = news_tag.find('td', {'id': re.compile(r'eventForecast_\d+')})
    if not forecast_tag:
        raise Exception('Cannot find forecast tag')
    forecast_s = forecast_tag.text.strip() or None
    forecast = parse_investing_number(forecast_s)

    previous_tag = news_tag.find('td', {'id': re.compile(r'eventPrevious_\d+')})
    if not previous_tag:
        raise Exception('Cannot find previous tag')
    previous_s = previous_tag.text.strip() or None
    previous = parse_investing_number(previous_s)

    title_tag = news_tag.find('td', {'title': re.compile(r'(.|\s)*\S(.|\s)*'), 'class': 'event'})
    if not title_tag:
        raise Exception('Cannot find title tag')
    title = title_tag.text.strip()

    relevance_tags_full = news_tag.find_all('i', {'class': 'grayFullBullishIcon'})
    relevance_tags_empty = news_tag.find_all('i', {'class': 'grayEmptyBullishIcon'})
    if len(relevance_tags_full) + len(relevance_tags_empty) != 3:
        raise Exception('Cannot find relevance tags')
    relevance = len(relevance_tags_full)

    return News(_datetime, country, actual, actual_s, forecast, forecast_s, previous, previous_s, title, relevance)

class News:
    def __str__(self):
        return f'[ {self._datetime} ] [ {self.country} ] [ {self.relevance}/3 ] {self.title}: ' \
               f'Actual: {self.actual_s}, Forecast: {self.forecast_s}, Previous {self.previous_s}'

    def __init__(self, _datetime, country, actual, actual_s, forecast, forecast_s, previous, previous_s, title,
                 relevance):
        self._datetime = _datetime
        self.country = country
        self.actual = actual
        self.actual_s = actual_s
        self.forecast = forecast
        self.forecast_s = forecast_s
        self.previous = previous
        self.previous_s = previous_s
        self.title = title
        self.relevance = relevance

class Investing:

    def __init__(self, uri='https://www.investing.com/economic-calendar/'):
        self.uri = uri
        self.req = urllib.request.Request(uri)
        self.req.add_header('User-Agent',
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36')
        self.results = []

    def news(self):
        try:
            response = urllib.request.urlopen(self.req)
            html = response.read()
            soup = bs4.BeautifulSoup(html, 'html.parser')
            time_offset = get_time_offset(soup)
            self.results = get_news(soup, time_offset)
            return self.results
        except HTTPError as error:
            print("Oops... Get error HTTP {}".format(error.code))

    def get_by_title(self, title):
        title = title.lower()
        for news in self.results:
            if news.title.lower() == title:
                return news
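
For reference, a minimal way to use the rewritten parser above (the event title is just an example; get_by_title() only searches the results of the most recent news() call):

i = Investing()
for n in i.news() or []:
    print(n)

# look up one event by its exact title (case-insensitive)
nfp = i.get_by_title('Nonfarm Payrolls')
if nfp:
    print(nfp._datetime, nfp.actual, nfp.forecast, nfp.previous)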
princefishthrower commented 1 year ago

Ran into the same issue. The original bug was that the 'news' entry was being overwritten, so you would only get the first one in the table, repeated n times.
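
In other words, the loop was probably re-parsing (or re-using) the same entry on every iteration. A hypothetical reconstruction of the two patterns, with made-up rows and a made-up parse_row helper, just to illustrate the difference:

rows = ['row 1', 'row 2', 'row 3']

def parse_row(row):
    # stand-in for the real per-row parsing
    return {'name': row}

# buggy pattern: every iteration parses the same entry,
# so the output repeats the first row n times
news = [parse_row(rows[0]) for _ in rows]
print(news)  # [{'name': 'row 1'}, {'name': 'row 1'}, {'name': 'row 1'}]

# fixed pattern: parse the row being iterated over
news = [parse_row(row) for row in rows]
print(news)  # [{'name': 'row 1'}, {'name': 'row 2'}, {'name': 'row 3'}]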

rcyost commented 11 months ago

Thank you for correcting the code. How do you point this to specific calendars? Changing the URL doesn't change the calendar shown on the page. Thanks!

trueToastedCode commented 11 months ago

Thank you for correcting the code. How do you point this to specific calendars? Changing the URL doesn't change the calendar shown on the page. Thanks!

This will only work on the economic calendar, since the other calendars have a different structure.

You could change the link when initializing Investing, e.g. Investing(uri='https://hello-world.com/').
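
If the goal is narrowing the output rather than loading a different calendar page, another option is to filter the parsed News objects client-side, e.g. by country/currency and relevance (the 'USD' value is an assumption about what the flagCur cell contains; print an entry first to check):

i = Investing()
all_news = i.news() or []

# keep only high-impact events for one country/currency code
filtered = [n for n in all_news if n.country == 'USD' and n.relevance == 3]
for n in filtered:
    print(n)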