Open murnanedaniel opened 1 year ago
The problem is already described in #53, but I can't get it to work.
# Fetch the link stored under 'url' in get_news()'s result and read the
# response's final URL, i.e. where the request ended up after any redirects.
# NOTE(review): get_news() is not shown here; assumed to return a mapping
# with a 'url' key -- confirm.
orig_url = requests.get(get_news()['url']).url
Can you try that?
I've done something like this, in case it helps anyone. I found the answer on Stack Overflow:
# Ref: https://stackoverflow.com/a/59023463/
_ENCODED_URL_PREFIX = "https://news.google.com/rss/articles/"
_ENCODED_URL_PREFIX_WITH_CONSENT = "https://consent.google.com/m?continue=https://news.google.com/rss/articles/"

# One pattern for both link shapes: the plain RSS article link and the same
# link wrapped in the cookie-consent redirect.  The consent alternative is
# listed first only for readability; the two prefixes cannot both match at
# position 0.  The encoded article id runs up to the first "?".
_ENCODED_URL_RE = re.compile(
    rf"^(?:{re.escape(_ENCODED_URL_PREFIX_WITH_CONSENT)}|{re.escape(_ENCODED_URL_PREFIX)})"
    r"(?P<encoded_url>[^?]+)"
)

# Layout of the decoded payload as matched here: the bytes 0x08 0x13 0x22,
# one or more arbitrary bytes (presumably a length prefix -- unconfirmed),
# the article URL, then 0xD2 0x01.  DOTALL is required because the arbitrary
# bytes may include 0x0A, which "." would otherwise refuse to match.
_DECODED_URL_RE = re.compile(
    rb'^\x08\x13".+?(?P<primary_url>http[^\xd2]+)\xd2\x01',
    re.DOTALL,
)


@functools.lru_cache(2048)
def _decode_google_news_url(url: str) -> str:
    """Extract the article URL embedded in an encoded Google News link.

    Results are memoised (up to 2048 distinct links) because the decoding is
    a pure function of ``url``.

    Args:
        url: A Google News RSS article link, optionally wrapped in the
            cookie-consent redirect.

    Returns:
        The article URL found inside the base64 payload.

    Raises:
        ValueError: If ``url`` is not an encoded article link, if its payload
            is not valid base64 (``binascii.Error`` is a ``ValueError``
            subclass), or if the decoded payload does not contain a URL in
            the expected layout.
    """
    encoded_match = _ENCODED_URL_RE.match(url)
    if encoded_match is None:
        raise ValueError(f"Not an encoded Google News article URL: {url!r}")

    # The link carries unpadded base64; surplus "=" is tolerated by the
    # decoder.  Ref: https://stackoverflow.com/a/49459036/
    encoded_text = encoded_match.group("encoded_url") + "==="
    decoded_text = base64.urlsafe_b64decode(encoded_text)

    decoded_match = _DECODED_URL_RE.match(decoded_text)
    if decoded_match is None:
        raise ValueError(f"No article URL found in decoded payload of {url!r}")

    return decoded_match.group("primary_url").decode()


def decode_google_news_url(url: str) -> str:
    """Return the article URL hidden behind a Google News RSS link.

    Links that start with neither the plain Google News article prefix nor
    its cookie-consent-wrapped form are returned unchanged.

    Args:
        url: Any URL.

    Returns:
        The decoded article URL, or ``url`` itself when it is not a Google
        News article link.

    Raises:
        ValueError: If ``url`` looks like a Google News article link but its
            payload cannot be decoded.
    """
    # str.startswith accepts a tuple, so both link shapes are checked at once.
    if url.startswith((_ENCODED_URL_PREFIX, _ENCODED_URL_PREFIX_WITH_CONSENT)):
        return _decode_google_news_url(url)
    return url
Running
gives results such as
Where the URL now directs to a cookie consent screen:
Is there a way to consent to the cookies somehow?