Closed karlicoss closed 1 year ago
Hmm, I guess this should be configurable, right...
Though it does look like all the http URLs (at least in titleUrl
) are google ones:
In [3]: for x in res:
...: if x.titleUrl and x.titleUrl.startswith("http://"):
...: up = urllib.parse.urlsplit(x.titleUrl)
...: items.append(up.netloc)
Counter({'maps.google.com': 1225,
'www.google.com': 1098,
'm.youtube.com': 18,
'productforums.google.com': 17,
'www.youtube.com': 15,
'scholar.google.com': 2,
'books.google.com': 2,
'bp0.blogger.com': 1,
'google.com': 1,
'www.google.co.uk': 1})
Similar results with the internal url
and sourceUrl
from locationInfo
In [8]: Counter(items)
Out[8]: Counter({'www.google.com': 74459, b'': 33304, 'support.google.com': 33245})
don't want just all URLs to be converted (perhaps some websites in your history actually use http....), maybe just the common ones found in exports with a flag to configure this
I guess could make an allowlist and have that in an easy-to-edit place (like http_allowlist.py
, and logger.debug
out all http URLs that don't match, so they're more discoverable/it's easy to make a PR here if there's some new google domain that's returning http URLs
That is a non-trivial amount of overhead, but this is all cached anyways, so I think it's fine
Edit: just gonna leave this diff here (though it isn't fully correct, need to check netlocs):
diff --git a/google_takeout_parser/models.py b/google_takeout_parser/models.py
index 6e7dc4c..30c58ab 100644
--- a/google_takeout_parser/models.py
+++ b/google_takeout_parser/models.py
@@ -25,6 +25,21 @@ from .common import Res
from .log import logger
+def _make_url_https(url: str) -> str:
+ # I could do this with urlparse... but this is probably fine and is faster
+ if url.startswith("http://"):
+ return "https://" + url[len("http://") :]
+ return url
+
+
+# for type hinting, nice to have distinct types. might be able
+# to do with a TypeVar, but fine for now
+def _make_url_https_opt(url: Optional[str]) -> Optional[str]:
+ if url is None:
+ return None
+ return _make_url_https(url)
+
+
def get_union_args(cls: Any) -> Optional[Tuple[Type]]: # type: ignore[type-arg]
if getattr(cls, "__origin__", None) != Union:
return None
diff --git a/google_takeout_parser/parse_html/activity.py b/google_takeout_parser/parse_html/activity.py
index a2657fb..c05423e 100644
--- a/google_takeout_parser/parse_html/activity.py
+++ b/google_takeout_parser/parse_html/activity.py
@@ -11,7 +11,7 @@ from urllib.parse import urlparse, parse_qs
import bs4
from bs4.element import Tag, PageElement
-from ..models import Activity, Subtitles, LocationInfo
+from ..models import Activity, Subtitles, LocationInfo, _make_url_https_opt
from ..common import Res
from ..log import logger
from .html_time_utils import parse_html_dt
@@ -94,7 +94,9 @@ def _parse_subtitles(
else:
raise RuntimeError(f"Unexpected Type {tag} {type(tag)}")
- parsed_subs.append(Subtitles(name=clean_latin1_chars(buf), url=url))
+ parsed_subs.append(
+ Subtitles(name=clean_latin1_chars(buf), url=_make_url_https_opt(url))
+ )
return parsed_subs, parse_html_dt(dt_raw, file_dt=file_dt)
@@ -239,9 +241,9 @@ def _parse_caption(
locationInfos.append(
LocationInfo(
name=name,
- url=url,
+ url=_make_url_https_opt(url),
source=source,
- sourceUrl=sourceUrl,
+ sourceUrl=_make_url_https_opt(sourceUrl),
)
)
elif header == "Details:":
@@ -318,8 +320,8 @@ def _parse_activity_div(
return Activity(
header=header,
- title=title_info[0],
- titleUrl=title_info[1], # could be None, matched by model
+ title=title_info.name,
+ titleUrl=_make_url_https_opt(title_info.url), # could be None, matched by model
description=None, # always none since we can't differentiate in HTML parsing
time=dtime,
locationInfos=locationInfos,
diff --git a/google_takeout_parser/parse_json.py b/google_takeout_parser/parse_json.py
index 9fd4972..6e672e4 100644
--- a/google_takeout_parser/parse_json.py
+++ b/google_takeout_parser/parse_json.py
@@ -18,6 +18,8 @@ from .models import (
Location,
PlaceVisit,
CandidateLocation,
+ _make_url_https,
+ _make_url_https_opt,
)
from .common import Res
from .time_utils import parse_json_utc_date
@@ -39,7 +41,9 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
# sometimes it's just empty ("My Activity/Assistant" data circa 2018)
if "name" not in s:
continue
- subtitles.append(Subtitles(name=s["name"], url=s.get("url")))
+ subtitles.append(
+ Subtitles(name=s["name"], url=_make_url_https_opt(s.get("url")))
+ )
# till at least 2017
old_format = "snippet" in blob
@@ -54,7 +58,7 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
yield Activity(
header=header,
title=blob["title"],
- titleUrl=blob.get("titleUrl"),
+ titleUrl=_make_url_https_opt(blob.get("titleUrl")),
description=blob.get("description"),
time=parse_json_utc_date(time_str),
subtitles=subtitles,
@@ -66,9 +70,9 @@ def _parse_json_activity(p: Path) -> Iterator[Res[Activity]]:
locationInfos=[
LocationInfo(
name=locinfo.get("name"),
- url=locinfo.get("url"),
+ url=_make_url_https_opt(locinfo.get("url")),
source=locinfo.get("source"),
- sourceUrl=locinfo.get("sourceUrl"),
+ sourceUrl=_make_url_https_opt(locinfo.get("sourceUrl")),
)
for locinfo in blob.get("locationInfos", [])
],
@@ -215,7 +219,7 @@ def _parse_chrome_history(p: Path) -> Iterator[Res[ChromeHistory]]:
time_naive = datetime.utcfromtimestamp(item["time_usec"] / 10**6)
yield ChromeHistory(
title=item["title"],
- url=item["url"],
+ url=_make_url_https(item["url"]),
dt=time_naive.replace(tzinfo=timezone.utc),
)
except Exception as e:
It might make sense to replace
http://
with https://
for some links, e.g. to youtube videos. For instance, in
Takeout/My Activity/Video Search/MyActivity.{json,html}
might contain http:// links for some old entries. In case of youtube, switching to
https
doesn't really hurt (the http/https are equivalent and both are available), and it might make it easier to consume downstream, e.g. might prevent duplicates. Zulip discussion: https://memex.zulipchat.com/#narrow/stream/279601-hpi/topic/google_takeout_parser/near/279605540