Scraped USGS earthquakes:
git clone https://github.com/simonw/usgs-scraper
cd usgs-scraper
git-history file usgs.db usgs.json --id id --convert '
features = json.loads(content)["features"]
for feature in features:
item = {"id": feature["id"]}
item.update(feature["properties"])
item["latitude"] = feature["geometry"]["coordinates"][1]
item["longitude"] = feature["geometry"]["coordinates"][0]
yield item
'
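The convert script receives the raw file content and yields one flat dict per earthquake. It can be sanity-checked outside of git-history before running the import - a minimal sketch, with an invented sample feature (real usgs.json files hold many):

import json

def convert(content):
    # Same body as the --convert script above
    features = json.loads(content)["features"]
    for feature in features:
        item = {"id": feature["id"]}
        item.update(feature["properties"])
        # GeoJSON coordinates are [longitude, latitude, depth]
        item["latitude"] = feature["geometry"]["coordinates"][1]
        item["longitude"] = feature["geometry"]["coordinates"][0]
        yield item

# Invented sample for illustration only
sample = json.dumps({"features": [{
    "id": "us7000abcd",
    "properties": {"mag": 4.5, "place": "10km N of somewhere"},
    "geometry": {"coordinates": [-122.3, 47.6, 10.0]}
}]})
print(list(convert(sample)))
# [{'id': 'us7000abcd', 'mag': 4.5, 'place': '10km N of somewhere',
#   'latitude': 47.6, 'longitude': -122.3}]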
Houston incidents from https://github.com/adolph/getIncidentsGit. A record looks like this:
{
  "ActiveIncidentDataTable": [
    {
      "Agency": "F",
      "Address": "WESTLAKE PARK BLVD",
      "CrossStreet": "BLK GRISBY RD",
      "KeyMap": "488B",
      "XCoord": "-95630227",
      "YCoord": "29783580",
      "CombinedResponse": "F",
      "CallTimeOpened": "12/06/2021 17:42",
      "IncidentType": "Automatic Alarm",
      "AlarmLevel": "0",
      "NumberOfUnits": "1",
      "Units": "E078;"
    }
  ]
}
So the --convert script is:
incidents = json.loads(content)["ActiveIncidentDataTable"]
for incident in incidents:
x = incident.pop("XCoord")
y = incident.pop("YCoord")
incident["Latitude"] = float(y) / 1000000 if y else None
incident["Longitude"] = float(x) / 1000000 if x else None
call_time = incident.pop("CallTimeOpened")
date, time = call_time.split(" ")
mm, dd, yyyy = date.split("/")
incident["CallTimeOpened"] = "{}-{}-{} {}".format(yyyy, mm, dd, time)
incident["Units"] = [unit for unit in incident["Units"].split(";") if unit]
yield incident
So the recipe is:
git-history file incidents.db incidents.json --convert '
incidents = json.loads(content)["ActiveIncidentDataTable"]
for incident in incidents:
    x = incident.pop("XCoord")
    y = incident.pop("YCoord")
    incident["Latitude"] = float(y) / 1000000 if y else None
    incident["Longitude"] = float(x) / 1000000 if x else None
    call_time = incident.pop("CallTimeOpened")
    date, time = call_time.split(" ")
    mm, dd, yyyy = date.split("/")
    incident["CallTimeOpened"] = "{}-{}-{} {}".format(yyyy, mm, dd, time)
    incident["Units"] = [unit for unit in incident["Units"].split(";") if unit]
    yield incident
' --id Address --id CrossStreet --id CallTimeOpened --ignore-duplicate-ids
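The transforms are easy to verify against the sample record's values before running the import:

# XCoord/YCoord are decimal degrees multiplied by 1,000,000
x, y = "-95630227", "29783580"
print(float(x) / 1000000, float(y) / 1000000)
# -95.630227 29.78358

# MM/DD/YYYY HH:MM becomes sortable YYYY-MM-DD HH:MM
date, time = "12/06/2021 17:42".split(" ")
mm, dd, yyyy = date.split("/")
print("{}-{}-{} {}".format(yyyy, mm, dd, time))
# 2021-12-06 17:42

# The trailing semicolon in "E078;" is dropped by the filter
print([unit for unit in "E078;".split(";") if unit])
# ['E078']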
FARA registration data from https://github.com/simonw/fara-history:
git-history file fara.db \
../fara-history/FARA_All_Registrants.csv \
--repo ../fara-history --id "Registration_Number" \
--branch master --convert '
decoded = content.decode("utf-8")
reader = csv.DictReader(io.StringIO(decoded), dialect="excel")
for row in reader:
    yield dict((key.replace(" ", "_"), value) for key, value in row.items())
' --import io --import csv --ignore-duplicate-ids
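The convert body just normalizes the CSV headers so --id can reference Registration_Number. A standalone check, with an invented two-column sample mimicking the FARA header style:

import csv
import io

# Invented sample rows for illustration only
decoded = "Registration Number,Registrant Name\n1234,Example Corp\n"
reader = csv.DictReader(io.StringIO(decoded), dialect="excel")
for row in reader:
    # Spaces in headers become underscores
    print(dict((key.replace(" ", "_"), value) for key, value in row.items()))
# {'Registration_Number': '1234', 'Registrant_Name': 'Example Corp'}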
Seattle City Light outages from https://github.com/mmarvick/scrape-seattle-city-light-outages:
git-history file seattle-city-light.db outages.json \
--id id --branch master
Package download stats from https://github.com/simonw/package-stats:
git-history file stats.db stats.json --convert '
data = json.loads(content)
for key, counts in data.items():
    for date, count in counts.items():
        yield {
            "package": key,
            "date": date,
            "count": count
        }
' --id package --id date
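stats.json maps package names to {date: count} dictionaries, so the convert script flattens two levels of nesting into one row per package per day. A quick sketch with invented numbers:

import json

# Invented sample matching the stats.json shape
content = json.dumps({
    "datasette": {"2021-12-01": 120, "2021-12-02": 135},
    "sqlite-utils": {"2021-12-01": 98}
})

data = json.loads(content)
for key, counts in data.items():
    for date, count in counts.items():
        print({"package": key, "date": date, "count": count})
# {'package': 'datasette', 'date': '2021-12-01', 'count': 120} ...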
And for releases in that same repo:
git-history file stats.db datasette-app-releases.json \
--namespace releases \
--convert '
data = json.loads(content)
if isinstance(data, dict) and data.get("message"):
    # Probably a rate-limiting error
    return
for row in data:
    version = row["tag_name"]
    for asset in row["assets"]:
        yield {
            "version": version,
            "asset": asset["name"],
            "download_count": asset["download_count"]
        }
' --id version --id asset
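The isinstance() guard matters because GitHub's API returns a {"message": ...} object instead of a list when you hit the rate limit; without it the loop would crash on that snapshot. A sketch of both cases (asset name and counts invented for illustration):

import json

def convert(content):
    data = json.loads(content)
    if isinstance(data, dict) and data.get("message"):
        # Rate-limit error payload, not a list of releases - skip it
        return
    for row in data:
        version = row["tag_name"]
        for asset in row["assets"]:
            yield {
                "version": version,
                "asset": asset["name"],
                "download_count": asset["download_count"]
            }

good = json.dumps([{"tag_name": "0.1.0", "assets": [
    {"name": "Datasette.app.zip", "download_count": 42}]}])
bad = json.dumps({"message": "API rate limit exceeded"})
print(list(convert(good)))
# [{'version': '0.1.0', 'asset': 'Datasette.app.zip', 'download_count': 42}]
print(list(convert(bad)))
# []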
https://github.com/abkfenris/sugarloaf looks like a fun demo for CSV data.
There's a wealth of interesting data in https://github.com/nychealth/coronavirus-data/tree/master/latest
See also https://observablehq.com/@bmschmidt/nyc-test-positivity-december-2021, via https://twitter.com/benmschmidt/status/1473682517312131082
Using this thread to collect these examples - I've not yet decided whether to turn them into live demos or just add a piece of documentation suggesting people try them out.