Closed ogallagher closed 3 years ago
@joshuaye1024 check out data_collector/process/update_db.py to see how the data collector updates the different tables, and as a general guide for how your methods can work.
Note in particular the uses of the trileaf_db.db_client.DBClient.query(sql,args)
method, and how trileaf_db.db_api
will contain constants describing the tables and columns that you’ll be updating.
I’m thinking your insert code will look something like this, once I update trileaf_db.db_api
and the database to include the covid tables.
from typing import Tuple
from pandas import DataFrame
import logging
from logging import Logger
from trileaf_db.db_client import DBClient
from trileaf_db import db_api as api
log:Logger = logging.getLogger(__name__)
def insert_covid_region(dbclient:DBClient, region_id:str) -> bool:
"""Insert new covid region.
Returns:
True if the region was inserted.
"""
inserts:int = dbclient.query(
sql='insert into {t_cr}({id}) values (%s)'.format(
t_cr=api.TABLE_COVID_REGIONS,
id=api.COVID_REGIONS_ID
),
args=region_id
)
if inserts > 0:
log.info('added region {} to the database'.format(region_id))
return True
else:
log.warning('region {} not added to db'.format(region_id))
return False
# end insert_covid_region
# add region stats
def insert_covid_region_stats(dbclient:DBClient,
region_id:str,
region_stats:DataFrame) -> int:
"""Insert new covid region stats.
Args:
dbclient = DBClient instance.
region_id = covid region code (US state/territory abbreviation).
region_stats = DataFrame of covid stats, where each row corresponds to a datetime.
Returns:
inserted = number of new rows inserted into the covid_region_stats table. Ideally, this
is equal to region_stats.shape[0].
"""
# check if region already in database
# exists will be [[1]] if found in db, [] if not
exists:Tuple = dbclient.query(
sql='select 1 from {t_cr} where {id}=%s'.format(
t_cr=api.TABLE_COVID_REGIONS,
id=api.COVID_REGIONS_ID
),
args=region_id
)
if len(exists) == 0:
# region is new; add to database
insert_covid_region(dbclient, region_id)
# else, region already in database
log.debug('region {} in database'.format(region_id))
inserts:int = dbclient.multiquery(
sql='insert into {t_crs}({id},{dt},{pos},{etc}) values (%s,%s,%s)'.format(
t_crs=api.TABLE_COVID_REGION_STATS,
id=api.COVID_REG_STAT_REGION_ID,
dt=api.COVID_REG_STAT_DATETIME,
pos=api.COVID_REG_STAT_POSITIVES,
etc='...'
),
# assumes columns in region_stats align with the order of the column names
# enumerated above
args=region_stats
)
# as an alternative to dbclient.multiquery, dbclient.query with each row in
# region_stats.itertuples() is what I did for data_collector
return inserts
# end insert_covid_region_stats
def main():
# load db credentials from ./res/secrets/db_credentials.txt
db_client.init()
# create connection client
dbclient:DBClient = DBClient()
if dbclient.connected():
# fetch historical region stats from covid api
# this doesn't show how to skip older data that's already collected
region_code:str = 'xx'
region_stats:DataFrame = covid_api_fetch_region_stats(region_code)
inserts:int = insert_covid_region_stats(
dbclient,
region_code,
region_stats
)
log.info('inserted {} new rows for region {}'.format(
inserts,
region_code
))
else:
log.error('failed to connect to trileaf database')
# end main
trileaf-db
into Josh’s environment and ensure it runs