Closed carllcchen closed 3 years ago
import json from datetime import datetime from pytz import timezone from urllib.request import urlopen import sys import csv from io import StringIO
def load_json():
    """Fetch the Utah COVID-19 LTC facility-impacts layer from ArcGIS.

    Returns:
        dict: the parsed JSON response (a FeatureServer query result with a
        "features" list).

    Exits the process with status 1 when the HTTP request does not return 200,
    so automation can detect the failure.
    """
    # NOTE: stray spaces inside the original URL string were removed; some
    # urllib versions reject URLs with surrounding whitespace.
    url = ("https://services6.arcgis.com/KaHXE9OkiB9e63uE/ArcGIS/rest/services/COVID19_Long_Term_Care_Facility_Impacts/FeatureServer/273/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=true&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token=")
    response = urlopen(url)
    status_code = response.getcode()
    if status_code != 200:
        # str() is required here: getcode() returns an int, and the original
        # "..." + status_code raised TypeError instead of printing the error.
        print("URL request failed with status code: " + str(status_code))
        sys.exit(1)
    # Return the parsed payload so callers can actually use it (the original
    # assigned a local and returned nothing).
    return json.loads(response.read())
# Output column order expected by downstream consumers. The "blankN" columns
# are intentional placeholders; the "Postive" misspelling is kept on purpose —
# it is the header name the downstream pipeline expects.
FIELDNAMES = [
    "Date Collected", "State", "blank1",
    "blank2", "Facility_Name", "Facility_Type", "blank3",
    "blank4", "blank5", "blank6", "blank7", "blank8",
    "Resolved_Y_N", "blank9", "Postive_Patients_Desc", "blank10", "blank11",
    "blank12", "blank13", "blank14", "blank15", "blank16",
    "Unresolved_Postive_Patients_Desc", "blank17", "blank18", "blank19",
    "blank20", "blank21", "blank22", "blank23", "Dashboard_Display",
]


def build_csv(features_list, date_collected=None):
    """Render the ArcGIS feature list as a CSV string.

    Args:
        features_list: the "features" list from the ArcGIS query response;
            each item is expected to carry an "attributes" dict with the
            Facility_Name / Facility_Type / Resolved_Y_N /
            Postive_Patients_Desc / Dashboard_Display keys.
        date_collected: value for the "Date Collected" column; defaults to
            today's date (US/Eastern) as YYYYMMDD.

    Returns:
        str: CSV text including a header row.
    """
    if date_collected is None:
        date_collected = datetime.now(timezone('US/Eastern')).strftime('%Y%m%d')

    # Build each row as a dict and let csv.DictWriter handle quoting/escaping.
    # The previous approach concatenated a JSON string by hand and re-parsed
    # it per row, which broke on any quote, backslash, or comma in the data.
    buffer = StringIO()
    writer = csv.DictWriter(buffer, FIELDNAMES)
    writer.writeheader()
    for item in features_list:
        attrs = item["attributes"]
        positive_desc = attrs["Postive_Patients_Desc"]
        resolved = attrs["Resolved_Y_N"]

        row = {name: "" for name in FIELDNAMES}  # all blanks default to ""
        row.update({
            "Date Collected": date_collected,
            "State": "UT",
            "Facility_Name": attrs["Facility_Name"],
            "Facility_Type": attrs["Facility_Type"],
            "Resolved_Y_N": resolved,
            # "No Resident Cases" is the source's way of reporting zero.
            "Postive_Patients_Desc": (
                "0" if positive_desc == "No Resident Cases" else positive_desc
            ),
            # Unresolved column carries the raw description only while the
            # outbreak is still open (Resolved_Y_N == "N").
            "Unresolved_Postive_Patients_Desc": (
                positive_desc if resolved == "N" else ""
            ),
            "Dashboard_Display": attrs["Dashboard_Display"],
        })
        writer.writerow(row)
    return buffer.getvalue()


if __name__ == "__main__":
    # Print to STDOUT (not a file) so GH Actions automation can capture it.
    dict_data = load_json()
    if dict_data is not None:
        print(build_csv(dict_data["features"]))
On Tue, Jan 12, 2021 at 2:56 PM Zach Lipton notifications@github.com wrote:
@zachlipton requested changes on this pull request.
Thanks for this! A few comments inline. Sorry if this seems like a lot or if anything is unclear. By all means shoot me a message if anything doesn't make sense or you're not sure how to do something and I'll do my best to help.
In data-collection-scripts/ltc-scraper/Utah_Data_Extract.py https://github.com/COVID19Tracking/covid-tracking-data/pull/220#discussion_r556134879 :
@@ -0,0 +1,47 @@ +import json +from datetime import datetime +from pytz import timezone +from urllib.request import urlopen + +def load_json():
- url = ("https://services6.arcgis.com/KaHXE9OkiB9e63uE/ArcGIS/rest/services/COVID19_Long_Term_Care_Facility_Impacts/FeatureServer/273/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=true&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token=")
- response = urlopen(url)
- if response.getcode() == 200:
- source = response.read()
- dict_data = json.loads(source)
- else:
- print("Error")
If this fails for any reason, let's print an error message with the error code and then sys.exit(1) or some other non-zero return code so that the script terminates and automation knows the script failed and can report the error
In data-collection-scripts/ltc-scraper/Utah_Data_Extract.py https://github.com/COVID19Tracking/covid-tracking-data/pull/220#discussion_r556135010 :
+from pytz import timezone +from urllib.request import urlopen + +def load_json():
- url = ("https://services6.arcgis.com/KaHXE9OkiB9e63uE/ArcGIS/rest/services/COVID19_Long_Term_Care_Facility_Impacts/FeatureServer/273/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=true&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token=")
- response = urlopen(url)
- if response.getcode() == 200:
- source = response.read()
- dict_data = json.loads(source)
- else:
- print("Error")
print(response.read())
data = json.loads(response.read())
f = open("Facilities_with_Active_Outbreaks.json", "r")
data = f.read()
Remove unnecessary comments please
In data-collection-scripts/ltc-scraper/Utah_Data_Extract.py https://github.com/COVID19Tracking/covid-tracking-data/pull/220#discussion_r556137965 :
+def load_json():
- url = ("https://services6.arcgis.com/KaHXE9OkiB9e63uE/ArcGIS/rest/services/COVID19_Long_Term_Care_Facility_Impacts/FeatureServer/273/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=true&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token=")
- response = urlopen(url)
- if response.getcode() == 200:
- source = response.read()
- dict_data = json.loads(source)
- else:
- print("Error")
print(response.read())
data = json.loads(response.read())
f = open("Facilities_with_Active_Outbreaks.json", "r")
data = f.read()
- CSV_file = open("CSV_Data_Extract.csv", "w")
- if not dict_data is None:
I don't think load_json() is being called anywhere? So dict_data has no data here. Maybe make load_json() return the dict and then use it here?
In data-collection-scripts/ltc-scraper/Utah_Data_Extract.py https://github.com/COVID19Tracking/covid-tracking-data/pull/220#discussion_r556138494 :
- dict_data = json.loads(source)
- else:
- print("Error")
print(response.read())
data = json.loads(response.read())
f = open("Facilities_with_Active_Outbreaks.json", "r")
data = f.read()
- CSV_file = open("CSV_Data_Extract.csv", "w")
- if not dict_data is None:
- features_list = dict_data["features"]
- for item in features_list:
- fmt = datetime.today().strftime('%Y%m%d')
- now_time = datetime.now(timezone('US/Eastern'))
- line_data = now_time.strftime(fmt)
I think this can be simplified to datetime.now(timezone('US/Eastern')).strftime('%Y%m%d')
In data-collection-scripts/ltc-scraper/Utah_Data_Extract.py https://github.com/COVID19Tracking/covid-tracking-data/pull/220#discussion_r556139318 :
+from urllib.request import urlopen + +def load_json():
- url = ("https://services6.arcgis.com/KaHXE9OkiB9e63uE/ArcGIS/rest/services/COVID19_Long_Term_Care_Facility_Impacts/FeatureServer/273/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=*&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=true&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pjson&token=")
- response = urlopen(url)
- if response.getcode() == 200:
- source = response.read()
- dict_data = json.loads(source)
- else:
- print("Error")
print(response.read())
data = json.loads(response.read())
f = open("Facilities_with_Active_Outbreaks.json", "r")
data = f.read()
- CSV_file = open("CSV_Data_Extract.csv", "w")
Instead of writing the output to a file, please output it to STDOUT when the script finishes (just a print statement is fine). This will help us hook it up to GH Actions.
In data-collection-scripts/ltc-scraper/Utah_Data_Extract.py https://github.com/COVID19Tracking/covid-tracking-data/pull/220#discussion_r556146380 :
- else:
- print("Error")
print(response.read())
data = json.loads(response.read())
f = open("Facilities_with_Active_Outbreaks.json", "r")
data = f.read()
- CSV_file = open("CSV_Data_Extract.csv", "w")
- if not dict_data is None:
- features_list = dict_data["features"]
- for item in features_list:
- fmt = datetime.today().strftime('%Y%m%d')
- now_time = datetime.now(timezone('US/Eastern'))
- line_data = now_time.strftime(fmt)
- line_data += "," + "UT"
Please don't output CSV data by trying to build it up from scratch this way. This will break the file format if any of the data fields contains a comma, newline, or anything else that needs to be escaped. The easiest thing to do is to transform the data into a Python array-of-dictionaries and use Python's csv library https://realpython.com/python-csv/#writing-csv-file-from-a-dictionary-with-csv to write it out, which will take care of outputting a header with the column names, formatting everything, and escaping the data safely for you. Something more-or-less like (untested):
writer = csv.DictWriter(si, fieldnames=["Date Collected", "State", "blank"]) writer.writeheader() writer.writerow({'Date Collected': '20210112', 'State': 'UT', 'blank': ''}) output = si.getvalue() print(output)
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/COVID19Tracking/covid-tracking-data/pull/220#pullrequestreview-566750725, or unsubscribe https://github.com/notifications/unsubscribe-auth/AHJYINUQG6D5D2PELYIUYQTSZTHR5ANCNFSM4V72I57A .
Utah Data Extraction from Utah Daily Dashboard