degauss-org / dht

DeGAUSS helper tools
http://degauss.org/dht
GNU General Public License v3.0
4 stars 1 fork source link

updating institutional addresses #97

Closed erikarasnick closed 1 year ago

erikarasnick commented 1 year ago

3333 Burnett Ave CINCINNATI Ohio 45229 is geocoded to Burnet Ave Miami Heights OH 45229

Should we add this Burnett misspelling to be filtered out as an institutional address?

erikarasnick commented 1 year ago

also fix Ronald McDonald address

should be 341 not 350 ?

cole-brokamp commented 1 year ago

Here is the list Stu sent by email: JFS_Offices.xlsx

and the R script:

```r
# Geocode HealthVine addresses
#
# Pipeline: pull the HealthVine patient addresses from the Caboodle warehouse,
# normalize the address strings (case, apartment/unit designations, PO boxes,
# common local misspellings), flag institutional addresses (JFS offices, CCHMC,
# Ronald McDonald House, etc.) by regex, and export the not-yet-geocoded
# addresses for a DeGAUSS geocoding run.

# Get HV population
library(DBI)
library(glue)
library(tidyverse)
library(janitor)

# Caboodle production warehouse (ODBC DSN configured on this machine)
caboodle_con <- dbConnect(odbc::odbc(), "CaboodleProd")

# Observed misspellings of "CINCINNATI" in the address data
spellings_of_cincinnati <- c(
  'CICINNATI', 'CINAINNCTI', 'CINCI', 'CINCINANTI', 'CINCINATI',
  'CINCINATTI', 'CINCINNAI', 'CINCINNAIT', 'CINCINNAT', 'CINCINNATIA',
  'CINCINNTI', 'CINCNNATI', 'CINTI', 'CIONCINNATI', 'CNCINNATI'
)

# Observed variants of "LIBERTY TOWNSHIP"
spellings_of_liberty_township <- c(
  'LIBERTY TOWNSHIP', 'LIBERTY TOWNSHP', 'LIBERTY TWNSHP', 'LIBERTY TWP'
)

# Regex of needed JFS and other institutional addresses.
# Patterns are matched (via fuzzyjoin::regex_inner_join below) against the
# cleaned, upper-cased full address string; character classes absorb common
# typos and abbreviation variants.
institutional_regex_exp <- tribble(
  ~label, ~regex_exp,
  'Adams_JFS', '482 RICE DR.*456',
  'Adams_JFS', '300 N[ORTH\\.]* WI.* 456',
  'Allen_JFS', '123 W[\\.EST]* SP.* 458',
  'Allen_JFS', '951 COM.* P[ARKWY]+.* 458',
  'Brown_JFS', '775 M[\\.OUNT]* *O[RABN]* .*451',
  'Butler_JFS', '300 [NM][ORTH\\.]*.* FAIR.* 450',
  'Butler_JFS', '315 HIG.* S.* 450',
  'Clark_JFS', '1345 LAG.* A.* 455',
  'Clermont_JFS', '2400 +CL[AIERMOUNT]+ .*(CEN[TER]+|D[RIVE]+|BL).* OH.* 451',
  'Clinton_JFS', '1025 S[OUTH\\.]* *S[\\.OUTH]*.* 451',
  'Delaware_JFS', '145 N[\\.ORTH]* UN.* 430',
  'Fairfield_JFS', '239 W[\\.EST]* MA.* S.* 431',
  'Franklin_JFS', '855 W[\\.EST]* MO.* S.* 432',
  'Fayette_JFS', '133 S[\\.OUTH]* M.* W.*C.*H.* 431',
  'Gallia_JFS', '848 (THIRD|3RD) A.* 456',
  'Greene_JFS', '(601|541) LE[DBETR]* .*453',
  'Hamilton_JFS', '222 (E[\\.ASTY ]*)*C[ENTRALUMCY]+ *P[ARKWYU ]+.*452',
  'Hardin_JFS', '175 W[\\.EST]* FRANK.* S.* 433',
  'Highland_JFS', '1575 N[ORTH\\.]* HIGH.*451',
  'Licking_JFS', '74 S[OUTH\\.]* (SEC|2ND).* 430',
  'Lorain_JFS', '226 MID.* 440',
  'Lorain_JFS', '42485 N[\\.ORTH]* RI.* R.* 440',
  'Lucas_JFS', '3737 SYLV.* A.* 436',
  'Lucas_JFS', '705 ADA.* S.* 436',
  'Montegomery_JFS', '3304 N[ORTHJ\\.]* *M[AIN]* .*454',
  'Montegomery_JFS', '1111 S[OUTH\\.]* ED.*M.*454',
  'Pike_JFS', '525 WALN.* S.* 456',
  'Preble_JFS', '1500 PARK AV.*453',
  'Richland_JFS', '731 SCHOL+ R.* 449',
  'Richland_JFS', '171 PARK A.* 449',
  'Ross_JFS', '475 WESTERN A.* 456',
  'Scioto_JFS', '710 C[OURT]+ S.*456',
  'Scioto_JFS', '3940 G[ALI]+ S.*456',
  'Seneca_JFS', '900 E[AST\\.]* C.* R.* 448',
  'Summit_JFS', '1180 S[\\.OUTH]* MA.* 443',
  'Summit_JFS', '264 S[\\.OUTH]* ARLI.* 443',
  'Union_JFS', '940 LON.* A.* 430',
  'Warren_JFS', '416 S[\\.OUTH]* *E[\\.AST]* .*450',
  'Warren_JFS', '2282 REEVES R.* 444',
  'Indiana_JFS', '230 ((MARY)|(MERRY)) ((AV)|(ST)).*470',
  'Indiana_JFS', '12048 S[AINT]+ MARYS* .*47',
  'Indiana_JFS', '125 N[ORTH\\.]* WALNUT.*470',
  'Indiana_JFS', '506 FERRY ST[RETS56 ]* .*470',
  'Kentucky_JFS', '601 W[ASHINGTO]*N A.*NEW.*410',
  'Kentucky_JFS', '130 W.*43.*ST.*COVING.*410',
  'Kentucky_JFS', '8311 U *S.*42.*KENTUCKY 410',
  'JFS', 'J[OBS ]*(AND )*F[AMILYES ]*S',
  'JFS', 'PROTECTIVE',
  'JFS', 'DCHS',
  'JFS', 'MCCS',
  'JFS', 'HUMAN SER',
  'JFS', 'SERVICES',
  'JFS', 'CHILD.*(PROTECT|SERV)',
  # NOTE(review): per issue discussion, the street number may be 341 rather
  # than 350 -- verify before relying on the numeric half of this pattern
  'Ronald_McDonald', '(350 E[RKIU]+[RKEINBCGH]+.*45[0-9]+9)|(RONALD.*HOUSE)',
  'CCHMC', '3333 BUR[NET]+.*A.* 45',
  'CCHMC_Liberty', '7777 YANK.* 4504',
  'St_Joes', '(10722\\sWYS.*OH)|(S[AIN]*T\\sJO[SEPH]* (OR(PHANGE)*|HOME))',
  'Bethany_House', '(1836 FAIR *M[OU]+NT)|(BETHANY HOUSE).* 4521',
  'Adoption_professionals', '2758 ERIE AVE.* 45208'
)

# Some addresses for JFS offices that don't match to regex join -- see note at
# bottom of script for how these are identified
not_regex_match_jfs <- tribble(
  ~label, ~full_address_clean,
  'Adams_JFS', '482 RICE DRIVE WEST UNION OH',
  'Adams_JFS', '482 RICE DRIVE WEST UNION OH 43358',
  'Butler_JFS', '3000 NORTH FAIR HAMILTON OH 45011',
  'Butler_JFS', '3000 N FAIR AVE HAMILTON OH 45011',
  'Butler_JFS', '300 N FAIR AVE HAMILTON OH 45202',
  'Clinton_JFS', '125 S SOUTH ST WILMINGTON OH 45177',
  'Hamilton_JFS', '222 E CENTRAL CINCINNATI OH 45202',
  'Hamilton_JFS', '22 E CENTRAL PKWY CINCINNATI OH 45202',
  'Hamilton_JFS', '222 E CENTRAL PKWY CINCINNATI OH 25402',
  'Hamilton_JFS', '222 W CENTRAL PKWY CINCINNATI OH 45202',
  'Hamilton_JFS', '222 EAST CENTRAL CINCINNATI OH 45202'
)

# Addresses for the HealthVine registry population
hv_addresses <- dbGetQuery(caboodle_con, statement = "
  with pat_list as (
    select distinct dm.PatientDurableKey
    from HlthvnCsHstRegRegistryDataMartX dm
  )
  select ad.Address, ad.City, ad.StateOrProvinceAbbreviation,
         ad.PostalCode, ad.AddressKey
  from PatientDim pd
  inner join pat_list pl on pd.DurableKey=pl.PatientDurableKey
  inner join AddressDim ad on pd.AddressKey=ad.AddressKey
  where ad.AddressKey>0
")

# bring in addresses and manipulate certain subsets ----
hv_addresses_processed <- hv_addresses %>%
  unique() %>%
  mutate(
    address_clean = toupper(Address),
    # some apartments at start of address
    address_clean = gsub('(^(BLDG|LOT|APT|UNIT |#) *#*\\s*[A-Z0-9 ]* )(?=([0-9]*\\s[A-Z]*))', '', address_clean, perl = TRUE),
    # commonly mistyped address; for some reason WNorth works better in the
    # geocoder than W North
    address_clean = gsub(' WN BEND', ' WNORTH BEND', address_clean),
    address_clean = gsub('W(EST)* *NORTH *BEND', 'WNORTH BEND', address_clean),
    # ST abbreviation is tripping up geocoder
    address_clean = gsub(' ST (?=(LEGER|LAWRENCE|ALBANS|LEO|CLAIR|MICHAEL|WILLIAM|DOMINIC|CHARLES|JAMES|JOHN|ELMO|MATTHEW))', ' SAINT ', address_clean, perl = TRUE),
    # ST ANDREW is better than SAINT ANDREW; for some reason the abbreviation
    # is tripping up the geocoder
    address_clean = gsub(' SAINT (?=(ANDREW))', ' ST ', address_clean, perl = TRUE),
    # removes 1/2 from addresses
    address_clean = gsub('1/2', '', address_clean),
    # removes street_num-apt_num constructs
    address_clean = sub('(\\d+?)(?:\\-\\d+)', '\\1', address_clean),
    # helpful to get other characters out at this point
    address_clean = gsub('[^A-Z0-9 #]', ' ', address_clean),
    # mark addresses as containing a PO Box; may want to treat these differently
    po_box_flag = str_detect(address_clean, 'P *O *BOX'),
    # removes PO BOX if something precedes it
    address_clean = sub('(?<=.)P\\s*O\\s*BOX[\\s#]*[0-9]+\\b', '', address_clean, perl = TRUE),
    # removes PO Box if something comes after it
    address_clean = sub('(^P\\s*O\\s*BOX\\s*[0-9]+ )(.*)', '\\2', address_clean),
    # some apartments at start of address
    address_clean = gsub('(?<=[0-9] )((APT|UNIT |#) *#*\\s*[A-Z0-9 ]* )(?=[A-Z])', '', address_clean, perl = TRUE),
    # get rid of unit designations
    address_clean = gsub('( (APT|APARTMENT|UNIT |B(UI)*LD(IN)*G|#|SUITE).*\\b|\\sS*LOT\\.*#*(\\s*\\d+|\\s+\\S)\\w*\\b)', '', address_clean),
    # State Route addresses cause problems when they have ST RT, so replacing
    address_clean = sub('(ST RO*U*TE*)(?= \\d)', 'STATE ROUTE', address_clean, perl = TRUE),
    # get rid of # at this point
    address_clean = gsub('#', ' ', address_clean),
    address_clean = str_squish(address_clean),
    # an address ends with just PO BOX; remove that box label
    address_clean = gsub('PO BOX$', '', address_clean, perl = TRUE),
    # some addresses have a letter right after the number; insert a space
    address_clean = sub('^([0-9]*)([A-Z])', '\\1 \\2', address_clean, perl = TRUE),
    city_clean = str_squish(toupper(City)),
    city_clean = if_else(city_clean %in% spellings_of_cincinnati, 'CINCINNATI', city_clean),
    city_clean = if_else(city_clean %in% spellings_of_liberty_township, 'LIBERTY', city_clean),
    # this one helps geocoding in our region, but might be good to validate its impact
    city_clean = gsub(' TOWNSHIP', '', city_clean),
    # MOUNT works better than MT in our region for city names
    city_clean = sub('MT\\.* *(?=[A-Z])', 'MOUNT ', city_clean, perl = TRUE),
    # get punctuation out of city
    city_clean = gsub('[^A-Z0-9 #-]', ' ', city_clean),
    # FIX: was trim(PostalCode); no loaded package exports a plain-space trim()
    # (it resolved to glue::trim(), which only strips newline-based indentation),
    # so stray leading/trailing spaces in PostalCode were never removed.
    # trimws() is the base-R function intended here.
    zip_clean = substr(trimws(PostalCode), 1, 5),
    state_clean = gsub('\\*Unspecified', '', StateOrProvinceAbbreviation, perl = TRUE),
    # Some corrections from over the years
    # Covington's Saint Clair matches better to St Clair
    address_clean = if_else(city_clean == 'COVINGTON', gsub('SAINT CLAIRE', 'ST CLAIRE', address_clean), address_clean),
    # there is a St Leo listed in 45211 in the geocoder
    address_clean = if_else(zip_clean == '45211', gsub('SAINT LEO', 'ST LEO', address_clean), address_clean),
    # Clovernoll is misspelt and geocoder doesn't resolve
    address_clean = if_else(zip_clean == '45231', gsub(' CLOVERKNOLL', ' CLOVERNOLL', address_clean), address_clean),
    # GHOLSON AVE in 45229 gets misspelt GOSHEN
    address_clean = if_else(zip_clean == '45229', gsub(' GOSHEN ', ' GHOLSON ', address_clean), address_clean),
    # Menominee Dr gets various m/n combinations
    address_clean = if_else(zip_clean == '45251', gsub(' ME[MN]O[MN]INEE ', ' MENOMINEE ', address_clean), address_clean),
    # Someone added an 'L' to Wabash
    address_clean = if_else(zip_clean == '45215', gsub(' WALBASH ', ' WABASH ', address_clean), address_clean),
    # Misspelling of Dutch
    address_clean = if_else(zip_clean == '45232', gsub(' DUCTH C', ' DUTCH C', address_clean), address_clean),
    # Someone typed in "Allen" instead of "Island" most likely
    address_clean = if_else(zip_clean == '45237', gsub(' RHODE ALLEN', ' RHODE ISLAND', address_clean), address_clean),
    # The extra s throws off the geocoder
    address_clean = if_else(zip_clean == '45238', gsub(' E TOWERS', ' E TOWER', address_clean), address_clean),
    # Needs the s to match properly
    address_clean = if_else(zip_clean == '45238', gsub(' FOUR TOWER ', ' FOUR TOWERS ', address_clean), address_clean),
    # an address that is consistently spelt wrong
    address_clean = if_else(zip_clean == '45239', gsub(' LABOITEAUS ', ' LA BOITEAUX ', address_clean), address_clean),
    # an address that is consistently spelt wrong
    address_clean = if_else(zip_clean == '45239', gsub(' EASTKNOLL ', ' E KNOLL ', address_clean), address_clean),
    # an address that is consistently spelt wrong
    address_clean = if_else(zip_clean == '45231', gsub(' LAKEKNOLL ', ' LAKENOLL ', address_clean), address_clean),
    # an address that is consistently spelt wrong
    address_clean = if_else(zip_clean == '45231', gsub(' LAKE KNOLL ', ' LAKENOLL ', address_clean), address_clean),
    # 21st block of Loth is in 45202 in the geocoder; some family members were
    # geocoded and others not because of the incorrect zip
    zip_clean = if_else((str_detect(address_clean, '21\\d+ LOTH ST') & zip_clean == '45219'), '45202', zip_clean),
    begins_with_number_and_space = str_detect(address_clean, '^[0-9]+ [0-9A-Z]'),
    street_number = str_extract(address_clean, '^[0-9]+'),
    street_name = str_extract(address_clean, '[A-Z].*'),
    full_address_clean = paste(address_clean, city_clean, state_clean, zip_clean),
    full_address_clean = str_squish(full_address_clean)
  ) %>%
  unique()

# flag jfs/institutional addresses ----
institutional_matches <- hv_addresses_processed %>%
  select(begins_with_number_and_space, street_number, street_name, city_clean, full_address_clean) %>%
  unique() %>%
  fuzzyjoin::regex_inner_join(institutional_regex_exp, by = c(full_address_clean = 'regex_exp')) %>%
  select(label, full_address_clean) %>%
  filter(!is.na(label)) %>%
  dplyr::union(not_regex_match_jfs) %>%
  group_by(full_address_clean) %>%
  # NOTE(review): length(label) is a scalar within each group, so
  # which.max(length(label)) is always 1 -- this keeps the first row per
  # address. If the intent was to prefer the longest/most specific label,
  # this likely should be which.max(nchar(label)); confirm intent.
  slice(which.max(length(label))) %>%
  ungroup()

output_hv_address_mapping <- hv_addresses_processed %>%
  select(Address, City, StateOrProvinceAbbreviation, PostalCode, AddressKey,
         full_address_clean, po_box_flag, begins_with_number_and_space) %>%
  unique() %>%
  dplyr::left_join(institutional_matches, by = 'full_address_clean')

write_csv(output_hv_address_mapping, glue("current_caboodle_hv_address_mapping.csv"))

# Previously geocoded addresses accumulated to date
geocoded_addresses <- read_csv('to_date_HV_geocode.csv', na = 'NA',
                               col_types = cols(.default = col_character(),
                                                lat = col_double(),
                                                lon = col_double(),
                                                score = col_double())) %>%
  clean_names()

# Addresses that have not been geocoded yet
unique_addresses <- hv_addresses_processed %>%
  dplyr::anti_join(geocoded_addresses, by = c('full_address_clean')) %>%
  filter(begins_with_number_and_space) %>%
  select(address = full_address_clean) %>%
  unique()

# Output addresses so that they can be geocoded ----
write_csv(unique_addresses, glue("C:/DeGauss/{lubridate::today()}_caboodle_hv_addresses.csv"))

# Use DeGAUSS, then come back and import and check geocoding ----
geocode_import_cols <- cols(
  address = col_character(),
  matched_street = col_character(),
  matched_zip = col_character(),
  matched_city = col_character(),
  matched_state = col_character(),
  lat = col_double(),
  lon = col_double(),
  score = col_double(),
  precision = col_character(),
  geocode_result = col_character()
)

new_geocoded <- read_csv(glue("C:/DeGauss/{lubridate::today()}_caboodle_hv_addresses_geocoded_v3.0.2_census_block_group_0.6.0_2020_census_block_group_0.6.0_2010.csv"),
                         col_types = cols(.default = col_character(),
                                          lat = col_double(),
                                          lon = col_double(),
                                          score = col_double())) %>%
  clean_names() %>%
  rename(full_address_clean = address)

# Append the new results to the running geocode file
geocoded_addresses %>%
  union(new_geocoded) %>%
  write_csv('to_date_HV_geocode.csv')

# used fuzzy matching to find some addresses that didn't get matched with regex
# Need to move further down code but for reference:
# jfs_addresses<-readxl::read_xlsx('JFS_Offices.xlsx')
#
# jfs_street_names<-jfs_addresses %>%
#   select(street_number,street_name,city) %>%
#   unique() %>%
#   mutate(street_name_expand = sub('N ','North ',street_name),
#          street_name_expand = sub('S ','South ',street_name_expand),
#          street_name_expand = sub('E ','East ',street_name_expand),
#          street_name_expand = sub('W ','West ',street_name_expand),
#          street_name_expand = sub('SR-','State Route ',street_name_expand),
#          street_name_expand = sub('St Rte','State Route ',street_name_expand)
#   ) %>%
#   pivot_longer(cols=c('street_name','street_name_expand'),values_to = 'street_name') %>%
#   select(-name) %>%
#   unique() %>%
#   mutate_all(~toupper(.))
#
# fuzzy_join_to_processed_addresses<-fuzzyjoin::stringdist_left_join(jfs_address_match %>%
#     filter(begins_with_number_and_space,!is.na(street_name)),
#   jfs_street_names,by=c('street_number','street_name','city_clean'='city'),
#   method='jw',
#   max_dist=.12,
#   ignore_case=TRUE,
#   distance_col='dist')
#
# no_match_fuzzy<-fuzzy_join_to_processed_addresses %>%
#   filter(is.na(label),!is.na(street_name.y)) %>%
#   select(full_address_clean) %>% unique()
```
cole-brokamp commented 1 year ago

I keep struggling with how we can best support Cincinnati institutional addresses without interfering with users in other cities (e.g., 3333 Burnet Ave, Syracuse, NY 13206).

First thought is to use a DeGAUSS argument, but that is already taken for score threshold.

How can we provide a DeGAUSS user a way to optionally filter these out without using a new argument?