Open andrewallenbruce opened 8 months ago
library(nppez)
library(fs)
library(tidyverse)
library(janitor)
# Convert every numeric column of a data frame to character.
#
# df: a data frame (or tibble).
# Returns `df` with each column for which `is.numeric()` is TRUE passed
# through `as.character()`; all other columns are left as they are.
df2chr <- function(df) {
  dplyr::mutate(
    df,
    dplyr::across(
      .cols = dplyr::where(is.numeric),
      .fns = as.character
    )
  )
}
# Scratch directory under the current working directory; created up front so
# the download and extraction steps have somewhere to write.
test <- fs::path(fs::path_wd(), "inst/tmp")
fs::dir_create(path = test)
test
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp
# Query the available NPPES release files; with `save = TRUE` and a `path`
# supplied, the recorded directory listing later shows a download log CSV
# written next to the other files.
x <- nppez::ask(path = test, save = TRUE)
#> Download Time: 0.5 sec elapsed
x
#> # A tibble: 5 × 4
#> file url date size
#> <chr> <chr> <date> <fs::b>
#> 1 NPPES_Data_Dissemination_March_2024.zip https://… 2024-03-11 935.96M
#> 2 NPPES_Deactivated_NPI_Report_031124.zip https://… 2024-03-11 1.96M
#> 3 NPPES_Data_Dissemination_030424_031024_Weekly.zip https://… 2024-03-04 3.83M
#> 4 NPPES_Data_Dissemination_031124_031724_Weekly.zip https://… 2024-03-11 3.8M
#> 5 NPPES_Data_Dissemination_031824_032424_Weekly.zip https://… 2024-03-18 4.31M
# Download one weekly archive from the listing in `x` into the scratch
# directory. The value printed below is a vector of file paths in that
# directory.
y <- nppez::grab(
  path = test,
  obj = x,
  files = "NPPES_Data_Dissemination_030424_031024_Weekly.zip"
)
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp
#> ├── NPPES_Data_Dissemination_030424_031024_Weekly.zip
#> ├── NPPES_Data_Dissemination_March_2024.csv
#> └── NPPES_Download_Log_2024-03-30.csv
y
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/NPPES_Data_Dissemination_030424_031024_Weekly.zip
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/NPPES_Data_Dissemination_March_2024.csv
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/NPPES_Download_Log_2024-03-30.csv
# List the contents of the zip archive(s) found in the scratch directory.
# The printed result is a list of tibbles named by archive.
z <- nppez::peek(path = test)
z
#> $NPPES_Data_Dissemination_030424_031024_Weekly.zip
#> # A tibble: 10 × 4
#> zipfile filename compressed uncompressed
#> <chr> <chr> <fs::byte> <fs::bytes>
#> 1 NPPES_Data_Dissemination_030424_031024_Week… pl_pfil… 202.92K 616.5K
#> 2 NPPES_Data_Dissemination_030424_031024_Week… pl_pfil… 160 578
#> 3 NPPES_Data_Dissemination_030424_031024_Week… otherna… 41.15K 119.9K
#> 4 NPPES_Data_Dissemination_030424_031024_Week… otherna… 53 86
#> 5 NPPES_Data_Dissemination_030424_031024_Week… npidata… 2.61M 28.5M
#> 6 NPPES_Data_Dissemination_030424_031024_Week… npidata… 1.35K 12K
#> 7 NPPES_Data_Dissemination_030424_031024_Week… endpoin… 75.46K 401K
#> 8 NPPES_Data_Dissemination_030424_031024_Week… endpoin… 154 431
#> 9 NPPES_Data_Dissemination_030424_031024_Week… NPPES_D… 459.91K 556.2K
#> 10 NPPES_Data_Dissemination_030424_031024_Week… NPPES_D… 460.77K 543.7K
# Pick the CSV members of the weekly archive and extract only those.
#
# The extension is matched with an escaped, end-anchored regex. A bare ".csv"
# pattern is a regular expression in which "." matches any character, so it
# would also accept names that merely contain e.g. "xcsv" somewhere.
files <- z[["NPPES_Data_Dissemination_030424_031024_Weekly.zip"]] |>
  dplyr::filter(
    stringr::str_detect(filename, "\\.csv$")
  ) |>
  dplyr::pull(filename)

# Unpack the selected members into the scratch directory.
zip::unzip(
  zipfile = y[1], # from nppez::grab()
  exdir = test,
  files = files
)
# Full paths of the extracted "fileheader" CSVs, named by file name with the
# ".csv" text removed.
headers <- stringr::str_c(
  test,
  stringr::str_subset(files, "fileheader"),
  sep = "/"
)
headers_names <- stringr::str_remove_all(
  basename(headers),
  pattern = stringr::fixed(".csv")
)
names(headers) <- headers_names
headers
#> pl_pfile_20240304-20240310_fileheader
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/pl_pfile_20240304-20240310_fileheader.csv"
#> othername_pfile_20240304-20240310_fileheader
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/othername_pfile_20240304-20240310_fileheader.csv"
#> npidata_pfile_20240304-20240310_fileheader
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/npidata_pfile_20240304-20240310_fileheader.csv"
#> endpoint_pfile_20240304-20240310_fileheader
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/endpoint_pfile_20240304-20240310_fileheader.csv"
# Full paths of the extracted data CSVs (everything that is not a
# "fileheader" file), named by file name with the ".csv" text removed.
files_to_read <- stringr::str_c(
  test,
  stringr::str_subset(files, "fileheader", negate = TRUE),
  sep = "/"
)
files_to_read_names <- stringr::str_remove_all(
  basename(files_to_read),
  pattern = stringr::fixed(".csv")
)
names(files_to_read) <- files_to_read_names
files_to_read
#> pl_pfile_20240304-20240310
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/pl_pfile_20240304-20240310.csv"
#> othername_pfile_20240304-20240310
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/othername_pfile_20240304-20240310.csv"
#> npidata_pfile_20240304-20240310
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/npidata_pfile_20240304-20240310.csv"
#> endpoint_pfile_20240304-20240310
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/endpoint_pfile_20240304-20240310.csv"
# Read every extracted data file into a named list of tibbles, with ALL
# columns read as character.
#
# The compact spec `col_types = "c"` describes a single column only; the
# remaining columns were type-guessed, which is what produced the parsing
# warning and the <lgl> `replacement_npi` column in the recorded output.
# An explicit character default avoids guessing altogether.
# NOTE: the `#>` output recorded in this file predates this change.
nppes <- files_to_read |>
  purrr::map(
    readr::read_csv,
    col_types = readr::cols(.default = readr::col_character())
  ) |>
  # Kept as a safeguard; a no-op once every column is already character.
  purrr::map(df2chr)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#> dat <- vroom(...)
#> problems(dat)
# Read the "fileheader" CSVs into a named list of tibbles and normalise
# their column names with janitor::clean_names().
#
# `col_types = "c"` is a compact spec covering only one column, leaving the
# rest to type guessing; an explicit character default reads every column
# as character, consistently across files.
nppes_headers <- headers |>
  purrr::map(
    readr::read_csv,
    col_types = readr::cols(.default = readr::col_character())
  ) |>
  purrr::map(
    janitor::clean_names
  )
# Preview the secondary practice location table with cleaned column names.
janitor::clean_names(nppes[["pl_pfile_20240304-20240310"]])
#> # A tibble: 6,557 × 10
#> npi provider_secondary_p…¹ provider_secondary_p…² provider_secondary_p…³
#> <chr> <chr> <chr> <chr>
#> 1 1578536… 34800 Bob Wilson Dr NMCSD, ATTN: MEDICAL San Diego
#> 2 1538926… 2533 Placid Pl <NA> Virginia Beach
#> 3 1447017… 2674 Clarendon Ave Ap… <NA> Huntington Park
#> 4 1346007… 801 Skokie Blvd Ste 1… <NA> Northbrook
#> 5 1598522… 1014 Sassafras St <NA> Willow Springs
#> 6 1316704… 7785 Sierra Dr <NA> Granite Bay
#> 7 1902663… 1610 Woods Ct <NA> Hood River
#> 8 1831956… 643 Skyview Ln <NA> Narrows
#> 9 1518724… 3026 NW Canyon Dr <NA> Redmond
#> 10 1194582… 3201 Wilshire Blvd St… <NA> Santa Monica
#> # ℹ 6,547 more rows
#> # ℹ abbreviated names:
#> # ¹provider_secondary_practice_location_address_address_line_1,
#> # ²provider_secondary_practice_location_address_address_line_2,
#> # ³provider_secondary_practice_location_address_city_name
#> # ℹ 6 more variables:
#> # provider_secondary_practice_location_address_state_name <chr>, …
# Preview the endpoint table with cleaned column names.
janitor::clean_names(nppes[["endpoint_pfile_20240304-20240310"]])
#> # A tibble: 2,015 × 19
#> npi endpoint_type endpoint_type_description endpoint affiliation
#> <chr> <chr> <chr> <chr> <chr>
#> 1 1114671492 CONNECT CONNECT URL www.synergyte… Y
#> 2 1114671492 CONNECT CONNECT URL https://www.s… N
#> 3 1154188597 OTHERS Other URL MaximCareMobi… N
#> 4 1033891411 DIRECT Direct Messaging Address lcase227604@d… N
#> 5 1770251225 OTHERS Other URL gentle.profes… N
#> 6 1831254705 DIRECT Direct Messaging Address jreilly11322@… N
#> 7 1831783372 FHIR FHIR URL https://EpicF… N
#> 8 1831783372 DIRECT Direct Messaging Address cschreiter257… N
#> 9 1740047034 OTHERS Other URL Springfield N
#> 10 1679147623 DIRECT Direct Messaging Address hbazzy1358174… N
#> # ℹ 2,005 more rows
#> # ℹ 14 more variables: endpoint_description <chr>,
#> # affiliation_legal_business_name <chr>, use_code <chr>,
#> # use_description <chr>, other_use_description <chr>, content_type <chr>,
#> # content_description <chr>, other_content_description <chr>,
#> # affiliation_address_line_one <chr>, affiliation_address_line_two <chr>,
#> # affiliation_address_city <chr>, affiliation_address_state <chr>, …
# Preview the other-organization-name table with cleaned column names.
janitor::clean_names(nppes[["othername_pfile_20240304-20240310"]])
#> # A tibble: 2,663 × 3
#> npi provider_other_organization_name provider_other_organization…¹
#> <chr> <chr> <chr>
#> 1 1083471452 Parsley Medical Group of NE 3
#> 2 1114671492 21 Century Works LLC 3
#> 3 1114671492 Synergy Telemed 3
#> 4 1114671492 21 Century Works LLC 4
#> 5 1174380547 Parsley Medical Group of MT 3
#> 6 1205693660 Parsley Medical Group of KY 3
#> 7 1215794664 KwikRx Pharmacy 3
#> 8 1245091917 Southern Om Telepsychiatric Services 3
#> 9 1245097609 Our bags of Love Foundation 3
#> 10 1265299614 Solina Counseling and Wellness 3
#> # ℹ 2,653 more rows
#> # ℹ abbreviated name: ¹provider_other_organization_name_type_code
# Preview the main NPI data table with cleaned column names.
janitor::clean_names(nppes[["npidata_pfile_20240304-20240310"]])
#> # A tibble: 24,790 × 330
#> npi entity_type_code replacement_npi employer_identification_number_…¹
#> <chr> <chr> <lgl> <chr>
#> 1 1083265771 1 NA <NA>
#> 2 1194596031 1 NA <NA>
#> 3 1548027030 2 NA <UNAVAIL>
#> 4 1275399628 1 NA <NA>
#> 5 1871358960 1 NA <NA>
#> 6 1134826803 1 NA <NA>
#> 7 1477318152 1 NA <NA>
#> 8 1114671492 2 NA <UNAVAIL>
#> 9 1306624853 <NA> NA <NA>
#> 10 1811410921 1 NA <NA>
#> # ℹ 24,780 more rows
#> # ℹ abbreviated name: ¹employer_identification_number_ein
#> # ℹ 326 more variables: provider_organization_name_legal_business_name <chr>,
#> # provider_last_name_legal_name <chr>, provider_first_name <chr>,
#> # provider_middle_name <chr>, provider_name_prefix_text <chr>,
#> # provider_name_suffix_text <chr>, provider_credential_text <chr>,
#> # provider_other_organization_name <chr>, …
# Clean up: remove the scratch directory and everything written into it.
fs::dir_delete(path = test)
Created on 2024-03-30 with reprex v2.1.0
Look into maestro for automated pipelines
Ask:
Grab: