andrewallenbruce / nppez

Regularly Dispense NPPES Registry Data
https://andrewallenbruce.github.io/nppez/
Creative Commons Attribution 4.0 International
2 stars 2 forks source link

Download Steps #1

Open andrewallenbruce opened 8 months ago

andrewallenbruce commented 8 months ago
  1. Ask: List files for download
  2. Grab: Download selected files
  3. Peek: Inspect downloaded zips for selection
  4. Prune: Choose files to unzip
  5. Peel: Unzip chosen files
andrewallenbruce commented 8 months ago
library(nppez)
library(fs)
library(tidyverse)
library(janitor)

# Convert every numeric column of `df` to character.
#
# df: a data frame (or tibble).
# Returns `df` with numeric columns replaced by their character form;
# non-numeric columns are passed through unchanged.
df2chr <- function(df) {
  dplyr::mutate(
    df,
    dplyr::across(dplyr::where(is.numeric), \(col) as.character(col))
  )
}

# Scratch directory used by the whole walkthrough: <working dir>/inst/tmp.
test <- fs::path(fs::path_wd(), "inst", "tmp")

# Make sure the directory exists before anything is written to it.
fs::dir_create(test)

# Show the resolved location.
test
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp

# Step 1 - Ask: list the files available for download.
# NOTE(review): `save = TRUE` presumably writes the listing under `path`;
# confirm against the nppez::ask() documentation.
x <- nppez::ask(save = TRUE, path = test)
#> Download Time: 0.5 sec elapsed

# Inspect the returned listing.
x
#> # A tibble: 5 × 4
#>   file                                              url       date          size
#>   <chr>                                             <chr>     <date>     <fs::b>
#> 1 NPPES_Data_Dissemination_March_2024.zip           https://… 2024-03-11 935.96M
#> 2 NPPES_Deactivated_NPI_Report_031124.zip           https://… 2024-03-11   1.96M
#> 3 NPPES_Data_Dissemination_030424_031024_Weekly.zip https://… 2024-03-04   3.83M
#> 4 NPPES_Data_Dissemination_031124_031724_Weekly.zip https://… 2024-03-11    3.8M
#> 5 NPPES_Data_Dissemination_031824_032424_Weekly.zip https://… 2024-03-18   4.31M

# Step 2 - Grab: download the selected file(s) from the listing `x`
# into the scratch directory.
y <- nppez::grab(
  obj = x,
  files = "NPPES_Data_Dissemination_030424_031024_Weekly.zip",
  path = test
)
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp
#> ├── NPPES_Data_Dissemination_030424_031024_Weekly.zip
#> ├── NPPES_Data_Dissemination_March_2024.csv
#> └── NPPES_Download_Log_2024-03-30.csv

# Inspect what grab() returned.
y
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/NPPES_Data_Dissemination_030424_031024_Weekly.zip
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/NPPES_Data_Dissemination_March_2024.csv
#> C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/NPPES_Download_Log_2024-03-30.csv

# Step 3 - Peek: inspect the contents of the downloaded zip(s) in `test`.
z <- nppez::peek(path = test)

# One element per zip file, each a table of the archive's members.
z
#> $NPPES_Data_Dissemination_030424_031024_Weekly.zip
#> # A tibble: 10 × 4
#>    zipfile                                      filename compressed uncompressed
#>    <chr>                                        <chr>    <fs::byte>  <fs::bytes>
#>  1 NPPES_Data_Dissemination_030424_031024_Week… pl_pfil…    202.92K       616.5K
#>  2 NPPES_Data_Dissemination_030424_031024_Week… pl_pfil…        160          578
#>  3 NPPES_Data_Dissemination_030424_031024_Week… otherna…     41.15K       119.9K
#>  4 NPPES_Data_Dissemination_030424_031024_Week… otherna…         53           86
#>  5 NPPES_Data_Dissemination_030424_031024_Week… npidata…      2.61M        28.5M
#>  6 NPPES_Data_Dissemination_030424_031024_Week… npidata…      1.35K          12K
#>  7 NPPES_Data_Dissemination_030424_031024_Week… endpoin…     75.46K         401K
#>  8 NPPES_Data_Dissemination_030424_031024_Week… endpoin…        154          431
#>  9 NPPES_Data_Dissemination_030424_031024_Week… NPPES_D…    459.91K       556.2K
#> 10 NPPES_Data_Dissemination_030424_031024_Week… NPPES_D…    460.77K       543.7K

# Step 4 - Prune: keep only the CSV members of the weekly archive.
# The dot is escaped and the pattern anchored to the end of the name: the
# previous pattern ".csv" was a regex in which "." matches any character,
# so it could also select non-CSV members whose name merely contains "csv".
files <- z$NPPES_Data_Dissemination_030424_031024_Weekly.zip |>
  dplyr::filter(stringr::str_detect(filename, "\\.csv$")) |>
  dplyr::pull(filename)

# Step 5 - Peel: extract only the chosen members into the scratch directory.
# `y` (from nppez::grab()) lists every file in `test`, including CSVs, so
# the archive is selected by file name rather than by assuming it is `y[1]`.
zip::unzip(
  zipfile = y[
    basename(y) == "NPPES_Data_Dissemination_030424_031024_Weekly.zip"
  ],
  exdir = test,
  files = files
)

# Full paths (inside `test`) of the extracted members whose name contains
# "fileheader".
headers <- stringr::str_c(
  test,
  stringr::str_subset(files, "fileheader"),
  sep = "/"
)

# Name each path after its file, with the literal ".csv" stripped.
headers_names <- stringr::str_remove_all(
  basename(headers),
  pattern = stringr::fixed(".csv")
)
headers <- stats::setNames(headers, headers_names)

headers
#>                                                                                                    pl_pfile_20240304-20240310_fileheader 
#>        "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/pl_pfile_20240304-20240310_fileheader.csv" 
#>                                                                                             othername_pfile_20240304-20240310_fileheader 
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/othername_pfile_20240304-20240310_fileheader.csv" 
#>                                                                                               npidata_pfile_20240304-20240310_fileheader 
#>   "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/npidata_pfile_20240304-20240310_fileheader.csv" 
#>                                                                                              endpoint_pfile_20240304-20240310_fileheader 
#>  "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/endpoint_pfile_20240304-20240310_fileheader.csv"

# Full paths (inside `test`) of the extracted members that are NOT
# "fileheader" files.
files_to_read <- stringr::str_c(
  test,
  stringr::str_subset(files, "fileheader", negate = TRUE),
  sep = "/"
)

# Name each path after its file, with the literal ".csv" stripped.
files_to_read_names <- stringr::str_remove_all(
  basename(files_to_read),
  pattern = stringr::fixed(".csv")
)
files_to_read <- stats::setNames(files_to_read, files_to_read_names)

files_to_read
#>                                                                                                    pl_pfile_20240304-20240310 
#>        "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/pl_pfile_20240304-20240310.csv" 
#>                                                                                             othername_pfile_20240304-20240310 
#> "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/othername_pfile_20240304-20240310.csv" 
#>                                                                                               npidata_pfile_20240304-20240310 
#>   "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/npidata_pfile_20240304-20240310.csv" 
#>                                                                                              endpoint_pfile_20240304-20240310 
#>  "C:/Users/Andrew/AppData/Local/Temp/RtmpKemsb1/reprex-1f2cf623c7a-cushy-kitty/inst/tmp/endpoint_pfile_20240304-20240310.csv"

# Read every data file into a named list of tibbles, with ALL columns as
# character.
# The compact spec `col_types = "c"` only describes the first column, so the
# remaining columns were type-guessed (the captured run shows
# `replacement_npi` as <lgl>). An explicit default column type removes the
# guessing.
# df2chr() is retained; with an all-character spec it has nothing left to
# convert.
nppes <- files_to_read |>
  purrr::map(
    readr::read_csv,
    col_types = readr::cols(.default = readr::col_character())
  ) |>
  purrr::map(df2chr)
# NOTE(review): the warning below was captured with the original
# `col_types = "c"`; re-render the reprex to refresh this output.
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)

# Read every "fileheader" file into a named list of tibbles and normalise
# the column names with janitor::clean_names().
# Uses an explicit all-character default instead of the compact spec
# `col_types = "c"`, which only describes the first column; this keeps the
# header files consistent with how the data files are read.
nppes_headers <- headers |>
  purrr::map(
    readr::read_csv,
    col_types = readr::cols(.default = readr::col_character())
  ) |>
  purrr::map(janitor::clean_names)

# Preview the "pl_pfile" table with cleaned column names.
janitor::clean_names(nppes[["pl_pfile_20240304-20240310"]])
#> # A tibble: 6,557 × 10
#>    npi      provider_secondary_p…¹ provider_secondary_p…² provider_secondary_p…³
#>    <chr>    <chr>                  <chr>                  <chr>                 
#>  1 1578536… 34800 Bob Wilson Dr    NMCSD, ATTN: MEDICAL   San Diego             
#>  2 1538926… 2533 Placid Pl         <NA>                   Virginia Beach        
#>  3 1447017… 2674 Clarendon Ave Ap… <NA>                   Huntington Park       
#>  4 1346007… 801 Skokie Blvd Ste 1… <NA>                   Northbrook            
#>  5 1598522… 1014 Sassafras St      <NA>                   Willow Springs        
#>  6 1316704… 7785 Sierra Dr         <NA>                   Granite Bay           
#>  7 1902663… 1610 Woods Ct          <NA>                   Hood River            
#>  8 1831956… 643 Skyview Ln         <NA>                   Narrows               
#>  9 1518724… 3026 NW Canyon Dr      <NA>                   Redmond               
#> 10 1194582… 3201 Wilshire Blvd St… <NA>                   Santa Monica          
#> # ℹ 6,547 more rows
#> # ℹ abbreviated names:
#> #   ¹​provider_secondary_practice_location_address_address_line_1,
#> #   ²​provider_secondary_practice_location_address_address_line_2,
#> #   ³​provider_secondary_practice_location_address_city_name
#> # ℹ 6 more variables:
#> #   provider_secondary_practice_location_address_state_name <chr>, …

# Preview the "endpoint_pfile" table with cleaned column names.
janitor::clean_names(nppes[["endpoint_pfile_20240304-20240310"]])
#> # A tibble: 2,015 × 19
#>    npi        endpoint_type endpoint_type_description endpoint       affiliation
#>    <chr>      <chr>         <chr>                     <chr>          <chr>      
#>  1 1114671492 CONNECT       CONNECT URL               www.synergyte… Y          
#>  2 1114671492 CONNECT       CONNECT URL               https://www.s… N          
#>  3 1154188597 OTHERS        Other URL                 MaximCareMobi… N          
#>  4 1033891411 DIRECT        Direct Messaging Address  lcase227604@d… N          
#>  5 1770251225 OTHERS        Other URL                 gentle.profes… N          
#>  6 1831254705 DIRECT        Direct Messaging Address  jreilly11322@… N          
#>  7 1831783372 FHIR          FHIR URL                  https://EpicF… N          
#>  8 1831783372 DIRECT        Direct Messaging Address  cschreiter257… N          
#>  9 1740047034 OTHERS        Other URL                 Springfield    N          
#> 10 1679147623 DIRECT        Direct Messaging Address  hbazzy1358174… N          
#> # ℹ 2,005 more rows
#> # ℹ 14 more variables: endpoint_description <chr>,
#> #   affiliation_legal_business_name <chr>, use_code <chr>,
#> #   use_description <chr>, other_use_description <chr>, content_type <chr>,
#> #   content_description <chr>, other_content_description <chr>,
#> #   affiliation_address_line_one <chr>, affiliation_address_line_two <chr>,
#> #   affiliation_address_city <chr>, affiliation_address_state <chr>, …

# Preview the "othername_pfile" table with cleaned column names.
janitor::clean_names(nppes[["othername_pfile_20240304-20240310"]])
#> # A tibble: 2,663 × 3
#>    npi        provider_other_organization_name     provider_other_organization…¹
#>    <chr>      <chr>                                <chr>                        
#>  1 1083471452 Parsley Medical Group of NE          3                            
#>  2 1114671492 21 Century Works LLC                 3                            
#>  3 1114671492 Synergy Telemed                      3                            
#>  4 1114671492 21 Century Works LLC                 4                            
#>  5 1174380547 Parsley Medical Group of MT          3                            
#>  6 1205693660 Parsley Medical Group of KY          3                            
#>  7 1215794664 KwikRx Pharmacy                      3                            
#>  8 1245091917 Southern Om Telepsychiatric Services 3                            
#>  9 1245097609 Our bags of Love Foundation          3                            
#> 10 1265299614 Solina Counseling and Wellness       3                            
#> # ℹ 2,653 more rows
#> # ℹ abbreviated name: ¹​provider_other_organization_name_type_code

# Preview the "npidata_pfile" table with cleaned column names.
janitor::clean_names(nppes[["npidata_pfile_20240304-20240310"]])
#> # A tibble: 24,790 × 330
#>    npi        entity_type_code replacement_npi employer_identification_number_…¹
#>    <chr>      <chr>            <lgl>           <chr>                            
#>  1 1083265771 1                NA              <NA>                             
#>  2 1194596031 1                NA              <NA>                             
#>  3 1548027030 2                NA              <UNAVAIL>                        
#>  4 1275399628 1                NA              <NA>                             
#>  5 1871358960 1                NA              <NA>                             
#>  6 1134826803 1                NA              <NA>                             
#>  7 1477318152 1                NA              <NA>                             
#>  8 1114671492 2                NA              <UNAVAIL>                        
#>  9 1306624853 <NA>             NA              <NA>                             
#> 10 1811410921 1                NA              <NA>                             
#> # ℹ 24,780 more rows
#> # ℹ abbreviated name: ¹​employer_identification_number_ein
#> # ℹ 326 more variables: provider_organization_name_legal_business_name <chr>,
#> #   provider_last_name_legal_name <chr>, provider_first_name <chr>,
#> #   provider_middle_name <chr>, provider_name_prefix_text <chr>,
#> #   provider_name_suffix_text <chr>, provider_credential_text <chr>,
#> #   provider_other_organization_name <chr>, …

# Clean up: delete the scratch directory created at the start of the script.
fs::dir_delete(path = test)

Created on 2024-03-30 with reprex v2.1.0

andrewallenbruce commented 8 months ago
andrewallenbruce commented 3 months ago

Look into the maestro package for automating pipelines.