tidyverse / haven

Read SPSS, Stata and SAS files from R
https://haven.tidyverse.org
Other
424 stars 117 forks source link

Progress Bar for XPT #730

Open muschellij2 opened 1 year ago

muschellij2 commented 1 year ago

Is it possible to have a progress bar for reading in files, specifically SAS XPORT files?

Here's an example of reading in a small file (6 MB) where everything works fine, but it would be helpful, if possible, to have a progress bar for larger files (such as https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/PAXMIN_G.XPT, which is 7.6 GB).

library(haven)
library(curl)
#> Using libcurl 7.79.1 with LibreSSL/3.3.6
# Download the small PAXDAY_G transport file to a temporary path and read it.
xpt_url <- "https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/PAXDAY_G.XPT"
xpt_path <- tempfile(fileext = ".xpt")
curl::curl_download(xpt_url, destfile = xpt_path)
haven::read_xpt(xpt_path)
#> # A tibble: 61,168 × 15
#>     SEQN PAXDAYD PAXDAYWD PAXSSNDP PAXMSTD    PAXTMD PAXAISMD PAXVMD PAXMTSD
#>    <dbl> <chr>   <chr>       <dbl> <chr>       <dbl>    <dbl>  <dbl>   <dbl>
#>  1 62161 1       7               0 "12:30:00"    690  1426946    690  4636. 
#>  2 62161 2       1         3312000 " 0:00:00"   1440  3123802   1440 12531. 
#>  3 62161 3       2        10224000 " 0:00:00"   1440  2779464   1440 14013. 
#>  4 62161 4       3        17136000 " 0:00:00"   1440  2724602   1440 16982. 
#>  5 62161 5       4        24048000 " 0:00:00"   1440  3144826   1440 11718. 
#>  6 62161 6       5        30960000 " 0:00:00"   1440  3001421   1440 16185. 
#>  7 62161 7       6        37872000 " 0:00:00"   1440  4079193   1440  7734. 
#>  8 62161 8       7        44784000 " 0:00:00"   1440  2798564   1440 12479. 
#>  9 62161 9       1        51696000 " 0:00:00"    760  3463791    760    44.8
#> 10 62163 1       7               0 "17:30:00"    390    33043    390  6106. 
#> # ℹ 61,158 more rows
#> # ℹ 6 more variables: PAXWWMD <dbl>, PAXSWMD <dbl>, PAXNWMD <dbl>,
#> #   PAXUMD <dbl>, PAXLXSD <dbl>, PAXQFD <dbl>

Created on 2023-08-15 with reprex v2.0.2

gorcha commented 1 year ago

Hi @muschellij2, thanks for the feature request.

Adding progress bars is a bit of work, since it would require callbacks from the ReadStat code, so I'm unlikely to look at this, at least in the short term. I'll keep this issue open in case the opportunity presents itself, though, and I'm always happy to review a PR 🙂

muschellij2 commented 1 week ago

Is it possible to get the number of rows from an xpt/sas7bdat/dta file, so that we can create a progress bar ourselves?

muschellij2 commented 1 week ago

I think this may be a quick function to determine the number of rows; then I can simply wrap it all using the progress package (https://cran.r-project.org/web/packages/progress/index.html).

library(haven)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

#' Find the number of rows in a file by bisecting on `skip`
#'
#' Repeatedly calls `func(file, n_max = , skip = )` and narrows an interval
#' `[min, max]`, where `min` is a `skip` value that still returned rows and
#' `max` is one that returned none. The interval starts at `[0, 1e30]`; once
#' the bounds are at most 1 apart, `max` is taken as the row count.
#'
#' @param file Passed unchanged as the first argument of `func`.
#' @param skip The first `skip` value to probe.
#' @param n_check Upper limit for `n_max` on each probe; it shrinks to the
#'   current interval width as the search narrows.
#' @param max_iter Iteration cap. When the counter reaches it, a warning is
#'   raised and the search stops before converging.
#' @param func Reader called as `func(file, n_max = , skip = )`; `nrow()` is
#'   applied to its result. Defaults to `read_dta`.
#' @return A list with elements `file` (the input), `skip_table` (a tibble
#'   logging every step, most recent first) and `nrow` (the row count, or
#'   `NA_real_` when the search was cut short by `max_iter`).
get_haven_nrow <- function(
    file, skip = 1e10,
    n_check = 1000L,
    max_iter = 1000L,
    func = read_dta) {
  # Number of rows returned when reading at most `n_max` rows after `skip`.
  count_rows <- function(skip, n_max) {
    nrow(func(file, n_max = n_max, skip = skip))
  }
  # Snapshot of the current search state; reads the enclosing variables at
  # call time, so it must be called after they have been updated.
  log_step <- function(nr) {
    dplyr::tibble(
      skip = skip, nr = nr,
      max = max_value, min = min_value,
      n_check = n_check
    )
  }

  min_value <- 0
  max_value <- 1e30

  nr <- count_rows(skip, n_check)
  # Collect the per-step rows in a list and bind once at the end instead of
  # re-binding the whole table on every iteration.
  steps <- list(log_step(nr))

  converged <- FALSE
  i <- 1
  repeat {
    # Plain local assignment: `<<-` would write to a global `i` and leave the
    # local counter stuck, disabling the `max_iter` guard.
    i <- i + 1
    if (i >= max_iter) {
      warning("Ending early, ", i, " iterations")
      break
    }

    # No rows at this offset means the file is at most `skip` rows long;
    # otherwise it is longer than `skip`.
    if (nr == 0) {
      max_value <- skip
    } else {
      min_value <- skip
    }
    diff <- max_value - min_value
    n_check <- min(n_check, diff)
    skip <- round(min_value + diff / 2)
    steps[[length(steps) + 1L]] <- log_step(nr)

    if (diff <= 1) {
      converged <- TRUE
      break
    }
    nr <- count_rows(skip, n_check)
  }

  # The lower bound 0 is assumed, not probed, so a result of 1 cannot be told
  # apart from an empty file without one explicit read at offset 0.
  if (converged && max_value == 1 && min_value == 0) {
    nr <- count_rows(0, 1)
    if (nr == 0) {
      max_value <- 0
      skip <- 0
      steps[[length(steps) + 1L]] <- log_step(nr)
    }
  }

  # Most recent step first, matching the order callers already see.
  skip_table <- dplyr::bind_rows(rev(steps))
  list(
    file = file,
    skip_table = skip_table,
    nrow = if (converged) skip_table$max[1] else NA_real_
  )
}

# Stata example: download a small .dta file and count its rows.
dta_path <- tempfile(fileext = ".dta")
download.file(
  "https://stats.idre.ucla.edu/stat/stata/dae/binary.dta",
  dta_path,
  mode = "wb"
)
res <- get_haven_nrow(dta_path)
res$nrow
#> [1] 400
res
#> $file
#> [1] "/var/folders/1s/wrtqcpxn685_zk570bnx9_rr0000gr/T//Rtmp83As8L/filefd201957e81d.dta"
#> 
#> $skip_table
#> # A tibble: 35 × 5
#>     skip    nr   max   min n_check
#>    <dbl> <int> <dbl> <dbl>   <dbl>
#>  1   400     1   400   399       1
#>  2   399     2   400   398       2
#>  3   398     4   400   396       4
#>  4   396     0   400   391       9
#>  5   400     9   410   391      19
#>  6   391     0   410   372      38
#>  7   410    28   447   372      75
#>  8   372     0   447   298     149
#>  9   447   102   596   298     298
#> 10   298     0   596     0     596
#> # ℹ 25 more rows
#> 
#> $nrow
#> [1] 400

# SAS example: same search, using read_sas as the reader.
sas_path <- tempfile(fileext = ".sas7bdat")
download.file(
  "https://stats.idre.ucla.edu/wp-content/uploads/2016/02/binary.sas7bdat",
  sas_path,
  mode = "wb"
)
res <- get_haven_nrow(sas_path, func = read_sas)
res$nrow
#> [1] 400
res
#> $file
#> [1] "/var/folders/1s/wrtqcpxn685_zk570bnx9_rr0000gr/T//Rtmp83As8L/filefd205a47d4ea.sas7bdat"
#> 
#> $skip_table
#> # A tibble: 35 × 5
#>     skip    nr   max   min n_check
#>    <dbl> <int> <dbl> <dbl>   <dbl>
#>  1   400     1   400   399       1
#>  2   399     2   400   398       2
#>  3   398     4   400   396       4
#>  4   396     0   400   391       9
#>  5   400     9   410   391      19
#>  6   391     0   410   372      38
#>  7   410    28   447   372      75
#>  8   372     0   447   298     149
#>  9   447   102   596   298     298
#> 10   298     0   596     0     596
#> # ℹ 25 more rows
#> 
#> $nrow
#> [1] 400

Created on 2024-11-05 with reprex v2.1.1

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.4.0 (2024-04-24) #> os macOS Sonoma 14.4.1 #> system x86_64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz America/New_York #> date 2024-11-05 #> pandoc 3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/x86_64/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0) #> digest 0.6.37 2024-08-19 [1] CRAN (R 4.4.1) #> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.0) #> evaluate 1.0.0 2024-09-17 [1] CRAN (R 4.4.1) #> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.4.0) #> fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0) #> forcats 1.0.0 2023-01-29 [1] CRAN (R 4.4.0) #> fs 1.6.4 2024-04-25 [1] CRAN (R 4.4.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.0) #> glue 1.8.0 2024-09-30 [1] CRAN (R 4.4.1) #> haven * 2.5.4 2023-11-30 [1] CRAN (R 4.4.0) #> hms 1.1.3 2023-03-21 [1] CRAN (R 4.4.0) #> htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0) #> knitr 1.48 2024-07-07 [1] CRAN (R 4.4.0) #> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.0) #> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.4.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0) #> readr 2.1.5 2024-01-10 [1] CRAN (R 4.4.0) #> reprex 2.1.1 2024-07-06 [1] CRAN (R 4.4.0) #> rlang 1.1.4 2024-06-04 [1] CRAN (R 4.4.0) #> rmarkdown 2.28 2024-08-17 [1] CRAN (R 4.4.1) #> rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.0) #> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0) #> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.4.0) #> tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0) #> tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.4.0) #> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.4.0) #> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.0) #> withr 3.0.0 
2024-01-16 [1] CRAN (R 4.4.0) #> xfun 0.47 2024-08-17 [1] CRAN (R 4.4.1) #> yaml 2.3.10 2024-07-26 [1] CRAN (R 4.4.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```