kosukeimai / fastLink

R package fastLink: Fast Probabilistic Record Linkage
272 stars 48 forks source link

R aborts during linkage of two files when n rows > 25k #86

Open wbakerrobinson opened 2 months ago

wbakerrobinson commented 2 months ago

I would like to link two files of size 50k rows. Whenever I try to link these files R aborts on me. I systematically sampled n records from each file starting at 10k. When n gets to 25k R starts aborting. Is anyone else having this problem with the most up to date versions of R, and fastLink?

My computer has 64 gb ram. Here is the session information:
R version 4.4.1 (2024-06-14 ucrt) Platform: x86_64-w64-mingw32/x64 Running under: Windows 10 x64 (build 19045)

Matrix products: default

locale: [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 LC_MONETARY=English_United States.utf8 [4] LC_NUMERIC=C LC_TIME=English_United States.utf8

time zone: America/Los_Angeles tzcode source: internal

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] tidycensus_1.6.5 stringr_1.5.1 fastLink_0.6.1 dplyr_1.1.4

loaded via a namespace (and not attached): [1] rappdirs_0.3.3 utf8_1.2.4 generics_0.1.3 tidyr_1.3.1 class_7.3-22 xml2_1.3.6 lpSolve_5.6.20
[8] KernSmooth_2.23-24 adagio_0.9.2 gtools_3.9.5 stringi_1.8.4 lattice_0.22-6 hms_1.1.3 magrittr_2.0.3
[15] grid_4.4.1 iterators_1.0.14 foreach_1.5.2 doParallel_1.0.17 jsonlite_1.8.8 Matrix_1.7-0 e1071_1.7-14
[22] DBI_1.2.3 httr_1.4.7 rvest_1.0.4 purrr_1.0.2 fansi_1.0.6 stringdist_0.9.12 codetools_0.2-20
[29] cli_3.6.3 rlang_1.1.4 tigris_2.1 crayon_1.5.3 units_0.8-5 plotrix_3.8-4 tools_4.4.1
[36] parallel_4.4.1 tzdb_0.4.0 uuid_1.2-1 vctrs_0.6.5 R6_2.5.1 proxy_0.4-27 lifecycle_1.0.4
[43] classInt_0.4-10 pkgconfig_2.0.3 pillar_1.9.0 data.table_1.16.0 glue_1.7.0 Rcpp_1.0.13 sf_1.0-17
[50] tibble_3.2.1 tidyselect_1.2.1 rstudioapi_0.16.0 readr_2.1.5 compiler_4.4.1

Here is a reproducible example:

# Rstudio abort repex
# WBR
# 9/12/2024

# Packages
library(dplyr)
library(fastLink)
library(stringr)
library(tidycensus)

# Session information
sessionInfo()

# get zip code data for sim
or_zip <- get_acs(geography = "zcta",
                  variables = c("B01001_001"),
                  year = 2020,
                  survery = "acs5",
                  geometry = FALSE) %>% 
  select(GEOID, estimate) %>% 
  rename(zip_code = GEOID,
         pop = estimate) %>% 
  filter(str_detect(zip_code, "^97") & pop > 0)

or_zip_tot <- sum(or_zip$pop)

or_zip <- mutate(or_zip, pop_pct = pop/or_zip_tot)

# Function to create sim data
# Goal isn't to realistically simulate my dataset just create something to show problem
sim_data <- function(n_sample = 25000)
{
  tibble(age = rnorm(n_sample, mean = 50, sd = 20),
         sex = rbinom(n_sample, size = 1, prob = 0.5),
                 zip_code = sample(or_zip$zip_code, size = n_sample, replace = TRUE, or_zip$pop_pct),
                 disp_cat = sample(c(1, 2, 3, 4), size = n_sample, replace = TRUE, c(0.00278, 0.00915, 0.8811, 0.10697)),
                 LOS = rnbinom(n_sample, size = 2, prob = 0.2),
                 hours_since = round(runif(n_sample, 0, 8760)),
                 hospital_id = "test hosp") %>% 
    mutate(age = as.numeric(if_else(age < 0, NA_integer_, floor(age))),
           sex = factor(sex, labels = c("Male", "Female")),
           disp_cat = factor(disp_cat,
                             labels = c("Admit", "Deceased", "Discharged", "Transferred")),
           LOS = as.numeric(LOS),
           hours_since = as.numeric(hours_since))
}

# Function to link
blocked_link_fx <- function(df_split_A, df_split_B)
{
  require(fs)
  require(fastLink)

  df_dim_A <- dim(df_split_A)
  df_dim_B <- dim(df_split_B)
  cat("Facility:", unique(df_split_A$hospital_id), "\n")
  cat("DF A Num rows:", df_dim_A[1], "Num cols:", df_dim_A[2], "\n")
  cat("DF B Num rows:", df_dim_B[1], "Num cols:", df_dim_B[2], "\n")

  tryCatch({
    link_result <- fastLink(df_split_A, df_split_B,
                            varnames = c("age", "sex", "zip_code",
                                         "disp_cat", "LOS", "hours_since"),
                            numeric.match = c("age", "LOS", "hours_since"),
                            partial.match = c("age", "LOS", "hours_since"),
                            cut.a.num = 0.5,
                            cut.p.num = 2)

    return(link_result)
  }, error = function(e) {
    message("Error in processing: ", e)
    return(NULL)
  })
}

# Create sample data n = 10000
n <- 10000
n_exact <- 3000
set.seed(3)
df_A <- sim_data(n)

set.seed(6)
df_B <- bind_rows(slice_sample(df_A, n = n_exact),
                  sim_data(n - n_exact))

# Test linkage w/ 10000
test_link1 <- blocked_link_fx(df_A, df_B)

# Create sample data n = 25000
n <- 25000
n_exact <- 3000

set.seed(3)
df_A <- sim_data(n)

set.seed(6)
df_B <- bind_rows(slice_sample(df_A, n = n_exact),
                  sim_data(n - n_exact))

# Test linkage w/ 25000
test_link2 <- blocked_link_fx(df_A, df_B)