bschneidr / fastsurvey

A fork of the `survey` R package, using {Rcpp}
8 stars 0 forks source link

Bug: `Mat::operator(): index out of bounds` #2

Closed bschneidr closed 2 years ago

bschneidr commented 2 years ago

Reproducible example:

suppressPackageStartupMessages({
  library(survey)
  library(fastsurvey)
})
data(api)

clusters<-table(apiclus2$dnum)
clusters<-clusters[clusters>1 & names(clusters)!="639"]
apiclus2a<-subset(apiclus2, dnum %in% as.numeric(names(clusters)))
dclus2<-svydesign(id=~dnum+snum, fpc=~fpc1+fpc2, data=apiclus2a)

dclus2 |>
  subset(dnum %in% names(clusters[1:10])) |>
  survey::svytotal(x = ~ api99)
#>        total     SE
#> api99 772239 235139
dclus2 |>
  subset(dnum %in% names(clusters[1:10])) |>
  fastsurvey::svytotal(x = ~ api99)
#> Error in arma_multistage(Y = as.matrix(x), samp_unit_ids = samp_unit_ids, : Mat::operator(): index out of bounds

Created on 2022-05-06 by the reprex package (v2.0.0)

Debugging of just fastsurvey::arma_multistage():

x <- structure(c(-0.917252066115703, 1.53729338842975, 0.878202479338843, 
            -5.46270661157025, -4.05361570247934, -1.89452479338843, -1.98543388429752, 
            -2.68997933884298, -2.03088842975207, 5.44638429752066, 4.90092975206612, 
            4.56002066115702, 5.31002066115702, -2.96270661157025, -4.37179752066116, 
            -2.34907024793388, -2.9172520661157, -7.21795454545455, -4.96795454545455, 
            -7.66795454545455, 5.13204545454545, -5.91795454545455, -4.57634297520661, 
            -4.30361570247934, -0.167252066115703, 0.673657024793388, 0.241838842975206, 
            -3.62179752066116, -0.0763429752066119, 5.10547520661157, 5.35547520661157, 
            5.5827479338843, 4.96911157024793, 4.56002066115702, 4.12820247933884, 
            4.03729338842975, 3.90092975206612, 3.8327479338843), .Dim = c(38L, 
                                                                           1L), .Dimnames = list(NULL, "api99"))
samp_unit_ids <- structure(c(83, 83, 83, 132, 132, 132, 152, 152, 152, 173, 173, 
            173, 173, 198, 198, 198, 198, 200, 200, 200, 200, 200, 228, 228, 
            295, 295, 295, 295, 295, 302, 302, 302, 302, 403, 403, 403, 403, 
            403, 111, 110, 109, 1, 3, 2, 4, 5, 6, 10, 8, 9, 7, 14, 11, 13, 
            12, 18, 19, 16, 17, 15, 21, 20, 26, 22, 25, 23, 24, 27, 28, 29, 
            30, 31, 32, 34, 35, 33), .Dim = c(38L, 2L), .Dimnames = list(
              NULL, c("init", "")))
strata_ids <- structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                          29, 29, 29, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 
                          5, 5, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9),
                        .Dim = c(38L, 
                                 2L), .Dimnames = list(NULL, c("init", "")))
strata_samp_sizes <- structure(c(29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 
                                 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 
                                 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 29L, 
                                 29L, 29L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 
                                 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 2L, 2L, 5L, 5L, 5L, 5L, 5L, 
                                 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L), .Dim = c(38L, 2L))
strata_pop_sizes <- structure(c(757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 
                                757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 
                                757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 757, 
                                757, 757, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 
                                11, 11, 11, 11, 11, 2, 2, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 
                                5, 5), .Dim = c(38L, 2L),
                              .Dimnames = list(c("3", "4", "5", "7", 
                                                 "8", "9", "10", "11", "12", "13", "14", "15", "16", "18", "19", 
                                                 "20", "21", "22", "23", "24", "25", "26", "27", "28", "30", "31", 
                                                 "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", 
                                                 "43"), c("fpc1", "fpc2")))
singleton_method <- 'fail'
use_only_first_stage = FALSE

fastsurvey::arma_multistage(
  Y = x, samp_unit_ids = samp_unit_ids, strata_ids = strata_ids,
  strata_samp_sizes = strata_samp_sizes, strata_pop_sizes = strata_pop_sizes,
  singleton_method = singleton_method, use_only_first_stage = use_only_first_stage
)
#> Error in fastsurvey::arma_multistage(Y = x, samp_unit_ids = samp_unit_ids, : Mat::operator(): index out of bounds

fastsurvey::arma_multistage(
  Y = x, samp_unit_ids = samp_unit_ids, strata_ids = strata_ids,
  strata_samp_sizes = strata_samp_sizes, strata_pop_sizes = strata_pop_sizes,
  singleton_method = singleton_method, use_only_first_stage = use_only_first_stage
)
#> Error in fastsurvey::arma_multistage(Y = x, samp_unit_ids = samp_unit_ids, : Mat::operator(): index out of bounds

# Works if only using first stage
fastsurvey::arma_multistage(
  Y = x, samp_unit_ids = samp_unit_ids, strata_ids = strata_ids,
  strata_samp_sizes = strata_samp_sizes, strata_pop_sizes = strata_pop_sizes,
  singleton_method = singleton_method,
  use_only_first_stage = TRUE
)
#>          [,1]
#> [1,] 2110.225

Created on 2022-05-06 by the reprex package (v2.0.0)

bschneidr commented 2 years ago

The issue was that the loop over first-stage units was using a loop based on the total number of PSUs in the design, not the total number of PSUs in the current subset of data.

      arma::colvec h_first_stage_units = first_stage_ids.elem(h_indices);
      arma::colvec h_unique_psus = unique(h_first_stage_units);
      arma::uword n_h = min(first_stage_strata_samp_sizes.elem(h_indices));
      double N_h = static_cast<double>(min(strata_pop_sizes.elem(h_indices)));

      double f_h;
      if (arma::is_finite(N_h)) {
        f_h = static_cast<double>(n_h) /  N_h;
      } else {
        f_h = 0.0;
      }

      for (arma::uword i=0; i < n_h; ++i ) {

Fixed by changing the loop to the following:

      // Get list of PSUs in the current subset of data, and count them
      arma::colvec h_first_stage_units = first_stage_ids.elem(h_indices);
      arma::colvec h_unique_first_stage_units = unique(h_first_stage_units);
      arma::uword n_h_subset = h_unique_first_stage_units.n_elem;

      for (arma::uword i=0; i < n_h_subset; ++i ) {