tmatta / lsasim

Simulate large scale assessment data
6 stars 5 forks source link

cor_matrix & rho #19

Closed wleoncio closed 3 years ago

wleoncio commented 3 years ago

0. Setup

I've tested most values below. Not all testings are shown in this report. I only included the testings that are showing errors/warnings or inconsistent results.

cluster_gen_2 <- function(...) {
  cluster_gen(..., verbose = FALSE, calc_weights = FALSE)
}
set.seed(12334)
n1 <- c(3, 6)
n2 <- c(groups = 4, people = 2)
n3 <- c(school = 3, class = 2, student = 5)
n4 <- c(20, 50)
n5 <- list(school = 3, class = c(2, 1, 3), student = c(20, 20, 10, 30, 30, 30))
n5a <- list(school = 3, class = c(2, 3, 3), student = c(20, 20, 10, 30, 30, 30))
n6 <- list(school = 3, class = c(2, 1, 3), student = ranges(10, 50))
n6a <- list(school = 3, class = c(2, 3, 3), student = ranges(10, 50))
n7 <- list(school = 10, student = ranges(10, 50))
n8 <- list(school = 3, student = c(20, 20, 10))
n8a <- list(school = 3, class = c(2, 2, 2),student = c(20, 20, 10))
n8b <- list(school = 3, class = c(2, 3, 3),student = c(20, 20, 10, 5))
n8c <- list(school = 3, class = c(2, 1, 3),student = c(20, 20, 10))
n9 <- list(school = 10, class = c(2,1,3,1,1,1,2,1,2,1), student = ranges(10, 50))
n10 <- list(country = 2, school = 10, class = c(2,1,3,1,1,1,2,1,2,1), student = ranges(10, 50))
n11 <- list(culture = 2, country = 2, school = 10, class = c(2,1,3,1,1,1,2,1,2,1), student = ranges(10, 50))
n12 <- list(culture = 2, country = 2, district = 3, school = 10, class = c(2,1,3,1,1,1,2,1,2,1), student = ranges(10, 50))
N1 <- c(100, 20)

9. cor_matrix & rho

Error and warning messages

# m1 <- matrix(c(1, 0.2, 0.3, 0.4,
#                0.2, 1, 0.5, 0.7,
#                0.3, 0.5, 1, 0.8,
#                0.4, 0.7, 0.8, 1), 4, 4)
# m2 <- matrix(c(1, 0.5, 0.6,
#                0.5, 1, 0.9,
#                0.6, 0.9, 1), 3, 3)
# m3 <- matrix(c(1, 0.55, 0.77,
#                0.55, 1, 0.33,
#                0.77, 0.33, 1), 3, 3)
# m4 <- matrix(c(1, 0.55,
#                0.55, 1), 2, 2)
set.seed(12334)
cr1 <- cluster_gen_2(n7, n_X=2, n_W=2, cor_matrix=m1, rho=c(0.1, 0.2))
summarize_clusters(cr1)
## Summary statistics for all schools
##        q1                 q2          q3       q4
##  Min.   :-4.14411   Min.   :-6.1643   1:8895   1:8654
##  Mean   :-0.04916   Mean   :-0.2228   2:5096   2:8668
##  Max.   : 4.33602   Max.   : 5.6062   3:6864   3:3709
##                                       4:1869   4:3652
##  Stddev.: 1.03      Stddev.: 1.18     5:4008   5:2049
##
##                                       Prop.    Prop.
##                                       1:0.333  1:0.324
##                                       2:0.191  2:0.324
##                                       3:0.257  3:0.139
##                                       4:0.07   4:0.137
##                                       5:0.15   5:0.077
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2        q3        q4
## q1 1.0000000 0.1571716 0.2417537 0.2573320
## q2 0.1571716 1.0000000 0.4012934 0.5134506
## q3 0.2417537 0.4012934 1.0000000 0.4682713
## q4 0.2573320 0.5134506 0.4682713 1.0000000
#          q1        q2        q3        q4
#q1 1.0000000 0.1571716 0.2417537 0.2573320
#q2 0.1571716 1.0000000 0.4012934 0.5134506
#q3 0.2417537 0.4012934 1.0000000 0.4682713
#q4 0.2573320 0.5134506 0.4682713 1.0000000
anova_table(cr1) #ICC: q1 0.055, q2, 0.238
## Warning in anova_table(cr1): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance       1.01410490          1.01410490
## 2 Between-group variance       0.05999524          0.05961035
## 3         Total variance       1.06698575                  NA
##
## Intraclass correlation
##     Estimated Standard.error
## q1 0.05551784             NA
##
## Testing for group differences
## F-statistic: 155.8775 on 9 and 26722 DF. p-value:  8.16664e-289
## Warning in anova_table(cr1): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q2
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        1.0853044           1.0853044
## 2 Between-group variance        0.3406284           0.3402164
## 3         Total variance        1.3871134                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q2 0.2386611             NA
##
## Testing for group differences
## F-statistic: 826.949 on 9 and 26722 DF. p-value:  0
set.seed(12334)
cr2 <- cluster_gen_2(n7, n_X=1, n_W=2, cor_matrix=m2, rho=0.7)
summarize_clusters(cr2)
## Summary statistics for all schools
##        q1          q2       q3
##  Min.   :-6.1056   1:6820   1:9170
##  Mean   :-0.4056   2:7845   2:7171
##  Max.   : 4.9150   3:6612   3:6295
##                    4:2664   4:4096
##  Stddev.: 1.43     5:2791
##                             Prop.
##                    Prop.    1:0.343
##                    1:0.255  2:0.268
##                    2:0.293  3:0.235
##                    3:0.247  4:0.153
##                    4:0.1
##                    5:0.104
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2        q3
## q1 1.0000000 0.3517538 0.2260297
## q2 0.3517538 1.0000000 0.6761723
## q3 0.2260297 0.6761723 1.0000000
#           q1        q2        q3
# q1 1.0000000 0.3517538 0.2260297
# q2 0.3517538 1.0000000 0.6761723
# q3 0.2260297 0.6761723 1.0000000

anova_table(cr2) #0.7802965
## Warning in anova_table(cr2): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance         1.032129            1.032129
## 2 Between-group variance         1.127728            1.127336
## 3         Total variance         2.032199                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q1 0.5220442             NA
##
## Testing for group differences
## F-statistic: 2878.856 on 9 and 26722 DF. p-value:  0
#Error: SE not yet implemented for different sample sizes
#ANOVA table for schools, q1

set.seed(12334)
cr3 <- cluster_gen_2(n7, n_X=2, n_W=1, cor_matrix=m3, rho=c(0.3, 0.4))
summarize_clusters(cr3)
## Summary statistics for all schools
##        q1                q2          q3
##  Min.   :-4.0813   Min.   :-5.1206   1:7886
##  Mean   : 0.3268   Mean   :-0.2999   2:7741
##  Max.   : 4.5985   Max.   : 4.4999   3:5253
##                                      4:3717
##  Stddev.: 1.12     Stddev.: 1.31     5:2135
##
##                                      Prop.
##                                      1:0.295
##                                      2:0.29
##                                      3:0.197
##                                      4:0.139
##                                      5:0.08
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2        q3
## q1 1.0000000 0.3884013 0.6065154
## q2 0.3884013 1.0000000 0.2371039
## q3 0.6065154 0.2371039 1.0000000
#           q1        q2        q3
# q1 1.0000000 0.3884013 0.6065154
# q2 0.3884013 1.0000000 0.2371039
# q3 0.6065154 0.2371039 1.0000000

anova_table(cr3)
## Warning in anova_table(cr3): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        1.0197526           1.0197526
## 2 Between-group variance        0.2703212           0.2699342
## 3         Total variance        1.2592135                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q1 0.2093021             NA
##
## Testing for group differences
## F-statistic: 698.4491 on 9 and 26722 DF. p-value:  0
## Warning in anova_table(cr3): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q2
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        1.0909545           1.0909545
## 2 Between-group variance        0.7160194           0.7156053
## 3         Total variance        1.7257741                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q2 0.3961149             NA
##
## Testing for group differences
## F-statistic: 1729.289 on 9 and 26722 DF. p-value:  0
set.seed(12334)
cr4 <- cluster_gen_2(n7, n_X=2, n_W=0, cor_matrix=m4, rho=c(0.15, 0.25))
summarize_clusters(cr4) #0.4162775
## Summary statistics for all schools
##        q1                 q2
##  Min.   :-4.41397   Min.   :-4.8355
##  Mean   : 0.05774   Mean   : 0.2351
##  Max.   : 4.70216   Max.   : 5.0581
##
##  Stddev.: 1.08      Stddev.: 1.33
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2
## q1 1.0000000 0.4162775
## q2 0.4162775 1.0000000
anova_table(cr4)
## Warning in anova_table(cr4): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        1.0157789           1.0157789
## 2 Between-group variance        0.1752731           0.1748876
## 3         Total variance        1.1709232                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q1 0.1468821             NA
##
## Testing for group differences
## F-statistic: 454.6378 on 9 and 26722 DF. p-value:  0
## Warning in anova_table(cr4): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q2
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        1.0828475           1.0828475
## 2 Between-group variance        0.7684375           0.7680265
## 3         Total variance        1.7641705                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q2 0.4149534             NA
##
## Testing for group differences
## F-statistic: 1869.781 on 9 and 26722 DF. p-value:  0
set.seed(12334)
cr5 <- cluster_gen_2(n7, n_X=1, n_W=1, cor_matrix=m4, rho=0.25)
summarize_clusters(cr5) #0.3119007 , if not setting the seed as 12334, the cor_matrix is 0.3675987
## Summary statistics for all schools
##        q1          q2
##  Min.   :-3.9530   1:13718
##  Mean   : 0.1926   2: 5265
##  Max.   : 4.4597   3: 4275
##                    4: 3152
##  Stddev.: 1.2      5:  322
##
##                    Prop.
##                    1:0.513
##                    2:0.197
##                    3:0.16
##                    4:0.118
##                    5:0.012
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2
## q1 1.0000000 0.3119007
## q2 0.3119007 1.0000000
anova_table(cr5) #rho: 0.3240467
## Warning in anova_table(cr5): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        1.0072533           1.0072533
## 2 Between-group variance        0.4832517           0.4828694
## 3         Total variance        1.4356109                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q1 0.3240467             NA
##
## Testing for group differences
## F-statistic: 1264.108 on 9 and 26722 DF. p-value:  0
set.seed(12334)
cr6 <- cluster_gen_2(n1, n_X=2, n_W=2, cor_matrix=m1, rho=c(0.1, 0.2))
summarize_clusters(cr6)
## Summary statistics for all schools
##        q1                q2          q3      q4
##  Min.   :-1.4350   Min.   :-2.4610   1:8     1:6
##  Mean   : 0.8688   Mean   : 0.5610   2:8     2:1
##  Max.   : 2.4866   Max.   : 3.2177   3:1     3:5
##                                      5:1     4:3
##  Stddev.: 0.94     Stddev.: 1.63             5:3
##                                      Prop.
##                                      1:0.444 Prop.
##                                      2:0.444 1:0.333
##                                      3:0.056 2:0.056
##                                      5:0.056 3:0.278
##                                              4:0.167
##                                              5:0.167
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2        q3        q4
## q1 1.0000000 0.3084669 0.1547741 0.1977849
## q2 0.3084669 1.0000000 0.3693761 0.7297771
## q3 0.1547741 0.3693761 1.0000000 0.5372164
## q4 0.1977849 0.7297771 0.5372164 1.0000000
#           q1        q2        q3        q4
# q1 1.0000000 0.3084669 0.1547741 0.1977849
# q2 0.3084669 1.0000000 0.3693761 0.7297771
# q3 0.1547741 0.3693761 1.0000000 0.5372164
# q4 0.1977849 0.7297771 0.5372164 1.0000000

anova_table(cr6) #ICC: 0.03227154   0.5125171
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance        0.8550507          0.85505074
## 2 Between-group variance        0.1710224          0.02851399
## 3         Total variance        0.8751783                  NA
##
## Intraclass correlation
##     Estimated Standard.error
## q1 0.03227154      0.2051913
##
## Testing for group differences
## F-statistic: 1.200086 on 2 and 15 DF. p-value:  0.328498
##
## ANOVA table for schools, q2
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance         1.525205            1.525205
## 2 Between-group variance         1.857731            1.603530
## 3         Total variance         2.657109                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q2 0.5125171      0.3170765
##
## Testing for group differences
## F-statistic: 7.308123 on 2 and 15 DF. p-value:  0.006084282
set.seed(12334)
cr7 <- cluster_gen_2(n7, n_X=1, n_W=2, cor_matrix=m2, rho=0.7)
#           q1        q2        q3
# q1 1.0000000 0.3517538 0.2260297
# q2 0.3517538 1.0000000 0.6761723
# q3 0.2260297 0.6761723 1.0000000
summarize_clusters(cr7)
## Summary statistics for all schools
##        q1          q2       q3
##  Min.   :-6.1056   1:6820   1:9170
##  Mean   :-0.4056   2:7845   2:7171
##  Max.   : 4.9150   3:6612   3:6295
##                    4:2664   4:4096
##  Stddev.: 1.43     5:2791
##                             Prop.
##                    Prop.    1:0.343
##                    1:0.255  2:0.268
##                    2:0.293  3:0.235
##                    3:0.247  4:0.153
##                    4:0.1
##                    5:0.104
##
##
##
##  Heterogeneous correlation matrix
##           q1        q2        q3
## q1 1.0000000 0.3517538 0.2260297
## q2 0.3517538 1.0000000 0.6761723
## q3 0.2260297 0.6761723 1.0000000
anova_table(cr7) #ICC: 0.5220442
## Warning in anova_table(cr7): SE not yet implemented for different sample sizes
##
## ANOVA table for schools, q1
## ANOVA estimators
##                   Source Sample.statistic Population.estimate
## 1  Within-group variance         1.032129            1.032129
## 2 Between-group variance         1.127728            1.127336
## 3         Total variance         2.032199                  NA
##
## Intraclass correlation
##    Estimated Standard.error
## q1 0.5220442             NA
##
## Testing for group differences
## F-statistic: 2878.856 on 9 and 26722 DF. p-value:  0
cr13 <- cluster_gen_2(n13, n_X=1, n_W=2, cor_matrix=m2, rho=0.7)
## Error in check_condition((!is.null(names(n)) | !is.null(names(N))) & (!is.null(cluster_labels) | : object 'n13' not found
#This uses a sample that is not specified, and the error message is:
#Error in check_condition((!is.null(names(n)) | !is.null(names(N))) & (!is.null(cluster_labels) | : object 'n13' not found
wleoncio commented 3 years ago
  1. SE not yet implemented for different sample sizes ANOVA table for schools, q1

This is also pointed out on issue #22. The warning is expected and will not be addressed on the next release.

wleoncio commented 3 years ago
  1. When setting “cor_matrix” and “rho” together, there’re cases when true values and estimated values having a large difference.

This behavior is expected given the small sample size of 10 schools in n7. On average, however, those numbers converge to the expected values. See, for example, how the following code:

sample_size <- 1000
rho_hat <- matrix(nrow=sample_size, ncol=2)
for (i in seq_len(sample_size)) {
    cr1 <- cluster_gen_2(n7, n_X=2, n_W=2, cor_matrix=m1, rho=c(0.1, 0.2))
    rho_hat[i, ] <- c(
        anova_table(cr1, calc.se=FALSE, print=FALSE)$population$q1[[3]],
        anova_table(cr1, calc.se=FALSE, print=FALSE)$population$q2[[3]]
    )
}
apply(rho_hat, 2, mean)

generates the following output (your values may change slightly):

[1] 0.1268647 0.2156342

The code above is basically 1000 replications of the same cr1 generation procedure.

Another example: see how increasing the sample size (instead of the number of replications) also results in converging rhos:

n7_200 <- list(school = 200, student = ranges(10, 50))
cr1_200 <- cluster_gen_2(n7_200, n_X=2, n_W=2, cor_matrix=m1, rho=c(0.1, 0.2))
anova_table(cr1_200, calc.se=FALSE)
ANOVA table for schools, q1
ANOVA estimators
                  Source Sample.statistic Population.estimate
1  Within-group variance        3.6073815           3.6073815
2 Between-group variance        0.7638303           0.6042921
3         Total variance        4.2082910                  NA

Intraclass correlation
   Estimated Standard.error
q1 0.1434803             NA

Testing for group differences
F-statistic: 4.78776 on 199 and 4326 DF. p-value:  2.395365e-86 

ANOVA table for schools, q2
ANOVA estimators
                  Source Sample.statistic Population.estimate
1  Within-group variance        0.6380039           0.6380039
2 Between-group variance        0.2504619           0.2222459
3         Total variance        0.8590057                  NA

Intraclass correlation
   Estimated Standard.error
q2 0.2583505             NA

Testing for group differences
F-statistic: 8.876588 on 199 and 4326 DF. p-value:  8.724305e-201 
wleoncio commented 3 years ago

cr13 <- cluster_gen_2(n13, n_X=1, n_W=2, cor_matrix=m2, rho=0.7) Error in check_condition((!is.null(names(n)) | !is.null(names(N))) & (!is.null(cluster_labels) | : object 'n13' not found

This is expected since the n13 object is missing (it should be created by the user, since it's being manually plugged into cluster_gen_2().