American-Institutes-for-Research / EdSurvey

https://american-institutes-for-research.github.io/EdSurvey/
GNU General Public License v2.0
9 stars 8 forks source link

GetData function subsetting difference #68

Closed burakaydin closed 11 months ago

burakaydin commented 12 months ago

With EdSurvey 4.0.1, the getData function creates a data set with fewer rows than expected. In the example below, I get 42520 rows with getData but 52296 rows when I build the data set manually. I expect these numbers to be equal.

# Download the PISA 2018 international ("INT") database under "~/",
# with cache = FALSE and verbose output switched on.
downloadPISA(root = "~/", years = 2018, database = "INT",
             cache = FALSE, verbose = TRUE)

# Countries to include (three-letter codes).
countries <- c("AUS", "COL", "CZE", "DNK", "GEO", "MLT", "TUR")

# Open the PISA 2018 international database for those countries.
pisa18link <- readPISA(
  path = ".//PISA//2018",
  database = "INT",
  countries = countries
)

# Select relevant PISA data.
# NOTE: getData() is called with its default arguments here (no
# dropOmittedLevels argument is supplied).
pisa18linkdata <- getData(
  pisa18link,
  c(
    # identifiers, subject scales, weights and background variables
    "cnt", "cntryid", "cntschid", "cntstuid",
    "read", "math", "scie", "w_schgrnrabwt", "w_fstuwt",
    "schsize", "escs", "privatesch",
    # school questionnaire items
    sprintf("sc155q%02dha", 1:11),
    sprintf("sc156q%02dha", 1:8),
    # student questionnaire items (st176q04ia is not requested)
    sprintf("st158q%02dha", 1:7),
    sprintf("st176q%02dia", c(1:3, 5:7)),
    sprintf("st216q%02dha", 1:6),
    # ICT questionnaire items
    sprintf("ic150q%02dha", 1:9),
    sprintf("ic152q%02dha", 1:9),
    "age", "st004d01t"
  )
)

# Stack the per-country results into a single data frame.
pisa18linkdata <- as.data.frame(do.call(rbind, pisa18linkdata))

# Read the school data, then keep only the rows for the selected countries
# and the school-level columns needed for the comparison.
pisasch <- data.frame(read_sav("PISA/2018/CY07_MSU_SCH_QQQ.sav"))
pisasch <- pisasch[
  pisasch$CNT %in% countries,
  c(
    "CNTSCHID", "SCHSIZE", "PRIVATESCH", "W_SCHGRNRABWT",
    sprintf("SC155Q%02dHA", 1:11),
    sprintf("SC156Q%02dHA", 1:8)
  )
]
# Read the student data, then keep only the rows for the selected countries
# and the columns of interest: identifiers, age, st004d01t, the student
# weight, questionnaire items, and the ten plausible values per subject.
pisastu <- read_sav("PISA/2018/CY07_MSU_STU_QQQ.sav")
pisastu <- pisastu[
  pisastu$CNT %in% countries,
  c(
    "CNTRYID", "CNT", "CNTSCHID", "CNTSTUID", "AGE", "ST004D01T", "W_FSTUWT",
    sprintf("ST158Q%02dHA", 1:7),
    sprintf("ST176Q%02dIA", c(1:3, 5:7)),  # item 04 is not selected
    sprintf("ST216Q%02dHA", 1:6),
    sprintf("IC150Q%02dHA", 1:9),
    sprintf("IC152Q%02dHA", 1:9),
    paste0("PV", 1:10, "MATH"),
    paste0("PV", 1:10, "READ"),
    paste0("PV", 1:10, "SCIE")
  )
]

# Join student and school records on the school ID. all = TRUE keeps rows
# that have no match on the other side (their missing columns become NA).
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, which would silently change the join type.
pisa18linkdataM <- merge(pisastu, pisasch, by = "CNTSCHID", all = TRUE)
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 11 x64 (build 22000)

Matrix products: default
pdbailey0 commented 11 months ago

@burakaydin did you try setting dropOmittedLevels =FALSE in your getData() call? By default, EdSurvey does listwise deletion (this argument defaults to TRUE).

burakaydin commented 11 months ago

@pdbailey0 Thanks for the reply. I was unsure about the dropOmittedLevels argument and, more generally, about how getData handles missing data, because "dropping omitted levels" did not sound like listwise deletion to me. For a specific school (3600001), I get 17 rows with dropOmittedLevels = FALSE and just 1 row when it is TRUE, even though most of the variables were completely (100%) missing.
Best