Open kevinwang09 opened 4 years ago
school data
Code/2015/processing2015.R
script, but incorrect in the school_bind_rows.Rmd
file. Parsing 2003 student data using the control files is problematic even with the previous code. See the math, read and science columns.
library(tidyverse)
(stu_control <- read_table("~/Desktop/learningtower_masonry/Data/Raw/2003/PISA2003_SPSS_student.txt", skip = 7,
col_names = c("raw")) %>%
slice(1:404) %>%
tidyr::separate(col = raw,
into = c("colnames", "span1", "dash", "span2", "note"),
sep = "\\s+"))
#> Parsed with column specification:
#> cols(
#> raw = col_character()
#> )
#> Warning: Expected 5 pieces. Missing pieces filled with `NA` in 386 rows [6, 7,
#> 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 24, 25, 26, 27, ...].
#> # A tibble: 404 x 5
#> colnames span1 dash span2 note
#> <chr> <chr> <chr> <chr> <chr>
#> 1 /COUNTRY 1 - 3 (a)
#> 2 CNT 4 - 6 (a)
#> 3 SUBNATIO 7 - 10 (a)
#> 4 SCHOOLID 11 - 15 (a)
#> 5 STIDSTD 16 - 20 (a)
#> 6 ST01Q01 22 - 23 <NA>
#> 7 ST02Q02 24 - 25 <NA>
#> 8 ST02Q03 26 - 27 <NA>
#> 9 ST03Q01 28 - 28 <NA>
#> 10 ST04Q01 29 - 29 <NA>
#> # … with 394 more rows
(var_widths <- stu_control %>%
mutate(widths = as.numeric(span2) - as.numeric(span1) + 1,
names = colnames))
#> # A tibble: 404 x 7
#> colnames span1 dash span2 note widths names
#> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
#> 1 /COUNTRY 1 - 3 (a) 3 /COUNTRY
#> 2 CNT 4 - 6 (a) 3 CNT
#> 3 SUBNATIO 7 - 10 (a) 4 SUBNATIO
#> 4 SCHOOLID 11 - 15 (a) 5 SCHOOLID
#> 5 STIDSTD 16 - 20 (a) 5 STIDSTD
#> 6 ST01Q01 22 - 23 <NA> 2 ST01Q01
#> 7 ST02Q02 24 - 25 <NA> 2 ST02Q02
#> 8 ST02Q03 26 - 27 <NA> 2 ST02Q03
#> 9 ST03Q01 28 - 28 <NA> 1 ST03Q01
#> 10 ST04Q01 29 - 29 <NA> 1 ST04Q01
#> # … with 394 more rows
d_stu <- read_fwf(file="~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt",
col_positions = fwf_widths(var_widths$widths,
col_names = as.character(var_widths$names)))
#> Parsed with column specification:
#> cols(
#> .default = col_double(),
#> `/COUNTRY` = col_character(),
#> CNT = col_character(),
#> SUBNATIO = col_character(),
#> SCHOOLID = col_character(),
#> STIDSTD = col_character(),
#> ST02Q02 = col_character(),
#> ST02Q03 = col_character(),
#> ST09Q01 = col_character(),
#> ST15Q04 = col_character(),
#> ST16Q01 = col_character(),
#> ST21Q01 = col_character(),
#> ST29Q01 = col_character(),
#> ST29Q02 = col_character(),
#> ST29Q03 = col_character(),
#> ST29Q04 = col_character(),
#> ST29Q05 = col_character(),
#> ST29Q06 = col_character(),
#> ST33Q01 = col_character(),
#> ST33Q02 = col_character(),
#> ST33Q03 = col_character()
#> # ... with 97 more columns
#> )
#> See spec(...) for full column specifications.
#> Warning: 332 parsing failures.
#> row col expected actual file
#> 141922 UH 1/0/T/F/TRUE/FALSE 2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141923 UH 1/0/T/F/TRUE/FALSE 2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141924 UH 1/0/T/F/TRUE/FALSE 2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141925 UH 1/0/T/F/TRUE/FALSE 2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141926 UH 1/0/T/F/TRUE/FALSE 2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> ...... ... .................. ...... ....................................................................
#> See problems(...) for more details.
stu_qqq <- d_stu %>%
transmute(country_iso3c = CNT,
school_id = SCHOOLID ,
student_id = STIDSTD,
mother_educ = ST11R01,
father_educ = ST13R01,
gender = ST03Q01,
computer = ST17Q04,
internet = ST17Q06,
math = PV1MATH,
read = PV1READ,
science = PV1SCIE,
stu_wgt = W_FSTUWT,
desk = ST17Q01,
room = ST17Q02,
dishwasher = ST17Q13 ,
television = NA_character_,
computer_n = NA_character_ ,
car = NA_character_,
book = ST19Q01,
wealth = NA_real_,
escs = ESCS)
glimpse(stu_qqq)
#> Rows: 276,165
#> Columns: 21
#> $ country_iso3c <chr> "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS",…
#> $ school_id <chr> "00001", "00001", "00001", "00001", "00001", "00001", "…
#> $ student_id <chr> "00001", "00002", "00003", "00004", "00005", "00006", "…
#> $ mother_educ <dbl> 5, 0, 4, 1, 0, 4, 3, 1, 5, 2, 2, 0, 0, 0, 3, 2, 2, 2, 0…
#> $ father_educ <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1…
#> $ gender <dbl> 7, 7, 7, 8, 7, 7, 8, 7, 7, 8, 8, 8, 7, 8, 8, 7, 8, 8, 7…
#> $ computer <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
#> $ internet <dbl> 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1…
#> $ math <chr> "8 418.086", "2 446.985", "8 501.822", "7 506.807", "2 …
#> $ read <chr> "2 360.180", "5 390.178", "7 489.922", "2 555.226", "4 …
#> $ science <chr> "1 442.836", "8 419.233", "8 545.146", "6 522.723", "2 …
#> $ stu_wgt <chr> "0 5.670", "2 18.902", "1 18.902", "3 18.902", "8 …
#> $ desk <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ room <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ dishwasher <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1…
#> $ television <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ computer_n <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ car <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ book <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7…
#> $ wealth <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ escs <chr> "2 -.7507", "1 .7072", "2 -.5363", "2 -.2628",…
Created on 2020-05-24 by the reprex package (v0.3.0)
student data
[ ] 2003 possession of computer should be "ST17Q04", not "ST17Q03" as documented in the Google Sheets and the Code script: https://github.com/gvdr/learningtower_masonry/blob/05a052414d85551f6324a02e65b46123fa755bf8/Code/2003/read_data2003.R#L68.
[x] 2015 possession of desk should be ST011Q01TA, room should be ST011Q02TA, television should be ST012Q01TA, number of computers should be ST012Q06NA, number of cars should be ST012Q02TA, number of books should be ST013Q01TA, which were not recorded in Google Sheets. (KW did this on 23rd May)
[ ] 2015 and 2018 possession of dishwasher is very poorly stored as "12345" coded for "country - dishwasher - no" for each country and each electrical appliance.