gvdr / learningtower_masonry

Infrastructure to build the learningtower
GNU General Public License v3.0
0 stars 0 forks source link

Collection of issues on manually cleaning student and school data #7

Open kevinwang09 opened 4 years ago

kevinwang09 commented 4 years ago

student data

kevinwang09 commented 4 years ago

school data

kevinwang09 commented 4 years ago

Parsing 2003 student data using the control files is problematic even with the previous code. See the math, read and science columns.

library(tidyverse)

(stu_control <- read_table("~/Desktop/learningtower_masonry/Data/Raw/2003/PISA2003_SPSS_student.txt", skip = 7,
                          col_names = c("raw")) %>%
    slice(1:404) %>%
    tidyr::separate(col = raw,
                    into = c("colnames", "span1", "dash", "span2", "note"),
                    sep = "\\s+"))
#> Parsed with column specification:
#> cols(
#>   raw = col_character()
#> )
#> Warning: Expected 5 pieces. Missing pieces filled with `NA` in 386 rows [6, 7,
#> 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 24, 25, 26, 27, ...].
#> # A tibble: 404 x 5
#>    colnames span1 dash  span2 note 
#>    <chr>    <chr> <chr> <chr> <chr>
#>  1 /COUNTRY 1     -     3     (a)  
#>  2 CNT      4     -     6     (a)  
#>  3 SUBNATIO 7     -     10    (a)  
#>  4 SCHOOLID 11    -     15    (a)  
#>  5 STIDSTD  16    -     20    (a)  
#>  6 ST01Q01  22    -     23    <NA> 
#>  7 ST02Q02  24    -     25    <NA> 
#>  8 ST02Q03  26    -     27    <NA> 
#>  9 ST03Q01  28    -     28    <NA> 
#> 10 ST04Q01  29    -     29    <NA> 
#> # … with 394 more rows

(var_widths <- stu_control %>%
    mutate(widths = as.numeric(span2) - as.numeric(span1) + 1, 
           names = colnames))
#> # A tibble: 404 x 7
#>    colnames span1 dash  span2 note  widths names   
#>    <chr>    <chr> <chr> <chr> <chr>  <dbl> <chr>   
#>  1 /COUNTRY 1     -     3     (a)        3 /COUNTRY
#>  2 CNT      4     -     6     (a)        3 CNT     
#>  3 SUBNATIO 7     -     10    (a)        4 SUBNATIO
#>  4 SCHOOLID 11    -     15    (a)        5 SCHOOLID
#>  5 STIDSTD  16    -     20    (a)        5 STIDSTD 
#>  6 ST01Q01  22    -     23    <NA>       2 ST01Q01 
#>  7 ST02Q02  24    -     25    <NA>       2 ST02Q02 
#>  8 ST02Q03  26    -     27    <NA>       2 ST02Q03 
#>  9 ST03Q01  28    -     28    <NA>       1 ST03Q01 
#> 10 ST04Q01  29    -     29    <NA>       1 ST04Q01 
#> # … with 394 more rows

d_stu <- read_fwf(file="~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt", 
                  col_positions = fwf_widths(var_widths$widths, 
                                             col_names = as.character(var_widths$names)))
#> Parsed with column specification:
#> cols(
#>   .default = col_double(),
#>   `/COUNTRY` = col_character(),
#>   CNT = col_character(),
#>   SUBNATIO = col_character(),
#>   SCHOOLID = col_character(),
#>   STIDSTD = col_character(),
#>   ST02Q02 = col_character(),
#>   ST02Q03 = col_character(),
#>   ST09Q01 = col_character(),
#>   ST15Q04 = col_character(),
#>   ST16Q01 = col_character(),
#>   ST21Q01 = col_character(),
#>   ST29Q01 = col_character(),
#>   ST29Q02 = col_character(),
#>   ST29Q03 = col_character(),
#>   ST29Q04 = col_character(),
#>   ST29Q05 = col_character(),
#>   ST29Q06 = col_character(),
#>   ST33Q01 = col_character(),
#>   ST33Q02 = col_character(),
#>   ST33Q03 = col_character()
#>   # ... with 97 more columns
#> )
#> See spec(...) for full column specifications.
#> Warning: 332 parsing failures.
#>    row col           expected actual                                                                 file
#> 141922  UH 1/0/T/F/TRUE/FALSE      2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141923  UH 1/0/T/F/TRUE/FALSE      2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141924  UH 1/0/T/F/TRUE/FALSE      2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141925  UH 1/0/T/F/TRUE/FALSE      2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> 141926  UH 1/0/T/F/TRUE/FALSE      2 '~/Desktop/learningtower_masonry/Data/Raw/2003/INT_stui_2003_v2.txt'
#> ...... ... .................. ...... ....................................................................
#> See problems(...) for more details.

stu_qqq <- d_stu %>% 
  transmute(country_iso3c = CNT,
         school_id = SCHOOLID ,
         student_id = STIDSTD,
         mother_educ = ST11R01,
         father_educ = ST13R01,
         gender = ST03Q01,
         computer = ST17Q04,
         internet = ST17Q06,
         math = PV1MATH,
         read = PV1READ,
         science = PV1SCIE,
         stu_wgt = W_FSTUWT,
         desk = ST17Q01,
         room = ST17Q02,
         dishwasher = ST17Q13 ,
         television = NA_character_,
         computer_n = NA_character_ ,
         car = NA_character_,
         book = ST19Q01,
         wealth = NA_real_,
         escs = ESCS)

glimpse(stu_qqq)
#> Rows: 276,165
#> Columns: 21
#> $ country_iso3c <chr> "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS",…
#> $ school_id     <chr> "00001", "00001", "00001", "00001", "00001", "00001", "…
#> $ student_id    <chr> "00001", "00002", "00003", "00004", "00005", "00006", "…
#> $ mother_educ   <dbl> 5, 0, 4, 1, 0, 4, 3, 1, 5, 2, 2, 0, 0, 0, 3, 2, 2, 2, 0…
#> $ father_educ   <dbl> 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1…
#> $ gender        <dbl> 7, 7, 7, 8, 7, 7, 8, 7, 7, 8, 8, 8, 7, 8, 8, 7, 8, 8, 7…
#> $ computer      <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
#> $ internet      <dbl> 2, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 1…
#> $ math          <chr> "8 418.086", "2 446.985", "8 501.822", "7 506.807", "2 …
#> $ read          <chr> "2 360.180", "5 390.178", "7 489.922", "2 555.226", "4 …
#> $ science       <chr> "1 442.836", "8 419.233", "8 545.146", "6 522.723", "2 …
#> $ stu_wgt       <chr> "0   5.670", "2  18.902", "1  18.902", "3  18.902", "8 …
#> $ desk          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ room          <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ dishwasher    <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1…
#> $ television    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ computer_n    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ car           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ book          <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7…
#> $ wealth        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
#> $ escs          <chr> "2   -.7507", "1    .7072", "2   -.5363", "2   -.2628",…

Created on 2020-05-24 by the reprex package (v0.3.0)