quanteda / readtext

an R package for reading text files
https://readtext.quanteda.io
120 stars 28 forks source link

add `sep` parameter to `readtext` #171

Open chainsawriot opened 1 year ago

chainsawriot commented 1 year ago

This is my proposed fix to #170: add a `sep` parameter that most users won't need to care about.

And it works. If you like this proposal, I will add tests and turn it into a real PR.

content <- "number;country;platform;tandem;csystem_dummy;msystem_rank;csystem_rank;arena_rank;issdriven;prefdriven;charcount;ic_ordinal;ic_categorized;post;main_language;languages;is_reliable;hashtag_count;is_redacted;WC
1;GER;FB II;JJ/SG;1;2;3;2;0;1;16;1;1;Merkel sei Dank!;un;(('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0));False;0;False;3
2;GER;FB II;JJ/SG;1;2;3;2;0;1;125;2;2;Es war ya auch der Islam der 6 Millionen Jüdische Menschen systematisch etmordet hat. Das hat ya nichts mit euch zu tun sorry;de;(('GERMAN', 'de', 99, 867.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0));True;0;False;23
3;GER;TW;JJ/JW;1;2;3;1;0;1;41;0;0;Mijn maag keert ! #zaventem #grenzendicht;nl;(('DUTCH', 'nl', 97, 404.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0));True;2;False;5"

temp_csv <- tempfile(fileext = ".csv")
writeLines(content, temp_csv)
data.table::fread(temp_csv)
#>    number country platform tandem csystem_dummy msystem_rank csystem_rank
#> 1:      1     GER    FB II  JJ/SG             1            2            3
#> 2:      2     GER    FB II  JJ/SG             1            2            3
#> 3:      3     GER       TW  JJ/JW             1            2            3
#>    arena_rank issdriven prefdriven charcount ic_ordinal ic_categorized
#> 1:          2         0          1        16          1              1
#> 2:          2         0          1       125          2              2
#> 3:          1         0          1        41          0              0
#>                                                                                                                             post
#> 1:                                                                                                              Merkel sei Dank!
#> 2: Es war ya auch der Islam der 6 Millionen Jüdische Menschen systematisch etmordet hat. Das hat ya nichts mit euch zu tun sorry
#> 3:                                                                                     Mijn maag keert ! #zaventem #grenzendicht
#>    main_language
#> 1:            un
#> 2:            de
#> 3:            nl
#>                                                                              languages
#> 1:   (('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
#> 2: (('GERMAN', 'de', 99, 867.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
#> 3:  (('DUTCH', 'nl', 97, 404.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
#>    is_reliable hashtag_count is_redacted WC
#> 1:       FALSE             0       FALSE  3
#> 2:        TRUE             0       FALSE 23
#> 3:        TRUE             2       FALSE  5
data.table::fread(temp_csv, sep = ";")
#>    number country platform tandem csystem_dummy msystem_rank csystem_rank
#> 1:      1     GER    FB II  JJ/SG             1            2            3
#> 2:      2     GER    FB II  JJ/SG             1            2            3
#> 3:      3     GER       TW  JJ/JW             1            2            3
#>    arena_rank issdriven prefdriven charcount ic_ordinal ic_categorized
#> 1:          2         0          1        16          1              1
#> 2:          2         0          1       125          2              2
#> 3:          1         0          1        41          0              0
#>                                                                                                                             post
#> 1:                                                                                                              Merkel sei Dank!
#> 2: Es war ya auch der Islam der 6 Millionen Jüdische Menschen systematisch etmordet hat. Das hat ya nichts mit euch zu tun sorry
#> 3:                                                                                     Mijn maag keert ! #zaventem #grenzendicht
#>    main_language
#> 1:            un
#> 2:            de
#> 3:            nl
#>                                                                              languages
#> 1:   (('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
#> 2: (('GERMAN', 'de', 99, 867.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
#> 3:  (('DUTCH', 'nl', 97, 404.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
#>    is_reliable hashtag_count is_redacted WC
#> 1:       FALSE             0       FALSE  3
#> 2:        TRUE             0       FALSE 23
#> 3:        TRUE             2       FALSE  5
readtext::readtext(temp_csv, text_field = "post")
#> Warning in data.table::fread(input = path, data.table = FALSE, stringsAsFactors
#> = FALSE, : Detected 1 column names but the data has 12 columns (i.e. invalid
#> file). Added 11 extra default column names at the end.
#> Error in sort_fields(result, path, text_field): There is no field called post in file /tmp/Rtmp4DlH2c/file6131b3cf7cb2a.csv.
readtext::readtext(temp_csv, text_field = "post", sep = ";")
#> readtext object consisting of 3 documents and 19 docvars.
#> # Description: df [3 × 21]
#>   doc_id text  number country platfo… tandem csyste… msyste… csyste… arena_…
#>   <chr>  <chr>  <int> <chr>   <chr>   <chr>    <int>   <int>   <int>   <int>
#> 1 file6… "\"M…      1 GER     FB II   JJ/SG        1       2       3       2
#> 2 file6… "\"E…      2 GER     FB II   JJ/SG        1       2       3       2
#> 3 file6… "\"M…      3 GER     TW      JJ/JW        1       2       3       1
#> # … with 11 more variables: issdriven <int>, prefdriven <int>, charcount <int>,
#> #   ic_ordinal <int>, ic_categorized <int>, main_language <chr>,
#> #   languages <chr>, is_reliable <lgl>, hashtag_count <int>, is_redacted <lgl>,
#> #   WC <int>

sessionInfo()
#> R version 4.2.2 Patched (2022-11-10 r83330)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.5 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> loaded via a namespace (and not attached):
#>  [1] knitr_1.39        magrittr_2.0.3    R.cache_0.16.0    R6_2.5.1         
#>  [5] rlang_1.0.6       fastmap_1.1.0     fansi_1.0.3       httr_1.4.4       
#>  [9] stringr_1.4.1     styler_1.7.0      highr_0.9         tools_4.2.2      
#> [13] data.table_1.14.6 xfun_0.31         R.oo_1.25.0       utf8_1.2.2       
#> [17] cli_3.4.1         withr_2.5.0       htmltools_0.5.3   yaml_2.3.6       
#> [21] digest_0.6.30     tibble_3.1.8      lifecycle_1.0.3   purrr_0.3.5      
#> [25] vctrs_0.5.1       R.utils_2.12.0    fs_1.5.2          glue_1.6.2       
#> [29] evaluate_0.15     rmarkdown_2.14    reprex_2.0.1      stringi_1.7.8    
#> [33] compiler_4.2.2    pillar_1.8.1      R.methodsS3_1.8.2 readtext_0.81    
#> [37] pkgconfig_2.0.3

Created on 2022-12-01 by the reprex package (v2.0.1)