Open chainsawriot opened 1 year ago
This is my proposed fix to #170: add a `sep` parameter that most users don't need to care about.
sep
And it works. If you like this proposal, I would then add tests to it and turn it into a real PR.
content <- "number;country;platform;tandem;csystem_dummy;msystem_rank;csystem_rank;arena_rank;issdriven;prefdriven;charcount;ic_ordinal;ic_categorized;post;main_language;languages;is_reliable;hashtag_count;is_redacted;WC 1;GER;FB II;JJ/SG;1;2;3;2;0;1;16;1;1;Merkel sei Dank!;un;(('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0));False;0;False;3 2;GER;FB II;JJ/SG;1;2;3;2;0;1;125;2;2;Es war ya auch der Islam der 6 Millionen Jüdische Menschen systematisch etmordet hat. Das hat ya nichts mit euch zu tun sorry;de;(('GERMAN', 'de', 99, 867.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0));True;0;False;23 3;GER;TW;JJ/JW;1;2;3;1;0;1;41;0;0;Mijn maag keert ! #zaventem #grenzendicht;nl;(('DUTCH', 'nl', 97, 404.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0));True;2;False;5" temp_csv <- tempfile(fileext = ".csv") writeLines(content, temp_csv) data.table::fread(temp_csv) #> number country platform tandem csystem_dummy msystem_rank csystem_rank #> 1: 1 GER FB II JJ/SG 1 2 3 #> 2: 2 GER FB II JJ/SG 1 2 3 #> 3: 3 GER TW JJ/JW 1 2 3 #> arena_rank issdriven prefdriven charcount ic_ordinal ic_categorized #> 1: 2 0 1 16 1 1 #> 2: 2 0 1 125 2 2 #> 3: 1 0 1 41 0 0 #> post #> 1: Merkel sei Dank! #> 2: Es war ya auch der Islam der 6 Millionen Jüdische Menschen systematisch etmordet hat. Das hat ya nichts mit euch zu tun sorry #> 3: Mijn maag keert ! 
#zaventem #grenzendicht #> main_language #> 1: un #> 2: de #> 3: nl #> languages #> 1: (('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0)) #> 2: (('GERMAN', 'de', 99, 867.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0)) #> 3: (('DUTCH', 'nl', 97, 404.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0)) #> is_reliable hashtag_count is_redacted WC #> 1: FALSE 0 FALSE 3 #> 2: TRUE 0 FALSE 23 #> 3: TRUE 2 FALSE 5 data.table::fread(temp_csv, sep = ";") #> number country platform tandem csystem_dummy msystem_rank csystem_rank #> 1: 1 GER FB II JJ/SG 1 2 3 #> 2: 2 GER FB II JJ/SG 1 2 3 #> 3: 3 GER TW JJ/JW 1 2 3 #> arena_rank issdriven prefdriven charcount ic_ordinal ic_categorized #> 1: 2 0 1 16 1 1 #> 2: 2 0 1 125 2 2 #> 3: 1 0 1 41 0 0 #> post #> 1: Merkel sei Dank! #> 2: Es war ya auch der Islam der 6 Millionen Jüdische Menschen systematisch etmordet hat. Das hat ya nichts mit euch zu tun sorry #> 3: Mijn maag keert ! #zaventem #grenzendicht #> main_language #> 1: un #> 2: de #> 3: nl #> languages #> 1: (('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0)) #> 2: (('GERMAN', 'de', 99, 867.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0)) #> 3: (('DUTCH', 'nl', 97, 404.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0)) #> is_reliable hashtag_count is_redacted WC #> 1: FALSE 0 FALSE 3 #> 2: TRUE 0 FALSE 23 #> 3: TRUE 2 FALSE 5 readtext::readtext(temp_csv, text_field = "post") #> Warning in data.table::fread(input = path, data.table = FALSE, stringsAsFactors #> = FALSE, : Detected 1 column names but the data has 12 columns (i.e. invalid #> file). Added 11 extra default column names at the end. #> Error in sort_fields(result, path, text_field): There is no field called post in file /tmp/Rtmp4DlH2c/file6131b3cf7cb2a.csv. readtext::readtext(temp_csv, text_field = "post", sep = ";") #> readtext object consisting of 3 documents and 19 docvars. 
#> # Description: df [3 × 21] #> doc_id text number country platfo… tandem csyste… msyste… csyste… arena_… #> <chr> <chr> <int> <chr> <chr> <chr> <int> <int> <int> <int> #> 1 file6… "\"M… 1 GER FB II JJ/SG 1 2 3 2 #> 2 file6… "\"E… 2 GER FB II JJ/SG 1 2 3 2 #> 3 file6… "\"M… 3 GER TW JJ/JW 1 2 3 1 #> # … with 11 more variables: issdriven <int>, prefdriven <int>, charcount <int>, #> # ic_ordinal <int>, ic_categorized <int>, main_language <chr>, #> # languages <chr>, is_reliable <lgl>, hashtag_count <int>, is_redacted <lgl>, #> # WC <int> sessionInfo() #> R version 4.2.2 Patched (2022-11-10 r83330) #> Platform: x86_64-pc-linux-gnu (64-bit) #> Running under: Ubuntu 20.04.5 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0 #> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0 #> #> locale: #> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C #> [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=en_US.UTF-8 #> [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8 #> [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C #> [9] LC_ADDRESS=C LC_TELEPHONE=C #> [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> loaded via a namespace (and not attached): #> [1] knitr_1.39 magrittr_2.0.3 R.cache_0.16.0 R6_2.5.1 #> [5] rlang_1.0.6 fastmap_1.1.0 fansi_1.0.3 httr_1.4.4 #> [9] stringr_1.4.1 styler_1.7.0 highr_0.9 tools_4.2.2 #> [13] data.table_1.14.6 xfun_0.31 R.oo_1.25.0 utf8_1.2.2 #> [17] cli_3.4.1 withr_2.5.0 htmltools_0.5.3 yaml_2.3.6 #> [21] digest_0.6.30 tibble_3.1.8 lifecycle_1.0.3 purrr_0.3.5 #> [25] vctrs_0.5.1 R.utils_2.12.0 fs_1.5.2 glue_1.6.2 #> [29] evaluate_0.15 rmarkdown_2.14 reprex_2.0.1 stringi_1.7.8 #> [33] compiler_4.2.2 pillar_1.8.1 R.methodsS3_1.8.2 readtext_0.81 #> [37] pkgconfig_2.0.3
Created on 2022-12-01 by the reprex package (v2.0.1)
This is my proposed fix to #170: add a `sep` parameter that most users don't need to care about. And it works. If you like this proposal, I would then add tests to it and turn it into a real PR.
Created on 2022-12-01 by the reprex package (v2.0.1)