Open PolMine opened 5 years ago
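The function now works on Windows without crashing. Using the cwb_win repo with cross-compiled CWB utilities (cwb-huffcode.exe here), I checked that RcppCWB and ordinary CWB produce the same result. This is good to know. However, the files generated on Windows and on macOS differ. The Windows files are 11 bytes larger, and in the dump excerpts below each 0x0A byte (00001010) of the macOS file is preceded by an additional 0x0D byte (00001101) in the Windows file (first at offset 0x4a), which is consistent with LF-to-CRLF translation of a file written in text mode rather than binary mode: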
-rw-r--r-- 1 andreasblaette staff 3972 2 Feb 13:11 macos_word.huf.syn
-rw-r--r--@ 1 andreasblaette staff 3983 2 Feb 13:01 cwb_word.huf.syn
-rw-r--r--@ 1 andreasblaette staff 3983 2 Feb 13:01 rcppcwb_word.huf.syn
xxd -b macos_word.huf.syn | less
00000000: 00000000 00000000 00000000 00000000 00000000 00000000 ......
00000006: 00000000 10001101 00000000 00000000 00000001 00100101 .....%
0000000c: 00000000 00000000 00000001 10111101 00000000 00000000 ......
00000012: 00000010 01010001 00000000 00000000 00000010 11011110 .Q....
00000018: 00000000 00000000 00000011 01110100 00000000 00000000 ...t..
0000001e: 00000100 00000010 00000000 00000000 00000100 10010111 ......
00000024: 00000000 00000000 00000101 00100101 00000000 00000000 ...%..
0000002a: 00000101 10110101 00000000 00000000 00000110 01001001 .....I
00000030: 00000000 00000000 00000110 11011011 00000000 00000000 ......
00000036: 00000111 01110111 00000000 00000000 00001000 00001110 .w....
0000003c: 00000000 00000000 00001000 10100000 00000000 00000000 ......
00000042: 00001001 00110111 00000000 00000000 00001001 11001101 .7....
00000048: 00000000 00000000 00001010 01100011 00000000 00000000 ...c..
0000004e: 00001010 11110101 00000000 00000000 00001011 10001000 ......
00000054: 00000000 00000000 00001100 00010110 00000000 00000000 ......
0000005a: 00001100 10100011 00000000 00000000 00001101 00111001 .....9
00000060: 00000000 00000000 00001101 11001101 00000000 00000000 ......
00000066: 00001110 01100101 00000000 00000000 00001110 11111011 .e....
0000006c: 00000000 00000000 00001111 10001110 00000000 00000000 ......
00000072: 00010000 00101101 00000000 00000000 00010000 11000110 .-....
00000078: 00000000 00000000 00010001 01100101 00000000 00000000 ...e..
0000007e: 00010010 00000110 00000000 00000000 00010010 10011111 ......
00000084: 00000000 00000000 00010011 01001001 00000000 00000000 ...I..
0000008a: 00010011 11100000 00000000 00000000 00010100 01110000 .....p
00000090: 00000000 00000000 00010101 00001010 00000000 00000000 ......
00000096: 00010101 10101100 00000000 00000000 00010110 00111111 .....?
0000009c: 00000000 00000000 00010110 11001100 00000000 00000000 ......
000000a2: 00010111 01100010 00000000 00000000 00010111 11111101 .b....
000000a8: 00000000 00000000 00011000 10001111 00000000 00000000 ......
000000ae: 00011001 00100011 00000000 00000000 00011001 10110101 .#....
000000b4: 00000000 00000000 00011010 01001101 00000000 00000000 ...M..
000000ba: 00011010 11100010 00000000 00000000 00011011 01110101 .....u
000000c0: 00000000 00000000 00011100 00001100 00000000 00000000 ......
000000c6: 00011100 10100110 00000000 00000000 00011101 01000000 .....@
000000cc: 00000000 00000000 00011101 11010110 00000000 00000000 ......
000000d2: 00011110 01101001 00000000 00000000 00011110 11111110 .i....
000000d8: 00000000 00000000 00011111 10100001 00000000 00000000 ......
000000de: 00100000 01000111 00000000 00000000 00100000 11100100 G.. .
000000e4: 00000000 00000000 00100001 10001000 00000000 00000000 ..!...
000000ea: 00100010 00101100 00000000 00000000 00100010 10111110 ",..".
xxd -b cwb_word.huf.syn | less
00000000: 00000000 00000000 00000000 00000000 00000000 00000000 ......
00000006: 00000000 10001101 00000000 00000000 00000001 00100101 .....%
0000000c: 00000000 00000000 00000001 10111101 00000000 00000000 ......
00000012: 00000010 01010001 00000000 00000000 00000010 11011110 .Q....
00000018: 00000000 00000000 00000011 01110100 00000000 00000000 ...t..
0000001e: 00000100 00000010 00000000 00000000 00000100 10010111 ......
00000024: 00000000 00000000 00000101 00100101 00000000 00000000 ...%..
0000002a: 00000101 10110101 00000000 00000000 00000110 01001001 .....I
00000030: 00000000 00000000 00000110 11011011 00000000 00000000 ......
00000036: 00000111 01110111 00000000 00000000 00001000 00001110 .w....
0000003c: 00000000 00000000 00001000 10100000 00000000 00000000 ......
00000042: 00001001 00110111 00000000 00000000 00001001 11001101 .7....
00000048: 00000000 00000000 00001101 00001010 01100011 00000000 ....c.
0000004e: 00000000 00001101 00001010 11110101 00000000 00000000 ......
00000054: 00001011 10001000 00000000 00000000 00001100 00010110 ......
0000005a: 00000000 00000000 00001100 10100011 00000000 00000000 ......
00000060: 00001101 00111001 00000000 00000000 00001101 11001101 .9....
00000066: 00000000 00000000 00001110 01100101 00000000 00000000 ...e..
0000006c: 00001110 11111011 00000000 00000000 00001111 10001110 ......
00000072: 00000000 00000000 00010000 00101101 00000000 00000000 ...-..
00000078: 00010000 11000110 00000000 00000000 00010001 01100101 .....e
0000007e: 00000000 00000000 00010010 00000110 00000000 00000000 ......
00000084: 00010010 10011111 00000000 00000000 00010011 01001001 .....I
0000008a: 00000000 00000000 00010011 11100000 00000000 00000000 ......
00000090: 00010100 01110000 00000000 00000000 00010101 00001101 .p....
00000096: 00001010 00000000 00000000 00010101 10101100 00000000 ......
0000009c: 00000000 00010110 00111111 00000000 00000000 00010110 ..?...
000000a2: 11001100 00000000 00000000 00010111 01100010 00000000 ....b.
000000a8: 00000000 00010111 11111101 00000000 00000000 00011000 ......
000000ae: 10001111 00000000 00000000 00011001 00100011 00000000 ....#.
000000b4: 00000000 00011001 10110101 00000000 00000000 00011010 ......
000000ba: 01001101 00000000 00000000 00011010 11100010 00000000 M.....
000000c0: 00000000 00011011 01110101 00000000 00000000 00011100 ..u...
000000c6: 00001100 00000000 00000000 00011100 10100110 00000000 ......
000000cc: 00000000 00011101 01000000 00000000 00000000 00011101 ..@...
000000d2: 11010110 00000000 00000000 00011110 01101001 00000000 ....i.
000000d8: 00000000 00011110 11111110 00000000 00000000 00011111 ......
000000de: 10100001 00000000 00000000 00100000 01000111 00000000 ... G.
000000e4: 00000000 00100000 11100100 00000000 00000000 00100001 . ...!
000000ea: 10001000 00000000 00000000 00100010 00101100 00000000 ...",.
This is an example I have used to understand when and how decoding a compressed corpus crashes. After encoding the REUTERS corpus with compression, cl_cpos2id()
crashes consistently for cpos = 2432, irrespective of the encoding method (CWB or R). The following code reproduces the crash, although I do not yet grasp the pattern behind it:
# Reproduction script: encode the REUTERS example token stream shipped with
# RcppCWB as a compressed p-attribute ("word") in a temporary registry/data
# directory, then look up token ids at corpus positions 2430-2432.
# The trailing comments on the lookup calls appear to record the values seen
# in the original session; the call for cpos = 2432 is the one marked as
# failing.
library(cwbtools)
library(fs)
library(RcppCWB)
# cwb_install()
# Temporary registry and data directories, so nothing outside tempdir() is
# touched.
registry_tmp <- fs::path(tempdir(), "registry")
dir.create (registry_tmp)
data_dir_tmp <- fs::path(tempdir(), "data_dir", "reuters")
dir.create(data_dir_tmp, recursive = TRUE)
# Read the example file line by line into a character vector
# (presumably one token per line -- TODO confirm against reuters.txt).
token_stream <- readLines(system.file(package = "RcppCWB", "extdata", "examples", "reuters.txt"))
# Encode the token stream for corpus "REUTERS" with compress = TRUE, i.e. the
# configuration under investigation. method = "R" selects one of the two
# encoding methods (CWB or R) mentioned in the surrounding report.
p_attribute_encode(
token_stream = token_stream,
registry_dir = registry_tmp,
corpus = "REUTERS",
data_dir = data_dir_tmp,
method = "R",
verbose = TRUE,
quietly = FALSE,
encoding = "utf8",
compress = TRUE
)
# Look up the token id at three consecutive corpus positions; the first two
# return a value, the third is marked as failing.
cl_cpos2id("REUTERS", p_attribute = "word", cpos = 2430, registry = registry_tmp) # 366
cl_cpos2id("REUTERS", p_attribute = "word", cpos = 2431, registry = registry_tmp) # 83
cl_cpos2id("REUTERS", p_attribute = "word", cpos = 2432, registry = registry_tmp) # fails
# Label every token with a 0-based index so that printed tokens carry a
# position label (presumably meant to match corpus positions -- TODO confirm).
names(token_stream) <- as.character(0:(length(token_stream) - 1))
# NOTE(review): R subsetting is 1-based, so this slice prints the elements
# labelled "2429" to "2439"; read the printed names, not the slice bounds.
token_stream[2430:2440]
# Cross-check tokens around the failing position: map each string to its id,
# then list the corpus positions recorded for that id.
cl_str2id(corpus = "REUTERS", p_attribute = "word", str = "emirate's", registry = registry_tmp) # 891
cl_id2cpos(corpus = "REUTERS", p_attribute = "word", id = 891, registry = registry_tmp)
cl_str2id(corpus = "REUTERS", p_attribute = "word", str = "daily", registry = registry_tmp) # 365
cl_id2cpos(corpus = "REUTERS", p_attribute = "word", id = 365, registry = registry_tmp) # among others, 2431
cl_str2id(corpus = "REUTERS", p_attribute = "word", str = "Al", registry = registry_tmp) # 892
cl_id2cpos(corpus = "REUTERS", p_attribute = "word", id = 892, registry = registry_tmp) # 2432
For the time being, the finding is that on Windows cwb_huffcode()
and cwb_compress_rdx()
produce binary files that provoke crashes. So compression is not recommended on Windows. We should add a corresponding note to the documentation, and both functions should issue a message when run on Windows.
Examples for
cwb_huffcode
are wrapped in a "dontrun" section at present, because the function did not pass checks on Windows and Solaris. Quite obviously, this falls short of my ambition to make the package fully portable.