Open gagolews opened 7 years ago
I will add support for custom rule-based break iterators in stringi https://github.com/gagolews/stringi/issues/263
For custom dictionary-based break iterators (something that would make it possible to handle Korean better), I'm afraid you'll have to contribute to the ICU4C core library directly - but more users will benefit from this :)
An example use case for rule-based break iteration: http://sujitpal.blogspot.co.uk/2008/05/tokenizing-text-with-icu4js.html -- rules can be specified to handle URLs nicely, among other things.
DONE:
> # example from http://sujitpal.blogspot.co.uk/2008/05/tokenizing-text-with-icu4js.html
> rules <- "
+ !!chain;
+ $VoiceMarks = [\\uff9e\\uff9f];
+ $Format = [\\p{Word_Break = Format}];
+ $Katakana = [\\p{Word_Break = Katakana}-$VoiceMarks];
+ $ALetter = [\\p{Word_Break = ALetter}];
+ $MidLetter = [\\p{Word_Break = MidLetter}];
+ $MidNum = [\\p{Word_Break = MidNum}];
+ $Numeric = [\\p{Word_Break = Numeric}];
+ $ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];
+ $CR = \\u000d;
+ $LF = \\u000a;
+ $Extend = [\\p{Grapheme_Cluster_Break = Extend}$VoiceMarks];
+ $Control = [\\p{Grapheme_Cluster_Break = Control}];
+ $dictionary = [:LineBreak = Complex_Context:];
+ $ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
+ $KatakanaEx = $Katakana ($Extend | $Format)*;
+ $ALetterEx = $ALetterPlus ($Extend | $Format)*;
+ $MidLetterEx = $MidLetter ($Extend | $Format)*;
+ $MidNumEx = $MidNum ($Extend | $Format)*;
+ $NumericEx = $Numeric ($Extend | $Format)*;
+ $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+ $Hiragana = [:Hiragana:];
+ $Ideographic = [:IDEOGRAPHIC:];
+ $HiraganaEx = $Hiragana ($Extend | $Format)*;
+ $IdeographicEx = $Ideographic ($Extend | $Format)*;
+ # ============= Custom Rules ================
+ # Abbreviation: Uppercase alpha chars separated by period and optionally followed by a period
+ $Abbreviation = [A-Z0-9](\\.[A-Z0-9])+(\\.)*;
+ # Hyphenated Word : sequence of letter or digit, (punctuated by - or _, with following letter or digit sequence)+
+ $HyphenatedWord = [A-Za-z0-9]+([\\-_][A-Za-z0-9]+)+;
+ # Email address: sequence of letters, digits and punctuation followed by @ and followed by another sequence
+ $EmailAddress = [A-Za-z0-9_\\-\\.]+\\@[A-Za-z][A-Za-z0-9_]+\\.[a-z]+;
+ # Internet Addresses: http://www.foo.com(/bar)
+ $InternetAddress = [a-z]+\\:\\/\\/[a-z0-9]+(\\.[a-z0-9]+)+(\\/[a-z0-9][a-z0-9\\.]+);
+ # XML markup: A run begins with < and ends with the first matching >
+ $XmlMarkup = \\<[^\\>]+\\>;
+ # Emoticon: A run that starts with :;B8{[ and contains only one or more of the following -=/{})(
+ $Emoticon = [B8\\:\\;\\{\\[][-=\\/\\{\\}\\)\\(]+;
+
+ !!forward;
+ $CR $LF ($Extend | $Format)*;
+ .? ($Extend | $Format)+;
+ $NumericEx {100};
+ $ALetterEx {200};
+ $KatakanaEx {300};
+ $HiraganaEx {300};
+ $IdeographicEx {400};
+ $ALetterEx $ALetterEx {200};
+ $ALetterEx $MidLetterEx $ALetterEx {200};
+ $NumericEx $NumericEx {100};
+ $ALetterEx $Format* $NumericEx {200};
+ $NumericEx $ALetterEx {200};
+ $NumericEx $MidNumEx $NumericEx {100};
+ $KatakanaEx $KatakanaEx {300};
+ $ALetterEx $ExtendNumLetEx {200};
+ $NumericEx $ExtendNumLetEx {100};
+ $KatakanaEx $ExtendNumLetEx {300};
+ $ExtendNumLetEx $ExtendNumLetEx{200};
+ $ExtendNumLetEx $ALetterEx {200};
+ $ExtendNumLetEx $NumericEx {100};
+ $ExtendNumLetEx $KatakanaEx {300};
+ # Custom : Abbreviation
+ $Abbreviation {500};
+ $HyphenatedWord {501};
+ $EmailAddress {502};
+ $InternetAddress {503};
+ $XmlMarkup {504};
+ $Emoticon {505};
+
+ !!reverse;
+ $BackALetterEx = ($Format | $Extend)* $ALetterPlus;
+ $BackNumericEx = ($Format | $Extend)* $Numeric;
+ $BackMidNumEx = ($Format | $Extend)* $MidNum;
+ $BackMidLetterEx = ($Format | $Extend)* $MidLetter;
+ $BackKatakanaEx = ($Format | $Extend)* $Katakana;
+ $BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
+ ($Format | $Extend)* $LF $CR;
+ ($Format | $Extend)* .?;
+ $BackALetterEx $BackALetterEx;
+ $BackALetterEx $BackMidLetterEx $BackALetterEx;
+ $BackNumericEx $BackNumericEx;
+ $BackNumericEx $BackALetterEx;
+ $BackALetterEx $BackNumericEx;
+ $BackNumericEx $BackMidNumEx $BackNumericEx;
+ $BackKatakanaEx $BackKatakanaEx;
+ ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $BackExtendNumLetEx;
+ $BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
+
+ !!safe_reverse;
+ ($Extend | $Format)+ .?;
+ $MidLetter $BackALetterEx;
+ $MidNum $BackNumericEx;
+ $dictionary $dictionary;
+
+ !!safe_forward;
+ ($Extend | $Format)+ .?;
+ $MidLetterEx $ALetterEx;
+ $MidNumEx $NumericEx;
+ $dictionary $dictionary;
+ "
>
> expect_identical(
+ stri_extract_all_boundaries("test1 test2\ntest3\ttest4", skip_word_none = TRUE, type="word"),
+ stri_extract_all_boundaries("test1 test2\ntest3\ttest4", skip_word_none = TRUE, type=rules)
+ )
>
> x <- "
+ Jaguar will sell its new XJ-6 model in the U.S. for a small fortune :-).
+ Expect to pay around USD 120ks. Custom options can set you back another
+ few 10,000 dollars. For details, go to <a href=\"http://www.jaguar.com/sales\"
+ alt=\"Click here\">Jaguar Sales</a> or contact xj-6@jaguar.com.
+ "
> stri_extract_all_boundaries(x, skip_word_none = TRUE, type="word")
[[1]]
[1] "Jaguar" "will" "sell" "its" "new"
[6] "XJ" "6" "model" "in" "the"
[11] "U.S" "for" "a" "small" "fortune"
[16] "Expect" "to" "pay" "around" "USD"
[21] "120ks" "Custom" "options" "can" "set"
[26] "you" "back" "another" "few" "10,000"
[31] "dollars" "For" "details" "go" "to"
[36] "a" "href" "http" "www.jaguar.com" "sales"
[41] "alt" "Click" "here" "Jaguar" "Sales"
[46] "a" "or" "contact" "xj" "6"
[51] "jaguar.com"
> stri_extract_all_boundaries(x, skip_word_none = TRUE, type=rules)
[[1]]
[1] "Jaguar"
[2] "will"
[3] "sell"
[4] "its"
[5] "new"
[6] "XJ-6"
[7] "model"
[8] "in"
[9] "the"
[10] "U.S."
[11] "for"
[12] "a"
[13] "small"
[14] "fortune"
[15] ":-)"
[16] "Expect"
[17] "to"
[18] "pay"
[19] "around"
[20] "USD"
[21] "120ks"
[22] "Custom"
[23] "options"
[24] "can"
[25] "set"
[26] "you"
[27] "back"
[28] "another"
[29] "few"
[30] "10,000"
[31] "dollars"
[32] "For"
[33] "details"
[34] "go"
[35] "to"
[36] "<a href=\"http://www.jaguar.com/sales\"\nalt=\"Click here\">"
[37] "Jaguar"
[38] "Sales"
[39] "</a>"
[40] "or"
[41] "contact"
[42] "xj-6@jaguar.com"
Korean is not supported as well by ICU as it is by KoNLP. This area needs contributions.
`국민은` -> `국민` (noun) + `은` (postposition).
`은` can vary between sentences. Tokenizing Korean means dropping the postposition correctly.
A simple dictionary-based method could be:
> library(KoNLP)
> library(stringi)
>
> txt <- "국민은 일회용 컵과 비닐의 사용을 억제하고 산업계는
+ 제품 생산 단계부터 순환 가치를 고려하는 생산 구조로 전환하는 방안도 제시했다."
>
> stri_extract_all_boundaries(txt, skip_word_none = TRUE, type="word")
[[1]]
[1] "국민은" "일회용" "컵과" "비닐의" "사용을" "억제하고" "산업계는" "제품"
[9] "생산" "단계부터" "순환" "가치를" "고려하는" "생산" "구조로" "전환하는"
[17] "방안도" "제시했다"
>
> extractNoun(txt)
[1] "국민" "일회용" "컵" "비닐" "사용" "억제" "산업계" "제품" "생산" "단계"
[11] "순환" "가치" "고려" "하" "생산" "구조" "전환" "하" "방안" "제시"
ICU allows for specifying arbitrary boundary rules based on a regex-like syntax. Moreover, it supports dictionary-based break iteration with dictionaries specified by users. I haven't created any interface to that in stringi yet. Should I do it? Are there any interesting use cases for that? References: http://userguide.icu-project.org/boundaryanalysis http://www.unicode.org/reports/tr29/