Closed ahomansikka closed 7 years ago
Yes, sounds good. A diff would be useful too.
It would be good to add new unit tests (or extend the existing ones) to make sure the offsets are calculated correctly. They should be consistent with Java string indexes: if you call substring(startOffset, endOffset) on the original input string, you should get the text of the corresponding token.
Note that these offsets are NOT the same as those used in the internal C++ API.
Here is the diff with a test case.
```diff
diff --git a/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java b/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java
index dca1c59..9670819 100644
--- a/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java
+++ b/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java
@@ -35,10 +35,14 @@ public class Token {
 
     private final TokenType type;
     private final String text;
+    private final int startOffset;
+    private final int endOffset;
 
-    public Token(TokenType type, String text) {
+    public Token(TokenType type, String text, int startOffset, int endOffset) {
         this.type = type;
         this.text = text;
+        this.startOffset = startOffset;
+        this.endOffset = endOffset;
     }
 
     /**
@@ -55,4 +59,17 @@ public class Token {
         return text;
     }
 
+    /**
+     * @return start offset of token
+     */
+    public int getStartOffset() {
+        return startOffset;
+    }
+
+    /**
+     * @return end offset of token
+     */
+    public int getEndOffset() {
+        return endOffset;
+    }
 }
diff --git a/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java b/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
index 2078f7f..2f65540 100644
--- a/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
+++ b/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
@@ -317,11 +317,11 @@ public class Voikko {
         List<Token> allTokens = new ArrayList<Token>();
         int lastStart = 0;
         for (int i = indexOfSpecialUnknown(text, 0); i != -1; i = indexOfSpecialUnknown(text, i + 1)) {
-            allTokens.addAll(tokensNonNull(text.substring(lastStart, i)));
-            allTokens.add(new Token(TokenType.UNKNOWN, Character.toString(text.charAt(i))));
+            allTokens.addAll(tokensNonNull(text.substring(lastStart, i), lastStart));
+            allTokens.add(new Token(TokenType.UNKNOWN, Character.toString(text.charAt(i)), i, i + 1));
             lastStart = i + 1;
         }
-        allTokens.addAll(tokensNonNull(text.substring(lastStart)));
+        allTokens.addAll(tokensNonNull(text.substring(lastStart),lastStart));
         return allTokens;
     }
 
@@ -336,7 +336,7 @@ public class Voikko {
         return -1;
     }
 
-    private List<Token> tokensNonNull(String text) {
+    private List<Token> tokensNonNull(String text, int lastStart) {
         Libvoikko lib = getLib();
         List<Token> result = new ArrayList<Token>();
         ByteBuffer textBytes = s2bb(text);
@@ -350,7 +350,7 @@ public class Voikko {
             int tokenLen = tokenLenByRef.getValue().intValue();
             TokenType tokenType = TokenType.values()[tokenTypeInt];
             String tokenText = text.substring(textStart, textStart + tokenLen);
-            result.add(new Token(tokenType, tokenText));
+            result.add(new Token(tokenType, tokenText, lastStart + textStart, lastStart + textStart + tokenLen));
             textStart += tokenText.length();
             int tokenBytes = s2n(tokenText).length - 1;
             bytesStart += tokenBytes;
diff --git a/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java b/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
index d71537d..912e8b5 100644
--- a/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
+++ b/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
@@ -567,4 +567,12 @@ public class VoikkoTest {
         assertEquals("\uDC78", tokens.get(1).getText());
     }
 
+    @Test
+    public void offsetsAreOK() {
+        final String s = "Ovatko myös ääkköset oikein?";
+        List<Token> tokens = voikko.tokens(s);
+        for (Token t: tokens) {
+            assertEquals(t.getText(),s.substring(t.getStartOffset(),t.getEndOffset()));
+        }
+    }
 }
```
Thanks!
It would be easier to use libvoikko (Java version) in Solr (https://lucene.apache.org/solr/) if the function Voikko.tokens(String) saved the start and end offsets of the tokens it calculates. If you are interested, I can send the diff.