voikko / corevoikko

Libvoikko and essential linguistic resources
Other
89 stars 25 forks source link

libvoikko Java: Save start and end offsets of tokens #27

Closed ahomansikka closed 7 years ago

ahomansikka commented 7 years ago

It would be easier to use libvoikko (Java version) in Solr (https://lucene.apache.org/solr/) if the function Voikko.tokens(String) saved the start and end offsets of the tokens it calculates. If you are interested, I can send the diff.

hatapitk commented 7 years ago

Yes, sounds good. A diff would be useful too.

It would be good to add (or extend existing) unit tests to make sure the offsets are correctly calculated. They should be consistent with Java string indexes: if you call text.substring(startOffset, endOffset) on the original input string, you should get the correct token. These offsets are NOT the same as those used in the internal C++ API.

ahomansikka commented 7 years ago

Here is the diff with a test case.

diff --git a/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java b/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java
index dca1c59..9670819 100644
--- a/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java
+++ b/libvoikko/java/src/main/java/org/puimula/libvoikko/Token.java
@@ -35,10 +35,14 @@ public class Token {

     private final TokenType type;
     private final String text;
+    private final int startOffset;
+    private final int endOffset;

-    public Token(TokenType type, String text) {
+    public Token(TokenType type, String text, int startOffset, int endOffset) {
         this.type = type;
         this.text = text;
+        this.startOffset = startOffset;
+        this.endOffset = endOffset;
     }

     /**
@@ -55,4 +59,17 @@ public class Token {
         return text;
     }

+    /**
+     * @return start offset of token
+     */
+    public int getStartOffset() {
+        return startOffset;
+    }
+
+    /**
+     * @return end offset of token
+     */
+    public int getEndOffset() {
+        return endOffset;
+    }
 }
diff --git a/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java b/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
index 2078f7f..2f65540 100644
--- a/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
+++ b/libvoikko/java/src/main/java/org/puimula/libvoikko/Voikko.java
@@ -317,11 +317,11 @@ public class Voikko {
         List allTokens = new ArrayList();
         int lastStart = 0;
         for (int i = indexOfSpecialUnknown(text, 0); i != -1; i = indexOfSpecialUnknown(text, i + 1)) {
-            allTokens.addAll(tokensNonNull(text.substring(lastStart, i)));
-            allTokens.add(new Token(TokenType.UNKNOWN, Character.toString(text.charAt(i))));
+            allTokens.addAll(tokensNonNull(text.substring(lastStart, i), lastStart));
+            allTokens.add(new Token(TokenType.UNKNOWN, Character.toString(text.charAt(i)), i, i + 1));
             lastStart = i + 1;
         }
-        allTokens.addAll(tokensNonNull(text.substring(lastStart)));
+        allTokens.addAll(tokensNonNull(text.substring(lastStart),lastStart));
         return allTokens;
     }

@@ -336,7 +336,7 @@ public class Voikko {
         return -1;
     }

-    private List tokensNonNull(String text) {
+    private List tokensNonNull(String text, int lastStart) {
         Libvoikko lib = getLib();
         List result = new ArrayList();
         ByteBuffer textBytes = s2bb(text);
@@ -350,7 +350,7 @@ public class Voikko {
             int tokenLen = tokenLenByRef.getValue().intValue();
             TokenType tokenType = TokenType.values()[tokenTypeInt];
             String tokenText = text.substring(textStart, textStart + tokenLen);
-            result.add(new Token(tokenType, tokenText));
+            result.add(new Token(tokenType, tokenText, lastStart + textStart, lastStart + textStart + tokenLen));
             textStart += tokenText.length();
             int tokenBytes = s2n(tokenText).length - 1;
             bytesStart += tokenBytes;
diff --git a/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java b/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
index d71537d..912e8b5 100644
--- a/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
+++ b/libvoikko/java/src/test/java/org/puimula/libvoikko/VoikkoTest.java
@@ -567,4 +567,12 @@ public class VoikkoTest {
         assertEquals("\uDC78", tokens.get(1).getText());
     }

+    @Test
+    public void offsetsAreOK() {
+        final String s = "Ovatko myös ääkköset oikein?";
+        List tokens = voikko.tokens(s);
+        for (Token t: tokens) {
+             assertEquals(t.getText(),s.substring(t.getStartOffset(),t.getEndOffset()));
+        }
+    }
 }
hatapitk commented 7 years ago

Thanks!