apache / lucene

Apache Lucene open-source search software
https://lucene.apache.org/
Apache License 2.0
2.67k stars 1.03k forks source link

Make a CJKAnalyzer that uses Trigrams instead of Bigrams #11853

Open YmBIgo opened 2 years ago

YmBIgo commented 2 years ago

Description

Hi. I want to make a Trigram filter for the CJK Analyzer, since I am Japanese. I have written some simple code to achieve it, shown below, inspired by the original CJKBigramFilter.java file. It works fine in my environment.

But I still cannot work out how the conditional branching for flushUnigram (and, if necessary, flushBigram) should interact with the added flushTrigram method.

Could someone explain how and why I should call flushUnigram in the if (offsetAtt.startOffset() != lastEndOffset) branch (case 3)?

Thank you.

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.cjk;

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;

/**
 * Forms bigrams of CJK terms that are generated from StandardTokenizer or ICUTokenizer.
 *
 * <p>CJK types are set by these tokenizers, but you can also use {@link
 * #CJKBigramFilter(TokenStream, int)} to explicitly control which of the CJK scripts are turned
 * into bigrams.
 *
 * <p>By default, when a CJK character has no adjacent characters to form a bigram, it is output in
 * unigram form. If you want to always output both unigrams and bigrams, set the <code>
 * outputUnigrams</code> flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
 * This can be used for a combined unigram+bigram approach.
 *
 * <p>Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries. Korean Hangul
 * characters are treated the same as many other scripts' letters, and as a result,
 * StandardTokenizer can produce tokens that mix Hangul and non-Hangul characters, e.g. "한국abc".
 * Such mixed-script tokens are typed as <code>&lt;ALPHANUM&gt;</code> rather than <code>
 * &lt;HANGUL&gt;</code>, and as a result, will not be converted to bigrams by CJKBigramFilter.
 *
 * <p>In all cases, all non-CJK input is passed thru unmodified.
 */
public final class CJKTrigramFilter extends TokenFilter {
  // configuration
  /** bigram flag for Han Ideographs */
  public static final int HAN = 1;
  /** bigram flag for Hiragana */
  public static final int HIRAGANA = 2;
  /** bigram flag for Katakana */
  public static final int KATAKANA = 4;
  /** bigram flag for Hangul */
  public static final int HANGUL = 8;

  /** when we emit a bigram, it's then marked as this type */
  public static final String DOUBLE_TYPE = "<DOUBLE>";
  /** when we emit a unigram, it's then marked as this type */
  public static final String SINGLE_TYPE = "<SINGLE>";
  /*  */
+  public static final String TRIPLE_TYPE = "<TRIPLE>";

  // the types from standardtokenizer
  private static final String HAN_TYPE =
      StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE =
      StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE =
      StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // sentinel value for ignoring a script
  private static final Object NO = new Object();

  // these are set to either their type or NO if we want to pass them thru
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  // true if we should output unigram tokens always
  private final boolean outputUnigrams;
  private boolean ngramState; // false = output unigram, true = output bigram

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

  // buffers containing codepoint and offsets in parallel
+   int[] buffer = new int[12];
+   int[] startOffset = new int[12];
+   int[] endOffset = new int[12];
  // length of valid buffer
  int bufferLen;
  // current buffer index
  int index;

  // the last end offset, to determine if we should bigram across tokens
  int lastEndOffset;

  private boolean exhausted;

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int) CJKBigramFilter(in, HAN |
   * HIRAGANA | KATAKANA | HANGUL)}
   */
  public CJKTrigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean) CJKBigramFilter(in,
   * flags, false)}
   */
  public CJKTrigramFilter(TokenStream in, int flags) {
    this(in, flags, false);
  }

  /**
   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, and whether
   * or not unigrams should also be output.
   *
   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
   *     {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
   * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
   *     when this is false, this is only done when there are no adjacent characters to form a
   *     bigram.
   */
  public CJKTrigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
    super(in);
    doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
    doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
    doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
    doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
    this.outputUnigrams = outputUnigrams;
  }

  /*
   * much of this complexity revolves around handling the special case of a
   * "lone cjk character" where cjktokenizer would output a unigram. this
   * is also the only time we ever have to captureState.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
+    if (hasBufferedTrigram()) {

        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        if (outputUnigrams) {

          // when also outputting unigrams, we output the unigram first,
          // then rewind back to revisit the bigram.
          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
          // the logic in hasBufferedUnigram ensures we output the C,
          // even though it did actually have adjacent CJK characters.

          if (ngramState) {
+            flushTrigram();
          } else {
            flushUnigram();
            index--;
          }
          ngramState = !ngramState;
        } else {
+          flushTrigram();
        }
        return true;
      } else if (doNext()) {

        // case 2: look at the token type. should we form any n-grams?

        String type = typeAtt.type();
        if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {

          // acceptable CJK type: we form n-grams from these.
          // as long as the offsets are aligned, we just add these to our current buffer.
          // otherwise, we clear the buffer and start over.

          if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
            if (hasBufferedUnigram()) {

              // we have a buffered unigram, and we peeked ahead to see if we could form
              // a bigram, but we can't, because the offsets are unaligned. capture the state
              // of this peeked data to be revisited next time thru the loop, and dump our unigram.

              loneState = captureState();
              flushUnigram();
              return true;
            }
            index = 0;
            bufferLen = 0;
          }
          refill();
        } else {

          // not a CJK type: we just return these as-is.

          if (hasBufferedUnigram()) {

            // we have a buffered unigram, and we peeked ahead to see if we could form
            // a bigram, but we can't, because it's not a CJK type. capture the state
            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

            loneState = captureState();
            flushUnigram();
            return true;
          }
          return true;
        }
      } else {

        // case 3: we have only zero or 1 codepoints buffered,
        // so not enough to form a bigram. But, we also have no
        // more input. So if we have a buffered codepoint, emit
        // a unigram, otherwise, it's end of stream.

        if (hasBufferedUnigram()) {
          flushUnigram(); // flush our remaining unigram
          return true;
        }
+        if (hasBufferedBigram()) {
+          flushBigram();
+          return true;
+        }
        return false;
      }
    }
  }

  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

  /** looks at next input token, returning false is none is available */
  private boolean doNext() throws IOException {
    if (loneState != null) {
      restoreState(loneState);
      loneState = null;
      return true;
    } else {
      if (exhausted) {
        return false;
      } else if (input.incrementToken()) {
        return true;
      } else {
        exhausted = true;
        return false;
      }
    }
  }

  /** refills buffers with new data from the current token. */
  private void refill() {
    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint
    if (bufferLen > 64) {
      int last = bufferLen - 1;
      buffer[0] = buffer[last];
      startOffset[0] = startOffset[last];
      endOffset[0] = endOffset[last];
      bufferLen = 1;
      index -= last;
    }

    char[] termBuffer = termAtt.buffer();
    int len = termAtt.length();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();

    int newSize = bufferLen + len;
    buffer = ArrayUtil.grow(buffer, newSize);
    startOffset = ArrayUtil.grow(startOffset, newSize);
    endOffset = ArrayUtil.grow(endOffset, newSize);
    lastEndOffset = end;

    if (end - start != len) {
      // crazy offsets (modified by synonym or charfilter): just preserve
      for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        startOffset[bufferLen] = start;
        endOffset[bufferLen] = end;
        bufferLen++;
      }
    } else {
      // normal offsets
      for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        cpLen = Character.charCount(cp);
        startOffset[bufferLen] = start;
        start = endOffset[bufferLen] = start + cpLen;
        bufferLen++;
      }
    }
  }

+  /**
+    Flushes a trigram token 
+  */ 
+  private void flushTrigram() {
+    clearAttributes();
+    char[] termBuffer = termAtt.resizeBuffer(6);
+    int len1 = Character.toChars(buffer[index], termBuffer, 0);
+    int len2 = len1 + Character.toChars(buffer[index + 1], termBuffer, len1);
+    int len3 = len2 + Character.toChars(buffer[index + 2], termBuffer, len2);
+    termAtt.setLength(len3);
+    offsetAtt.setOffset(startOffset[index], endOffset[index+2]);
+    typeAtt.setType(TRIPLE_TYPE);
+    if (outputUnigrams) {
+      posIncAtt.setPositionIncrement(0);
+      posLengthAtt.setPositionLength(2);
+    }
+    index++;
+  }

  /**
   * Flushes a bigram token to output from our buffer This is the normal case, e.g. ABC -&gt; AB BC
   */
  private void flushBigram() {
    clearAttributes();
    char[] termBuffer =
        termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
    int len1 = Character.toChars(buffer[index], termBuffer, 0);
    int len2 = len1 + Character.toChars(buffer[index + 1], termBuffer, len1);
    termAtt.setLength(len2);
    offsetAtt.setOffset(startOffset[index], endOffset[index + 1]);
    typeAtt.setType(DOUBLE_TYPE);
    // when outputting unigrams, all bigrams are synonyms that span two unigrams
    if (outputUnigrams) {
      posIncAtt.setPositionIncrement(0);
      posLengthAtt.setPositionLength(2);
    }
    index++;
  }

  /**
   * Flushes a unigram token to output from our buffer. This happens when we encounter isolated CJK
   * characters, either the whole CJK string is a single character, or we encounter a CJK character
   * surrounded by space, punctuation, english, etc, but not beside any other CJK.
   */
  private void flushUnigram() {
    clearAttributes();
    char[] termBuffer = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
    int len = Character.toChars(buffer[index], termBuffer, 0);
    termAtt.setLength(len);
    offsetAtt.setOffset(startOffset[index], endOffset[index]);
    typeAtt.setType(SINGLE_TYPE);
    index++;
  }

+  /* */
+  private boolean hasBufferedTrigram() {
+    return bufferLen - index > 2;
+  }

  /** True if we have multiple codepoints sitting in our buffer */
  private boolean hasBufferedBigram() {
    return bufferLen - index > 1;
  }

  /**
   * True if we have a single codepoint sitting in our buffer, where its future (whether it is
   * emitted as unigram or forms a bigram) depends upon not-yet-seen inputs.
   */
  private boolean hasBufferedUnigram() {
    if (outputUnigrams) {
      // when outputting unigrams always
      return bufferLen - index == 1;
    } else {
      // otherwise it's only when we have a lone CJK character
      return bufferLen == 1 && index == 0;
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    bufferLen = 0;
    index = 0;
    lastEndOffset = 0;
    loneState = null;
    exhausted = false;
    ngramState = false;
  }
}
rmuir commented 2 years ago

If you want to do things like trigrams, just use n-gram tokenizer instead...