Closed by asfimport 18 years ago.
Che Dong (migrated from JIRA)
Created an attachment (id=8418) CJKTokenizer
Che Dong (migrated from JIRA)
Created an attachment (id=8419) CJKAnalyzer: needs to remove empty tokens created by CJKTokenizer
Otis Gospodnetic (migrated from JIRA)
Thank you for the contribution, Che. I have finally added your 2 CJK classes to Lucene's Sandbox. I used the attached versions of your classes, not the inlined ones.
/* ==================================================================== */
package org.apache.lucene.analysis.cjk;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import java.io.Reader;

/**
 * @author Che, Dong
 */
public final class CJKTokenizer extends Tokenizer {
    //~ Static fields/initializers ---------------------------------------------
    /** Max word length */
    private static final int MAX_WORD_LEN = 255;
    /** buffer size */
    private static final int IO_BUFFER_SIZE = 256;

    //~ Instance fields --------------------------------------------------------
    /** word offset, used to imply which character(in ) is parsed */
    private int offset = 0;
    /** the index used only for ioBuffer */
    private int bufferIndex = 0;
    /** data length */
    private int dataLen = 0;
    /** character buffer for the returned Token */
    private final char[] buffer = new char[MAX_WORD_LEN];
    /** I/O buffer, used to store the content of the input (one of the members of Tokenizer) */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    /** word type: single=>ASCII  double=>non-ASCII  word=>default */
    private String tokenType = "word";
    /**
     * "C1C2C3C4" ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
     * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
     */
    private boolean preIsTokened = false;

    //~ Constructors -----------------------------------------------------------
    /**
     * @param in I/O reader
     */
    public CJKTokenizer(Reader in) {
        input = in;
    }

    //~ Methods ----------------------------------------------------------------
    /**
     * @return Token
     * @throws java.io.IOException when a read error occurs
     * @see "http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html" for detail
     */
    public final Token next() throws java.io.IOException {
        /** how many character(s) has been stored in buffer */
        int length = 0;

        // ... (only fragments of the method body survive in this inlined copy) ...
                    ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                    /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
                    int i = (int) c;
                    i = i - 65248;
                    c = (char) i;
                }
        // ...
                // ASCII letter
                start = offset - 1;
            } else if (tokenType == "double") {
                // "javaC1C2C3C4linux"
                //      ^--: the previous non-ASCII
                //      : the current character
                offset--;
                bufferIndex--;
                tokenType = "single";
        // ...
    }
}
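For readers unfamiliar with the HALFWIDTH_AND_FULLWIDTH_FORMS handling in the fragment above: fullwidth Latin characters (U+FF01 through U+FF5E) sit at a fixed offset of 65248 (0xFEE0) above their ASCII counterparts, so subtracting that constant folds them onto BASIC_LATIN. The following is a minimal standalone sketch of that folding, not part of the patch; the class and method names are made up for illustration only.

// Standalone illustration of the fullwidth-to-ASCII folding used inside CJKTokenizer.next().
// Hypothetical demo class; only the 65248 offset and the Unicode block check come from the patch.
public class FullwidthFoldingDemo {
    static char foldFullwidth(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            // U+FF01..U+FF5E are exactly 65248 (0xFEE0) above U+0021..U+007E,
            // so the subtraction maps a fullwidth letter/digit to plain ASCII.
            return (char) (c - 65248);
        }
        return c;
    }

    public static void main(String[] args) {
        String fullwidth = "Ｌｕｃｅｎｅ１２３";
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < fullwidth.length(); i++) {
            sb.append(foldFullwidth(fullwidth.charAt(i)));
        }
        System.out.println(sb.toString()); // prints: Lucene123
    }
}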
/* ==================================================================== */
package org.apache.lucene.analysis.cjk;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.Reader;
import java.util.Hashtable;

/**
 * @author Che, Dong
 */
public class CJKAnalyzer extends Analyzer {
    //~ Static fields/initializers ---------------------------------------------
    /**
     * An array containing some common English words that are not usually
     * useful for searching, and some double-byte interpunctions.
     */
    private static String[] stopWords = {
        "a", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "s",
        "such", "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with", "", "www"
    };

    //~ Instance fields --------------------------------------------------------
    /** stop word list */
    private Hashtable stopTable;

    //~ Constructors -----------------------------------------------------------
    /** Builds an analyzer which removes words in STOP_WORDS. */
    public CJKAnalyzer() {
        stopTable = StopFilter.makeStopTable(stopWords);
    }

    /** @param stopWords stop word array */
    public CJKAnalyzer(String[] stopWords) {
        stopTable = StopFilter.makeStopTable(stopWords);
    }

    //~ Methods ----------------------------------------------------------------
    /**
     * @param fieldName lucene field name
     * @param reader    input reader
     * @return TokenStream
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(new CJKTokenizer(reader), stopTable);
    }
}
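A quick way to see the overlapping-bigram behavior described in the tokenizer's preIsTokened comment is to run some mixed text through the analyzer and print the tokens. This is a rough sketch against the Lucene 1.x-era API that the patch itself uses (TokenStream.next() returning Token); Token.termText() is assumed from that API generation, and the field name and sample text are arbitrary.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import java.io.StringReader;

// Hypothetical demo class, not part of the contributed patch.
public class CJKAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // Mixed ASCII and CJK input: ASCII runs come out as single tokens,
        // CJK runs come out as overlapping bigrams (C1C2, C2C3, C3C4, ...).
        String text = "java 数据库 linux";
        TokenStream ts = new CJKAnalyzer().tokenStream("contents", new StringReader(text));
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println(t.termText());
        }
        // expected output (roughly): java, 数据, 据库, linux
    }
}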
Migrated from LUCENE-139 by Che Dong, resolved May 27 2006
Attachments: ASF.LICENSE.NOT.GRANTED--CJKAnalyzer.java, ASF.LICENSE.NOT.GRANTED--CJKTokenizer.java