Closed by asfimport 18 years ago.
Che Dong (migrated from JIRA)
Created an attachment (id=8418) CJKTokenizer
Che Dong (migrated from JIRA)
Created an attachment (id=8419) CJKAnalyzer: needs to remove empty tokens created by CJKTokenizer
Otis Gospodnetic (migrated from JIRA)
Thank you for the contribution, Che. I have finally added your 2 CJK classes to Lucene's Sandbox. I used the attached versions of your classes, not the inlined ones.
/* ==================================================================== */
package org.apache.lucene.analysis.cjk;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

import java.io.Reader;

/**
 * @author Che, Dong
 */
public final class CJKTokenizer extends Tokenizer {
    //~ Static fields/initializers ---------------------------------------------
    /** Max word length */
    private static final int MAX_WORD_LEN = 255;
    /** buffer size */
    private static final int IO_BUFFER_SIZE = 256;

    //~ Instance fields --------------------------------------------------------
    /** word offset, used to imply which character(in ) is parsed */
    private int offset = 0;
    /** the index used only for ioBuffer */
    private int bufferIndex = 0;
    /** data length */
    private int dataLen = 0;
    /** character buffer for the returned Token */
    private final char[] buffer = new char[MAX_WORD_LEN];
    /** I/O buffer, used to store the content of the input (one of the members of Tokenizer) */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    /** word type: single=>ASCII  double=>non-ASCII  word=>default */
    private String tokenType = "word";
    /**
     * "C1C2C3C4" ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
     * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
     */
    private boolean preIsTokened = false;

    //~ Constructors -----------------------------------------------------------
    /**
     * @param in I/O reader
     */
    public CJKTokenizer(Reader in) {
        input = in;
    }

    //~ Methods ----------------------------------------------------------------
    /**
     * @return Token
     * @throws java.io.IOException when a read error occurs
     * @see "http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html" for detail
     */
    public final Token next() throws java.io.IOException {
        /** how many character(s) has been stored in buffer */
        int length = 0;

        // ... (only fragments of the method body survive in this inlined copy) ...
                    ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)) {
                if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                    /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
                    int i = (int) c;
                    i = i - 65248;
                    c = (char) i;
                }
        // ...
                // ASCII letter
                start = offset - 1;
            } else if (tokenType == "double") {
                // "javaC1C2C3C4linux"
                //      ^--: the previous non-ASCII
                //      : the current character
                offset--;
                bufferIndex--;
                tokenType = "single";
        // ...
    }
}
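For readers unfamiliar with the HALFWIDTH_AND_FULLWIDTH_FORMS handling in the fragment above: fullwidth Latin characters (U+FF01 through U+FF5E) sit at a fixed offset of 65248 (0xFEE0) above their ASCII counterparts, so subtracting that constant folds them onto BASIC_LATIN. The following is a minimal standalone sketch of that folding, not part of the patch; the class and method names are made up for illustration only.

// Standalone illustration of the fullwidth-to-ASCII folding used inside CJKTokenizer.next().
// Hypothetical demo class; only the 65248 offset and the Unicode block check come from the patch.
public class FullwidthFoldingDemo {
    static char foldFullwidth(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
            // U+FF01..U+FF5E are exactly 65248 (0xFEE0) above U+0021..U+007E,
            // so the subtraction maps a fullwidth letter/digit to plain ASCII.
            return (char) (c - 65248);
        }
        return c;
    }

    public static void main(String[] args) {
        String fullwidth = "Ｌｕｃｅｎｅ１２３";
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < fullwidth.length(); i++) {
            sb.append(foldFullwidth(fullwidth.charAt(i)));
        }
        System.out.println(sb.toString()); // prints: Lucene123
    }
}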
/* ==================================================================== */
package org.apache.lucene.analysis.cjk;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

import java.io.Reader;
import java.util.Hashtable;

/**
 * @author Che, Dong
 */
public class CJKAnalyzer extends Analyzer {
    //~ Static fields/initializers ---------------------------------------------
    /**
     * An array containing some common English words that are not usually
     * useful for searching, and some double-byte interpunctions.
     */
    private static String[] stopWords = {
        "a", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "in", "into", "is", "it", "no", "not", "of", "on", "or", "s",
        "such", "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with", "", "www"
    };

    //~ Instance fields --------------------------------------------------------
    /** stop word list */
    private Hashtable stopTable;

    //~ Constructors -----------------------------------------------------------
    /** Builds an analyzer which removes words in STOP_WORDS. */
    public CJKAnalyzer() {
        stopTable = StopFilter.makeStopTable(stopWords);
    }

    /** @param stopWords stop word array */
    public CJKAnalyzer(String[] stopWords) {
        stopTable = StopFilter.makeStopTable(stopWords);
    }

    //~ Methods ----------------------------------------------------------------
    /**
     * @param fieldName lucene field name
     * @param reader    input reader
     * @return TokenStream
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(new CJKTokenizer(reader), stopTable);
    }
}
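A quick way to see the overlapping-bigram behavior described in the tokenizer's preIsTokened comment is to run some mixed text through the analyzer and print the tokens. This is a rough sketch against the Lucene 1.x-era API that the patch itself uses (TokenStream.next() returning Token); Token.termText() is assumed from that API generation, and the field name and sample text are arbitrary.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import java.io.StringReader;

// Hypothetical demo class, not part of the contributed patch.
public class CJKAnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // Mixed ASCII and CJK input: ASCII runs come out as single tokens,
        // CJK runs come out as overlapping bigrams (C1C2, C2C3, C3C4, ...).
        String text = "java 数据库 linux";
        TokenStream ts = new CJKAnalyzer().tokenStream("contents", new StringReader(text));
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println(t.termText());
        }
        // expected output (roughly): java, 数据, 据库, linux
    }
}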
Migrated from LUCENE-139 by Che Dong, resolved May 27 2006
Attachments: ASF.LICENSE.NOT.GRANTED--CJKAnalyzer.java, ASF.LICENSE.NOT.GRANTED--CJKTokenizer.java