Open · e-hu opened this issue 6 years ago
@e-hu Thank you for the feedback. Unfortunately, I don't have much time to work on this issue. Are you willing to send a PR?
This should work:

```scala
package org.apache.spark.ml.feature

import scala.collection.JavaConverters._

import org.atilika.kuromoji.{Token => KToken, Tokenizer => KTokenizer}

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

/**
 * Kuromoji is an open source Japanese morphological analyzer written in Java.
 */
@Experimental
class KuromojiTokenizer(override val uid: String)
  extends Transformer with DefaultParamsWritable with KuromojiTokenizerParams {

  // Sets the default values.
  setDefault(
    mode -> "NORMAL",
    dictPath -> KuromojiTokenizer.DICT_PATH_NULL_VALUE
  )

  def this() = this(Identifiable.randomUID("kuromojitok"))

  /** @group setParam */
  def setInputCol(value: String): this.type = set(inputCol, value)

  /** @group setParam */
  def setOutputCol(value: String): this.type = set(outputCol, value)

  /** @group expertSetParam */
  def setMode(value: String): this.type = set(mode, value)

  /** @group expertSetParam */
  def setDictPath(value: String): this.type = set(dictPath, value)

  override def copy(extra: ParamMap): KuromojiTokenizer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), ArrayType(StringType, true), nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    val tokenUDF = udf { text: String =>
      val modeClass = KuromojiTokenizer.addressMode($(mode))
      $(dictPath) match {
        case KuromojiTokenizer.DICT_PATH_NULL_VALUE =>
          CustomKuromojiTokenizer.tokenize(text, modeClass).map(_.getSurfaceForm)
        case _ =>
          CustomKuromojiTokenizer.tokenize(text, modeClass, $(dictPath)).map(_.getSurfaceForm)
      }
    }
    transformSchema(dataset.schema, logging = true)
    dataset.withColumn($(outputCol), tokenUDF(dataset($(inputCol))))
  }

  /*
  // Alternative for a UnaryTransformer-based implementation:
  protected def createTransformFunc: String => Seq[String] = {
    val modeClass = KuromojiTokenizer.addressMode($(mode))
    $(dictPath) match {
      case KuromojiTokenizer.DICT_PATH_NULL_VALUE =>
        CustomKuromojiTokenizer.tokenize(_, modeClass).map(_.getSurfaceForm)
      case _ =>
        CustomKuromojiTokenizer.tokenize(_, modeClass, $(dictPath)).map(_.getSurfaceForm)
    }
  }
  */

  protected def outputDataType: DataType = ArrayType(StringType, true)

  protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }
}

/**
 * A Spark object to deal with the Kuromoji tokenizer.
 */
object KuromojiTokenizer extends DefaultParamsReadable[KuromojiTokenizer] {

  private[feature] val DICT_PATH_NULL_VALUE = "NULL_VALUE"

  override def load(path: String): KuromojiTokenizer = super.load(path)

  private[feature] def addressMode(mode: String): KTokenizer.Mode = {
    mode match {
      case "NORMAL" => KTokenizer.Mode.NORMAL
      case "SEARCH" => KTokenizer.Mode.SEARCH
      case "EXTENDED" => KTokenizer.Mode.EXTENDED
      case _ => throw new IllegalArgumentException(
        s"$mode is invalid. You should go with NORMAL, SEARCH or EXTENDED.")
    }
  }
}

/**
 * Parameter trait for [[KuromojiTokenizer]].
 */
private[feature]
trait KuromojiTokenizerParams extends Params with HasInputCol with HasOutputCol {

  /**
   * Tokenization mode: "NORMAL", "SEARCH" or "EXTENDED".
   * @group expertParam
   */
  final val mode: Param[String] = new Param[String](this, "mode",
    "tokenization mode: NORMAL, SEARCH or EXTENDED")

  /** @group expertGetParam */
  def getMode: String = $(mode)

  /**
   * Path to a Kuromoji user dictionary.
   * @group expertParam
   */
  final val dictPath: Param[String] = new Param[String](this, "dictPath",
    "path to a Kuromoji user dictionary")

  /** @group expertGetParam */
  def getDictionaryPath: String = $(dictPath)
}

/**
 * TODO: We can make this more efficient, because this version builds a
 * tokenizer on each call (see the caching sketch below).
 */
private[feature] object CustomKuromojiTokenizer {

  def tokenize(text: String, mode: KTokenizer.Mode): Seq[KToken] = {
    val tokenizer = KTokenizer.builder().mode(mode).build()
    // tokenizer.tokenize(text).asScala.dropWhile(_.getSurfaceForm == " ").toSeq
    tokenizer.tokenize(text).asScala.toSeq.filter(_.getSurfaceForm.trim().nonEmpty)
  }

  def tokenize(text: String, mode: KTokenizer.Mode, dictPath: String): Seq[KToken] = {
    val tokenizer = KTokenizer.builder().mode(mode).userDictionary(dictPath).build()
    // tokenizer.tokenize(text).asScala.toSeq
    tokenizer.tokenize(text).asScala.toSeq.filter(_.getSurfaceForm.trim().nonEmpty)
  }
}
```
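For anyone trying it out, here is a minimal usage sketch. It assumes the file above is compiled into your project, `spark` is an existing `SparkSession`, and the kuromoji jar is on the classpath; the column names and the sample sentence are made up:

```scala
import org.apache.spark.ml.feature.KuromojiTokenizer

// Hypothetical input: one Japanese sentence per row.
val df = spark.createDataFrame(Seq(
  (0L, "すもももももももものうち")
)).toDF("id", "text")

val tokenizer = new KuromojiTokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")
  .setMode("NORMAL")

tokenizer.transform(df).select("tokens").show(truncate = false)
```

On the TODO about building a tokenizer on each call: one possible approach is to cache one instance per (mode, user dictionary) per JVM. This is only a sketch, under the assumption that Kuromoji's `Tokenizer` can be shared across threads; if it cannot, wrap it in a `ThreadLocal` instead:

```scala
import scala.collection.concurrent.TrieMap

import org.atilika.kuromoji.{Tokenizer => KTokenizer}

private[feature] object CachedKuromojiTokenizer {
  // One tokenizer per (mode, user dictionary) per JVM; TrieMap is thread-safe.
  private val cache = TrieMap.empty[(KTokenizer.Mode, Option[String]), KTokenizer]

  def get(mode: KTokenizer.Mode, dictPath: Option[String]): KTokenizer =
    cache.getOrElseUpdate((mode, dictPath), {
      val builder = KTokenizer.builder().mode(mode)
      dictPath.foreach(p => builder.userDictionary(p))
      builder.build()
    })
}
```

`CustomKuromojiTokenizer.tokenize` could then call `CachedKuromojiTokenizer.get(...)` instead of rebuilding the tokenizer every time.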
This is not supported on Spark 2.2 when creating the Tokenizer object.