minSW opened this issue 3 years ago
Korean morphological analyzer Komoran: https://docs.komoran.kr/index.html

Since community posts are largely unstructured, the morphological analyzer does not perform as well as expected. For now it is used as follows.
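For reference, a minimal standalone sketch of the Komoran calls used in the job below. The sample sentence and the printed output are made up; actual results depend on the model chosen (`LIGHT` here, matching the code).

```scala
import scala.collection.JavaConverters._

import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL
import kr.co.shineware.nlp.komoran.core.Komoran

object KomoranSmokeTest {
  def main(args: Array[String]): Unit = {
    val komoran = new Komoran(DEFAULT_MODEL.LIGHT)
    // analyze returns a KomoranResult; getNouns yields a java.util.List[String],
    // converted with asScala for use from Scala code.
    val nouns = komoran.analyze("오늘 서울 날씨가 좋다").getNouns.asScala
    println(nouns) // e.g. Buffer(오늘, 서울, 날씨) — illustrative, depends on the model
  }
}
```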
```scala
package com.sw.test

import java.io.File

import scala.collection.JavaConverters._

import org.apache.spark.ml.feature.{CountVectorizer, IDF, RegexTokenizer}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.ml.linalg.{Vector => MLVector}
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Vectors, Vector => MLLibVector}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Dataset, SparkSession}

import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL
import kr.co.shineware.nlp.komoran.core.Komoran

/**
 * @author 민세원
 */
object ProjectApp {
  val outputPath: String = "/home/data/output/20210702"
  val numTerms = 10

  // FIXME: LIGHT vs FULL
  val komoran = new Komoran(DEFAULT_MODEL.LIGHT)

  // Extract the nouns of a title with Komoran.
  val getNounsUdf: UserDefinedFunction = udf[Seq[String], String] { sentence =>
    komoran.analyze(sentence).getNouns.asScala
  }

  // Plain whitespace tokenizer, used as a fallback when Komoran finds no nouns.
  val tokenizer: RegexTokenizer = new RegexTokenizer()
    .setInputCol("title")
    .setOutputCol("tokens")
    .setPattern("[ ]")

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Project Application").getOrCreate()
    import spark.implicits._ // encoder for the flatMap below

    val outputDir = new File(outputPath)
    val fileList: List[File] =
      if (outputDir.exists && outputDir.isDirectory) outputDir.listFiles.filter(_.isFile).toList
      else List[File]()

    for (f <- fileList) {
      // One CSV per gallery; gall_id is the file name minus ".csv"
      // (stripSuffix avoids the regex pitfalls of split(".csv")).
      val df = spark.read
        .options(Map("header" -> "true", "inferSchema" -> "true"))
        .csv(f.getPath)
        .withColumn("gall_id", lit(f.getName.stripSuffix(".csv")))

      // Prefer Komoran nouns; fall back to whitespace tokens when none are found.
      val outputDF = tokenizer.transform(df)
        .withColumn("nouns", getNounsUdf(col("title")))
        .select(
          col("gall_id"),
          col("title"),
          when(size(col("nouns")) > 0, col("nouns")).otherwise(col("tokens")).as("terms"))
        .filter(size(col("terms")) > 0) // fixed: originally filtered on a non-existent "output" column

      // TF: keep only the numTerms most frequent terms in the vocabulary
      val model = new CountVectorizer()
        .setInputCol("terms")
        .setOutputCol("termsFreqs")
        .setVocabSize(numTerms)
        .fit(outputDF)
      val termIds: Array[String] = model.vocabulary
      val docTermsFreqs = model.transform(outputDF)
      docTermsFreqs.cache()

      // IDF
      val idfModel = new IDF().setInputCol("termsFreqs").setOutputCol("tfidfVec").fit(docTermsFreqs)
      val docTermMatrix = idfModel.transform(docTermsFreqs)

      // Collect the terms that carry a non-zero TF-IDF weight.
      val docRdd = docTermMatrix.select("tfidfVec")
      val bestwords = docRdd.flatMap { ele =>
        val ids = ele.getAs[SparseVector](0).toDense.toArray.zipWithIndex.map(_.swap)
        ids.filter(_._2 > 0.0).map(v => termIds(v._1)) // fixed: originally == 0.0, which kept only zero-weight terms
      }
      bestwords.select(collect_list("value")).first().getList[String](0)
      // => {gall_id, words={}}
      // TODO: union
    }

    // // row ID - doc TITLE
    // // val docIds = docTermsFreqs.rdd.map(_.getString(0)).zipWithUniqueId().map(_.swap).collect().toMap

    // // SVD
    // val vecRdd = docTermMatrix.select("tfidfVec").rdd
    //   .map { row => Vectors.fromML(row.getAs[MLVector]("tfidfVec")) }
    // vecRdd.cache()
    // val mat = new RowMatrix(vecRdd)
    // val k = 1000
    // val svd = mat.computeSVD(k, computeU = true)
  }
}
```
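For the `// TODO: union` step, one plausible completion is to fold the per-gallery word lists into a single DataFrame. This is a minimal self-contained sketch, not the original code: the `UnionSketch` object, the sample data, and the commented output path are all illustrative.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

object UnionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Union Sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // Stand-ins for the per-gallery results produced inside the loop above.
    val perGall: Seq[(String, Seq[String])] = Seq(
      ("baseball", Seq("경기", "선수", "감독")),
      ("game", Seq("업데이트", "이벤트", "버그")))

    // One single-row DataFrame per gallery, folded together with union.
    val unioned: DataFrame = perGall
      .map { case (id, words) => Seq((id, words)).toDF("gall_id", "words") }
      .reduce(_ union _)

    unioned.show(truncate = false)
    // unioned.write.json("/home/data/output/wordcloud") // hypothetical sink for the frontend
  }
}
```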
[Screenshots: word clouds rendered with react-wordcloud, and with Nivo + react-wordcloud (1st, 2nd, and 3rd attempts)]
Spark + Scala + Komoran