Spark + Scala + Komoran

한국어 형태소 분석기 Komoran : https://docs.komoran.kr/index.html

정형화되지 않은 커뮤니티 글의 특성상 형태소 분석기가 기대한 만큼의 성능을 내지 못함. 일단은 다음과 같은 방법으로 활용:

형태소 분석을 사용해 명사 (nouns) 만 추출
추출된 명사가 없는 경우 단순 tokenizer (스페이스 구분) 사용

Todo

[x] 1단계 - 데이터 파싱 + 단어 빈도수 구하기
[x] 2단계 - tf-idf
[x] 3단계 - 숨은 의미 분석
[x] 각 갤러리 결과 하나의 dataframe으로 union

Code

package com.sw.test

import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.{CountVectorizer, IDF, RegexTokenizer}
import org.apache.spark.ml.linalg.SparseVector
import kr.co.shineware.nlp.komoran.constant.DEFAULT_MODEL
import kr.co.shineware.nlp.komoran.core.Komoran
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.expressions.UserDefinedFunction
import scala.collection.JavaConverters._
import java.io.File

import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.{Vectors, Vector => MLLibVector}
import org.apache.spark.ml.linalg.{Vector => MLVector}

/**
 * @author 민세원
 */

// Spark application that processes every file found directly under `outputPath`.
//
// Per file it: reads the file as a headered CSV, tags each row with a `gall_id`
// derived from the file name, turns the `title` column into a list of terms
// (Komoran nouns, falling back to space-split tokens when no noun is found),
// fits a CountVectorizer + IDF on those terms, and collects vocabulary words
// selected from the resulting TF-IDF vectors.
object ProjectApp {

  // Directory scanned (non-recursively) for input files.
  val outputPath: String = "/home/data/output/20210702"

  // Passed to CountVectorizer.setVocabSize: upper bound on the vocabulary built per file.
  val numTerms: Int = 10

  // FIXME : LIGHT vs FULL
  val komoran: Komoran = new Komoran(DEFAULT_MODEL.LIGHT)

  // UDF mapping a sentence to the nouns Komoran reports for it.
  val getNounsUdf: UserDefinedFunction =
    udf[Seq[String], String] { sentence => komoran.analyze(sentence).getNouns.asScala }

  // Splits the `title` column on single spaces into a `tokens` column.
  val tokenizer: RegexTokenizer =
    new RegexTokenizer().setInputCol("title").setOutputCol("tokens").setPattern("[ ]")

  /**
   * Runs the per-file pipeline for every regular file under [[outputPath]].
   *
   * @param args command-line arguments (not read)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Project Application").getOrCreate()
    // Provides the implicit Encoder[String] required by the typed flatMap below.
    import spark.implicits._

    val outputDir = new File(outputPath)
    // listFiles returns null when the path is missing, is not a directory, or an
    // I/O error occurs; Option(...) turns all of those into "no files".
    val fileList: List[File] =
      Option(outputDir.listFiles).fold(List.empty[File])(_.filter(_.isFile).toList)

    for (f <- fileList) {
      // File name without a trailing ".csv". A literal suffix strip is used because
      // String.split takes a regex, in which "." would match any character.
      val gallId = f.getName.stripSuffix(".csv")

      val df = spark.read
        .options(Map("header" -> "true", "inferSchema" -> "true"))
        .csv(f.getPath)
        .withColumn("gall_id", lit(gallId))

      // terms = nouns when at least one noun was extracted, otherwise the plain tokens.
      val outputDF = tokenizer.transform(df)
        .withColumn("nouns", getNounsUdf(col("title")))
        .select(
          col("gall_id"),
          col("title"),
          when(size(col("nouns")) > 0, col("nouns")).otherwise(col("tokens")).as("terms"))
        // Drop rows with no terms; "terms" is the only array column left after the select.
        .filter(size(col("terms")) > 0)

      //TF
      val model = new CountVectorizer()
        .setInputCol("terms")
        .setOutputCol("termsFreqs")
        .setVocabSize(numTerms)
        .fit(outputDF)
      // Vocabulary indexed by the vector position used in termsFreqs / tfidfVec.
      val termIds: Array[String] = model.vocabulary

      val docTermsFreqs = model.transform(outputDF)

      // Cached because it is consumed twice: by IDF.fit and by idfModel.transform.
      docTermsFreqs.cache()
      try {
        //IDF
        val idfModel = new IDF().setInputCol("termsFreqs").setOutputCol("tfidfVec").fit(docTermsFreqs)
        val docTermMatrix = idfModel.transform(docTermsFreqs)

        val docRdd = docTermMatrix.select("tfidfVec")

        // NOTE(review): the predicate keeps vocabulary terms whose TF-IDF weight is
        // exactly 0.0 in a document. For "best words" this looks inverted
        // (`!= 0.0` / `> 0.0` may be intended) — confirm before relying on the result.
        val bestwords = docRdd.flatMap { row =>
          row.getAs[MLVector](0).toArray.zipWithIndex
            .filter { case (weight, _) => weight == 0.0 }
            .map { case (_, termIdx) => termIds(termIdx) }
            .toSeq
        }
        bestwords.select(collect_list("value")).first().getList[String](0) // => {gall_id, words={}}
        // TODO: union
      } finally {
        // Release the cached data before moving on to the next file.
        docTermsFreqs.unpersist()
      }
    }

//    // row ID - doc TITLE
//    //val docIds = docTermsFreqs.rdd.map(_.getString(0)).zipWithUniqueId().map(_.swap).collect().toMap
//    
//    // SVD
//    val vecRdd = docTermMatrix.select("tfidfVec").rdd.map{ row => Vectors.fromML(row.getAs[MLVector]("tfidfVec"))}
//    vecRdd.cache()
//    val mat = new RowMatrix(vecRdd)
//    val k = 1000
//    val svd = mat.computeSVD(k, computeU=true)

  }
}

strange-study / ss-spark

[Project] Scala + Spark 기반 분석 #50

Spark + Scala + Komoran

Todo

Code

정리

시각화

ver.0 (Draft)

ver.1