# Issue status: Open. Opened by TeorinKim 2 years ago.
# Environment setup: install (when missing) and attach the packages used by
# the analysis below.

# Only install a package when it is not already available, so re-running the
# script does not reinstall everything each time.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
if (!requireNamespace("multilinguer", quietly = TRUE)) {
  remotes::install_github("mrchypark/multilinguer")
}
library(multilinguer)

# Run the Java installer once; a second back-to-back call is redundant.
install_java()

# NLP4kec is installed from a local archive (repos = NULL), not a repository.
if (!requireNamespace("NLP4kec", quietly = TRUE)) {
  install.packages("d:/Download/NLP4kec_1.4.0.zip", repos = NULL)
}
library(NLP4kec)
library(rJava)
library(haven)

if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)
# Load the stop-word table; the `stopword` column is used later when
# cleaning the corpus.
불용어 <- read.csv(file = "d:/ML/stopwords.csv", header = TRUE, sep = ",")

# Parse the source document with the NLP4kec parser (language = "ko").
result <- file_parser_r(path = "d:/ML/헌법.xlsx", language = "ko")

# NOTE(review): as written, the pattern and the replacement are both a single
# space, so this call leaves the text unchanged. Whitespace may have been lost
# when the script was pasted -- confirm the intended pattern/replacement.
결과 <- gsub(" ", " ", result)

# The parsed text is passed on as the pre-processed input for the corpus.
전처리 <- 결과
# Build a volatile corpus from the pre-processed text and clean it step by
# step. Each statement sits on its own line; R cannot parse several
# statements on one line without a separator.
말뭉치 <- VCorpus(VectorSource(전처리))
말뭉치 <- tm_map(말뭉치, removePunctuation)
말뭉치 <- tm_map(말뭉치, removeNumbers)
# Drop every term listed in the `stopword` column of the stop-word table.
말뭉치 <- tm_map(말뭉치, removeWords, 불용어$stopword)
말뭉치 <- tm_map(말뭉치, PlainTextDocument)
# Document-term and term-document matrices of the cleaned corpus.
# The control option must be spelled `wordLengths`; a misspelled name is
# silently ignored, so the minimum length of 2 would never take effect.
DTM <- DocumentTermMatrix(말뭉치, control = list(wordLengths = c(2, Inf)))
# TDM is the transposed layout (terms as rows), hence TermDocumentMatrix.
TDM <- TermDocumentMatrix(말뭉치, control = list(wordLengths = c(2, Inf)))

# Strip surrounding whitespace from the term names, then keep only the terms
# that are longer than one character.
colnames(DTM) <- trimws(colnames(DTM))
DTM <- DTM[, nchar(colnames(DTM)) > 1]
DTM
# Term frequencies taken from the first row (document) of the DTM.
# NOTE(review): only row 1 is used here; if the totals over all documents are
# wanted, this should be colSums(as.matrix(DTM)) -- confirm the intent.
빈도 <- colSums(as.matrix(DTM[1, ]))

# Show the 32 most frequent terms, highest count first.
빈도[head(order(-빈도), 32)]
library(ggplot2)
library(dplyr)

# One row per term together with its frequency.
단어 <- data.frame(단어 = names(빈도), freq = 빈도)

# Bar chart of the 20 most frequent terms, bars ordered by descending
# frequency. Both aesthetics are mapped inside aes() to columns of the plotted
# data frame, so x and y always have the same 20 rows.
ggplot(
  head(arrange(단어, -freq), 20),
  aes(x = reorder(단어, -freq), y = freq)
) +
  geom_bar(stat = "identity")
# Environment setup: install (when missing) and attach the packages used by
# the analysis below.
# NOTE(review): this section repeats the setup steps found earlier in this
# file; the guards below make the repeated installs a no-op.

# Only install a package when it is not already available, so re-running the
# script does not reinstall everything each time.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
if (!requireNamespace("multilinguer", quietly = TRUE)) {
  remotes::install_github("mrchypark/multilinguer")
}
library(multilinguer)

# Run the Java installer once; a second back-to-back call is redundant.
install_java()

# NLP4kec is installed from a local archive (repos = NULL), not a repository.
if (!requireNamespace("NLP4kec", quietly = TRUE)) {
  install.packages("d:/Download/NLP4kec_1.4.0.zip", repos = NULL)
}
library(NLP4kec)
library(rJava)
library(haven)

if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)
# Load the stop-word table; the `stopword` column is used later when
# cleaning the corpus.
불용어 <- read.csv(file = "d:/ML/stopwords.csv", header = TRUE, sep = ",")

# Parse the source document with the NLP4kec parser (language = "ko").
result <- file_parser_r(path = "d:/ML/헌법.xlsx", language = "ko")

# NOTE(review): as written, the pattern and the replacement are both a single
# space, so this call leaves the text unchanged. Whitespace may have been lost
# when the script was pasted -- confirm the intended pattern/replacement.
결과 <- gsub(" ", " ", result)

# The parsed text is passed on as the pre-processed input for the corpus.
전처리 <- 결과
# Build a volatile corpus from the pre-processed text and clean it step by
# step. Each statement sits on its own line; R cannot parse several
# statements on one line without a separator.
말뭉치 <- VCorpus(VectorSource(전처리))
말뭉치 <- tm_map(말뭉치, removePunctuation)
말뭉치 <- tm_map(말뭉치, removeNumbers)
# Drop every term listed in the `stopword` column of the stop-word table.
말뭉치 <- tm_map(말뭉치, removeWords, 불용어$stopword)
말뭉치 <- tm_map(말뭉치, PlainTextDocument)
# Document-term and term-document matrices of the cleaned corpus.
# The control option must be spelled `wordLengths`; a misspelled name is
# silently ignored, so the minimum length of 2 would never take effect.
DTM <- DocumentTermMatrix(말뭉치, control = list(wordLengths = c(2, Inf)))
# TDM is the transposed layout (terms as rows), hence TermDocumentMatrix.
TDM <- TermDocumentMatrix(말뭉치, control = list(wordLengths = c(2, Inf)))

# Strip surrounding whitespace from the term names, then keep only the terms
# that are longer than one character.
colnames(DTM) <- trimws(colnames(DTM))
DTM <- DTM[, nchar(colnames(DTM)) > 1]
DTM
# Term frequencies taken from the first row (document) of the DTM.
# NOTE(review): only row 1 is used here; if the totals over all documents are
# wanted, this should be colSums(as.matrix(DTM)) -- confirm the intent.
빈도 <- colSums(as.matrix(DTM[1, ]))

# Show the 32 most frequent terms, highest count first.
빈도[head(order(-빈도), 32)]
library(ggplot2)
library(dplyr)

# One row per term together with its frequency.
단어 <- data.frame(단어 = names(빈도), freq = 빈도)

# Bar chart of the 20 most frequent terms, bars ordered by descending
# frequency. Both aesthetics are mapped inside aes() to columns of the plotted
# data frame, so x and y always have the same 20 rows.
ggplot(
  head(arrange(단어, -freq), 20),
  aes(x = reorder(단어, -freq), y = freq)
) +
  geom_bar(stat = "identity")