tpemartin / 110-2-R

The class repo for 110-2 Programming for Data Science

Cleaning Delivery News from July 2021 to May 2022 #19

Open Chious opened 2 years ago

Chious commented 2 years ago

Background

Around November 2020, a new pay scheme for delivery couriers took effect, and because it cut into many couriers' wages it set off large-scale protests. Food delivery has emerged in Taiwan only in the last few years, and the labor regulations around it are still taking shape, so the industry has given rise to a range of social issues, from labor disputes to traffic accidents caused by the delivery system.

So starting around July of last year, I used "外送" (food delivery) as a keyword and collected news items from the web through IFTTT and Google's RSS feed. After the earlier class exercises on extracting features from Chinese paintings, it occurred to me that the same tools might help me find stories in this messy pile of data.
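The collection itself happened outside R (an IFTTT applet watching a Google News RSS feed). As a minimal sketch of what such a feed looks like, assuming Google's public news RSS search endpoint and the xml2 package (both are my additions here, not part of the original pipeline):

library(xml2)

# Keyword-search feed for "外送" (URL-encoded); hl/gl/ceid select the
# Traditional Chinese, Taiwan edition of Google News
feed <- read_xml(paste0(
  "https://news.google.com/rss/search?",
  "q=%E5%A4%96%E9%80%81&hl=zh-TW&gl=TW&ceid=TW:zh-Hant"))

titles <- xml_text(xml_find_all(feed, "//item/title"))
links  <- xml_text(xml_find_all(feed, "//item/link"))
head(titles)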

Data

The raw data are the news items I collected from the web with IFTTT between July of last year and now. Details below:

library(readr)

news1 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news2 <- read_csv("https://www.dropbox.com/s/bp27iqfz2ei8psp/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%281%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news3 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news4 <- read_csv("https://www.dropbox.com/s/7vh72qwbhr05dx8/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%283%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news5 <- read_csv("https://www.dropbox.com/s/tmiwb7583f1bofd/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%284%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news6 <- read_csv("https://www.dropbox.com/s/3qjb90pwaodfqaw/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%285%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)
# Combine the six data frames into one

news <- rbind(news1,news2,news3,news4,news5,news6)
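# Note: news1 and news3 above read the same Dropbox file (identical URL), so
# rbind() keeps those rows twice; if that is unintended,
# news <- dplyr::distinct(news) would drop the duplicates.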
colnames(news) <- c("time","title","website_link","content","from","RSS_link")

#convert "time" to Date class
news$time <- lubridate::mdy_hm(news$time)
news$time <- lubridate::as_date(news$time)

Variables

  1. time: when the article was published
  2. title: article title
  3. website_link: article URL
  4. content: the text surrounding the keyword
  5. from: which scraper collected the item
  6. RSS_link: URL of the RSS feed
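A quick way to sanity-check the combined frame (glimpse and range are just convenient choices here):

dplyr::glimpse(news)             # six columns, named as listed above
range(news$time, na.rm = TRUE)   # should span roughly July 2021 to May 2022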

Planned next steps

  1. Use the link and the title to work out which outlet each news item comes from.
  2. Use notable keywords or time points (e.g., traffic accidents, strikes) to see when these social events were covered, and how different media outlets framed them; a sketch follows below.
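A minimal sketch of item 2, assuming the cleaned news data frame built above (the keyword is illustrative; "車禍" or any other term works the same way):

library(dplyr)
library(ggplot2)
library(stringr)

# Daily count of articles whose scraped context mentions "罷工" (strike)
news %>%
  filter(str_detect(content, "罷工")) %>%
  count(time) %>%
  ggplot(aes(x = time, y = n)) +
  geom_col() +
  labs(x = "date", y = "articles mentioning 罷工")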

References and tools

  1. library(ggplot2)
     - iThome. (n.d.). [Day 28] R語言_ggplot2長條圖的吹毛求疵(百分比累積長條圖加上文字,網路上沒找到相關的寫法,我自己來寫吧). iT 邦幫忙::一起幫忙解決難題,拯救 IT 人的一天. Retrieved June 15, 2022, from https://ithelp.ithome.com.tw/articles/10209002
     - Lin, 林建甫 Jeff. (n.d.). Chapter 5 資料視覺化分析 | R 資料科學與統計. Retrieved June 16, 2022, from https://bookdown.org/jefflinmd38/r4biost/dataviz.html

  2. library(stringr)
     - https://happyjayxin.medium.com/r-%E8%AA%9E%E8%A8%80%E5%AD%97%E4%B8%B2%E8%99%95%E7%90%86-%E7%AD%86%E8%A8%98-98eff071b7d0

  3. Text mining (文字探勘)
     - https://rpubs.com/JJChiou/textmining_1
     - 如何找出文章關鍵詞(7):textrank演算法. (2020, August 25). 一健事. https://ronaldyick.wordpress.com/2020/08/25/textrank/

Chious commented 2 years ago

STEP 1: Data import

library(readr)
library(dplyr)
library(magrittr)

news1 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news2 <- read_csv("https://www.dropbox.com/s/bp27iqfz2ei8psp/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%281%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news3 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news4 <- read_csv("https://www.dropbox.com/s/7vh72qwbhr05dx8/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%283%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news5 <- read_csv("https://www.dropbox.com/s/tmiwb7583f1bofd/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%284%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news6 <- read_csv("https://www.dropbox.com/s/3qjb90pwaodfqaw/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%285%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

Combining the data


# Combine the six data frames into one

news <- rbind(news1,news2,news3,news4,news5,news6)
colnames(news) <- c("time","title","website_link","content","from","RSS_link")

#convert "time" to Date class
news$time <- lubridate::mdy_hm(news$time)
news$time <- lubridate::as_date(news$time)

STEP 2: Identifying the news source

library(stringr)

# 新聞來源 <- data.frame(新聞來源=c("三立新聞","聯合報","醒報","蘋果仁","ATC_Taiwan","奇摩新聞","公民橘報") , 關鍵字=c("ltn","udn","anntw","applealmond","atctwn","yahoo","buzzorange"))
# The title or website_link columns should let us identify the source

# Q: why are there missing values?
# How do we assign a label from these values?

#appledaily <-"蘋果日報"
#nownews  <-"今日新聞"
#walkerland <-"我傳媒"

#ltn <- "自由時報"
#sten <- "三立新聞"
#udn <- "聯合報"
#yahoo <- "奇摩新聞"
#ettoday <- "東森新聞"
#ebc <- "東森新聞"

#chinatime <- "中時新聞"
#ctwant <- "中時新聞"
# ctitv

#ctee <-"工商時報"
#cts <- "華視新聞"
#epochtimes <- "大紀元"
#ftwnews <- "民視新聞"
#businesstoday <- "今周刊"
#gov <- "政府機關"
#pts <- "公視"
#cna <- "中央社"
#tvbs <- "TVBS"
#.org <- "非營利組織"
#civilmedia <- "非營利組織"
#ELSE <- "小報"

# Extract the source domain from each article URL

news[['來源網址']] <- news$website_link |>
  stringr::str_extract("(?<=https://)[^/]+(?=/)")
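# e.g. "https://news.ltn.com.tw/news/life/..." -> "news.ltn.com.tw"
# (everything between "https://" and the next "/"; the URL is illustrative)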

# Google News entries aggregate stories from other sites, so for those rows
# pull the original domain out of the "url=" query parameter instead

for (i in seq_len(nrow(news))){
  if (!is.na(news$來源網址[i]) && news$來源網址[i] == "www.google.com"){
    news$來源網址[i] <- stringr::str_extract(news$website_link[i], "(?<=url=https://)[^/]+(?=/)")
  }
}
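# e.g. "https://www.google.com/url?...&url=https://udn.com/news/story/..."
# -> "udn.com" (illustrative URL, not taken from the data)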

news[is.na(news)] <- '0'
news[['來源標籤']] <- {
  # Ordered pattern -> label lookup: patterns are tried top to bottom and the
  # first match wins, so order matters (".org" is a regex, so the dot matches
  # any character)
  source_patterns <- c(
    "ltn" = "ltn", "sten" = "sten", "appledaily" = "appledaily",
    "nownews" = "nownews", "walkerland" = "walkerland", "udn" = "udn",
    "yahoo" = "yahoo", "ettoday" = "ettoday", "ebc" = "ebc",
    "chinatime" = "chinatime", "ctwant" = "ctwant", "ctitv" = "ctitv",
    "ctee" = "ctee", "cts" = "cts", "epochtimes" = "epochtimes",
    "ftwnews" = "ftwnews", "businesstoday" = "businesstoday", "pts" = "pts",
    ".org" = "org", "gov" = "gov", "cna" = "cna", "tvbs" = "tvbs",
    "civilmedia" = "org"
  )
  vapply(news$來源網址, function(url){
    hit <- which(str_detect(url, names(source_patterns)))
    if (length(hit) == 0) "other" else source_patterns[[hit[1]]]
  }, character(1), USE.NAMES = FALSE)
}

news$來源標籤 <- factor(news$來源標籤)

# Map domain labels to outlet display names
news['媒體'] <- recode_factor(news$來源標籤, appledaily="蘋果日報", ebc="東森新聞", pts="公視", businesstoday="今周刊",
                            epochtimes="大紀元", tvbs="TVBS", chinatime="中時新聞", ettoday="東森", udn="聯合報",
                            cna="中央社", gov="政府網站", walkerland="窩客島", ctee="工商時報", ltn="自由時報",
                            yahoo="奇摩新聞", ctitv="中天新聞", nownews="今日新聞", cts="華視", org="非營利組織",
                            ctwant="周刊王", other="其他")

# Collapse related outlets into broader media groups
news['媒體'] <- recode_factor(news$媒體, 蘋果日報="蘋果日報", 周刊王="中時新聞", 中天新聞="中時新聞",
  中時新聞="中時新聞", 工商時報="中時新聞", 東森新聞="東森新聞", 東森="東森新聞", 今周刊="今周刊", 大紀元="大紀元",
  TVBS="TVBS", 聯合報="聯合報", 中央社="公部門", 政府網站="公部門", 公視="公部門", 自由時報="自由時報",
  奇摩新聞="奇摩新聞", 今日新聞="今日新聞", 華視="華視", 窩客島="其他", 其他="其他")
# Count articles over time, stacked by media group

library(ggplot2)

ggplot(news, aes(x = time, fill = 媒體)) +
  geom_bar(position = "stack")

# Outlet share as a pie chart (a stacked bar in polar coordinates)
媒體.tab <- table(news$媒體)
媒體.freq <- data.frame(媒體.tab)
names(媒體.freq)[1] <- "媒體"
ggplot(data = 媒體.freq, aes(x = "", y = Freq, fill = 媒體)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0)

STEP 3: Data visualization

Word clouds (textrank)

# Text mining
library(dplyr)
library(lubridate)
library(stringr)
library(jiebaR)
library(wordcloud)   # static word clouds
library(wordcloud2)  # interactive word clouds
library(tidyverse)

# Remove markup and symbols left over from scraping
my.symbols <- c('</b>','<b>',"[[:punct:]]","\n","►",'nbsp')
news['content.description'] <- gsub(paste(my.symbols, collapse = "|"), "", news$content)

# Remove common stop words and digits

my.stop.words <- c("的","你","我","他","一","[[:digit:]]","middot","quot")
news$content.description <- gsub(paste(my.stop.words, collapse = "|"), " ", news$content.description)

# Initialize a jiebaR worker; STOPPATH is the package's built-in stop-word list
wk <- worker(stop_word = jiebaR::STOPPATH)

# Add customized terms so the segmenter keeps them whole
customized.terms <- c("外送員","uber eats")
new_user_word(wk, customized.terms)
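# A quick check that a customized term survives segmentation (illustrative):
# segment("外送員發生車禍", wk)   # "外送員" should come back as a single token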

# Segment each document with jiebaR, then re-join the tokens with spaces
news.description <- tibble(
  news_title = news$媒體,
  news_id = news$website_link,
  description = sapply(as.character(news$content.description),
                       function(char) segment(char, wk) %>% paste(collapse = " "))
)
head(news.description, 5)

# Unnest each document into one token per row (long format)
library(tidytext)
library(magrittr)

# Tokenization: split on runs of spaces
tok99 <- function(t) str_split(t, "[ ]{1,}")
tidy.description <- news.description %>%
  unnest_tokens(word, description, token = tok99)
# Keep only tokens longer than one character
tidy.description <- tidy.description[nchar(tidy.description$word) > 1, ]
head(tidy.description, 5)

news.words <- tidy.description %>%
  group_by(news_title, word) %>%
  summarise(word_frequency = n())

news.words

# Plot the 10 most frequent words for each media group
news.words %>%
  group_by(news_title) %>%
  top_n(10, word_frequency) %>%
  ggplot(aes(x = reorder(word, word_frequency), y = word_frequency, fill = news_title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~news_title, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw() +
  theme(text = element_text(size = 10, family = "sans"),
        axis.text.x = element_text(angle = 60, hjust = 1))
# Extract keywords with TextRank

library(textrank)
textrank.model <- textrank_keywords(
  tidy.description$word, p = 1/3, ngram_max = 2
)
summary(textrank.model)
head(textrank.model$keywords)
# Run TextRank separately for each media group
news.names <- unique(tidy.description$news_title)
description.textrank <- NULL

for (name in news.names){
  # Filter this group's descriptions
  text.data <- tidy.description %>% filter(news_title == name)

  # Fit the TextRank model
  textrank.model <- textrank_keywords(
    text.data$word, p = 1/3, ngram_max = 2
  )

  # Consolidate the results
  description.textrank <- bind_rows(
    description.textrank,
    data.frame(
      news_title = name,
      word = names(textrank.model$pagerank$vector),
      text_rank = textrank.model$pagerank$vector
    )
  )
}
# Plot the top 10 keywords for each media group
description.textrank %>%
  arrange(desc(text_rank)) %>%
  group_by(news_title) %>%
  top_n(10, text_rank) %>%
  ggplot(aes(word, text_rank, fill = news_title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Text Rank") +
  facet_wrap(~news_title, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw() +
  theme(text = element_text(family = "sans", size = 14),
        axis.text.x = element_text(angle = 60, hjust = 1))

# Overall word cloud weighted by TextRank score
description.textrank %>%
  group_by(news_title) %>%
  top_n(15, text_rank) %>%
  with(wordcloud(word, text_rank, random.order = FALSE, max.words = 150, family = "新細明體"))
# One word cloud per media group; the call is identical each time, so wrap
# it in a small helper
plot_cloud <- function(group){
  description.textrank[description.textrank$news_title == group, ] %>%
    top_n(15, text_rank) %>%
    with(wordcloud(word, text_rank, random.order = FALSE, max.words = 150,
                   colors = brewer.pal(8, "Set2"), family = "新細明體"))
}
for (g in c("中時新聞", "非營利組織", "公部門", "其他", "今日新聞")) plot_cloud(g)
# RAKE method
library(widyr)

# Co-occurrence counts of words appearing in the same article (news_id);
# note that text.data still holds the last media group from the loop above
description.cooc <- text.data %>% pairwise_count(word, news_id, sort = TRUE, upper = FALSE)
head(description.cooc)
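# Each unordered pair appears once (upper = FALSE), sorted by count; the
# columns are item1, item2, and n, the number of articles containing both words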

# Spread into a word-by-word matrix: n where a pair co-occurs, NA otherwise
description.cooc <- description.cooc %>% spread(item2, n)

rownames(description.cooc) <- description.cooc$item1
description.cooc <- description.cooc %>% select(-item1)

# Word frequency: the number of distinct articles containing each word
description.wf <- text.data %>% group_by(word) %>% summarise(word_frequency = length(unique(news_id)))
description.wf %>% filter(word == '罷工')
description.wf %>% arrange(desc(word_frequency))
# RAKE score: degree (number of distinct co-occurring words) divided by
# word frequency

description.rake <- data.frame(
  channel_title = name,
  word = colnames(description.cooc),
  degree = colSums(!is.na(description.cooc))) %>% inner_join(description.wf)

description.rake <- description.rake %>% dplyr::mutate(rake = degree / word_frequency)

names(description.rake)[5] <- "score"
# or: description.rake %>% dplyr::rename(score = rake)
head(description.rake)

RAKE method (not used)

# Compute RAKE for every word in every media group
description.rake <- NULL

for (name in news.names){
  text.data <- tidy.description %>% filter(news_title == name)

  description.cooc <- text.data %>% pairwise_count(word, news_id, sort = TRUE, upper = FALSE)

  description.cooc <- description.cooc %>% spread(item2, n)
  row.names(description.cooc) <- description.cooc$item1
  description.cooc <- description.cooc %>% select(-item1)

  description.wf <- text.data %>% group_by(word) %>% summarise(word_frequency = length(unique(news_id)))

  description.rake <- bind_rows(
    description.rake,
    data.frame(
      news_title = name,
      word = colnames(description.cooc),
      degree = colSums(!is.na(description.cooc))) %>%
      inner_join(description.wf)
  )
}

description.rake <- description.rake %>%
  mutate(rake = degree / word_frequency)
head(description.rake)