Chious opened 2 years ago
library(readr)
library(dplyr)
library(magrittr)
news1 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1",
col_names = FALSE)
news2 <- read_csv("https://www.dropbox.com/s/bp27iqfz2ei8psp/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%281%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1",
col_names = FALSE)
# NOTE: this URL is identical to news1's (duplicate rows are dropped after rbind below)
news3 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1",
col_names = FALSE)
news4 <- read_csv("https://www.dropbox.com/s/7vh72qwbhr05dx8/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%283%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1",
col_names = FALSE)
news5 <- read_csv("https://www.dropbox.com/s/tmiwb7583f1bofd/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%284%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1",
col_names = FALSE)
news6 <- read_csv("https://www.dropbox.com/s/3qjb90pwaodfqaw/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%285%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1",
col_names = FALSE)
# Combine the separate downloads into one data frame
news <- rbind(news1,news2,news3,news4,news5,news6)
colnames(news) <- c("time","title","website_link","content","from","RSS_link")
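# Since news3 above loads the same file as news1, the stack contains exact
# duplicate rows; a minimal de-duplication sketch (assuming exact duplicates
# are unwanted):
news <- dplyr::distinct(news)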
#convert "time" to Date class
news$time <- lubridate::mdy_hm(news$time)
news$time <- lubridate::as_date(news$time)
library(stringr)
# 新聞來源 <- data.frame(新聞來源=c("三立新聞","聯合報","醒報","蘋果仁","ATC_Taiwan","奇摩新聞","公民橘報") , 關鍵字=c("ltn","udn","anntw","applealmond","atctwn","yahoo","buzzorange"))
# Could perhaps use title or website_link to identify the original source
# Q: Why are there missing values?
# How should source labels be assigned from these values?
#appledaily <-"蘋果日報"
#nownews <-"今日新聞"
#walkerland <-"我傳媒"
#ltn <- "自由時報"
#sten <- "三立新聞"
#udn <- "聯合報"
#yahoo <- "奇摩新聞"
#ettoday <- "東森新聞"
#ebc <- "東森新聞"
#chinatime <- "中時新聞"
#ctwant <- "中時新聞"
# ctitv
#ctee <-"工商時報"
#cts <- "華視新聞"
#epochtimes <- "大紀元"
#ftwnews <- "民視新聞"
#businesstoday <- "今周刊"
#gov <- "政府機關"
#pts <- "公視"
#cna <- "中央社"
#tvbs <- "TVBS"
#.org <- "非營利組織"
#civilmedia <- "非營利組織"
#ELSE <- "小報"
# Extract the source domain from each link
news[['來源網址']] <- news$website_link |>
stringr::str_extract("(?<=https://)[^/]+(?=/)")
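# For reference, the lookaround pattern keeps just the host between "https://"
# and the first "/" (a made-up URL for illustration):
stringr::str_extract("https://news.ltn.com.tw/article/123", "(?<=https://)[^/]+(?=/)")
#> [1] "news.ltn.com.tw"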
# Google News reposts articles from around the web, so for those rows we must recover the true origin
is_google <- !is.na(news$來源網址) & news$來源網址 == "www.google.com"
news$來源網址[is_google] <- stringr::str_extract(news$website_link[is_google],
                                             "(?<=url=https://)[^/]+(?=/)")
# Replace missing source domains with "0" so str_detect below never sees NA
news$來源網址[is.na(news$來源網址)] <- "0"
news[['來源標籤']] <- {
  # Ordered keyword -> label lookup; the first matching pattern wins
  # (both ".org" domains and civilmedia are labelled "org")
  patterns <- c(ltn = "ltn", sten = "sten", appledaily = "appledaily",
                nownews = "nownews", walkerland = "walkerland", udn = "udn",
                yahoo = "yahoo", ettoday = "ettoday", ebc = "ebc",
                chinatime = "chinatime", ctwant = "ctwant", ctitv = "ctitv",
                ctee = "ctee", cts = "cts", epochtimes = "epochtimes",
                ftwnews = "ftwnews", businesstoday = "businesstoday",
                pts = "pts", org = "\\.org", gov = "gov", cna = "cna",
                tvbs = "tvbs", org = "civilmedia")
  vapply(news$來源網址, function(url) {
    hit <- names(patterns)[str_detect(url, patterns)]
    if (length(hit) == 0) "other" else hit[1]
  }, character(1), USE.NAMES = FALSE)
}
news$來源標籤 <- factor(news$來源標籤)
news['媒體'] <- recode_factor(news$來源標籤, appledaily="蘋果日報", ebc="東森新聞", pts="公視", businesstoday="今周刊",
epochtimes="大紀元", tvbs="TVBS", chinatime="中時新聞", ettoday="東森", udn="聯合報",
cna="中央社", gov="政府網站", walkerland="窩客島", ctee="工商時報", ltn="自由時報",
yahoo="奇摩新聞", ctitv="中天新聞", nownews="今日新聞", cts="華視", org="非營利組織",
ctwant="周刊王", other="其他")
news['媒體'] <- recode_factor(news$媒體,蘋果日報="蘋果日報",周刊王="中時新聞",中天新聞="中時新聞",
中時新聞="中時新聞",工商時報="中時新聞",東森新聞="東森新聞",東森="東森新聞",今周刊="今周刊",大紀元="大紀元",
TVBS="TVBS",聯合報="聯合報",中央社="公部門",政府網站="公部門",公視="公部門",自由時報="自由時報",
奇摩新聞="奇摩新聞",今日新聞="今日新聞",華視="華視",窩客島="其他",其他="其他")
# Count how many news posts appeared over time, by outlet
library(ggplot2)
ggplot(news, aes(x = time , fill = 媒體)) +
geom_bar(position = "stack")
媒體.tab <- table(news$媒體)
媒體.freq <- data.frame(媒體.tab)
names(媒體.freq)[1] <- "媒體"
ggplot(data = 媒體.freq, aes(x = "", y = Freq, fill = 媒體)) +
geom_bar(width = 1, stat = "identity") +
coord_polar("y", start = 0)
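# The pie chart above shows overall shares; to compare outlets across days, a
# minimal sketch of a percentage-stacked bar chart (uses the same `news` and
# `媒體` columns as above):
news %>%
  count(time, 媒體) %>%
  group_by(time) %>%
  mutate(share = n / sum(n)) %>%
  ggplot(aes(x = time, y = share, fill = 媒體)) +
  geom_col(position = "stack")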
# Text mining
library(dplyr)
library(lubridate)
library(stringr)
library(jiebaR)
library(wordcloud) # static word clouds
library(wordcloud2) # interactive word clouds
library(tidyverse)
#remove symbols
my.symbols <- c('</b>','<b>',"[[:punct:]]","\n","►",'nbsp')
news['content.description'] <- gsub(paste(my.symbols, collapse = "|"),"",news$content)
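# Spot check of the symbol filter on an invented snippet:
gsub(paste(my.symbols, collapse = "|"), "", "<b>外送</b>&nbsp;新聞!")
#> [1] "外送新聞"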
# Remove very common words/characters (NOTE: gsub strips these anywhere they appear, even inside longer words)
my.stop.words <- c("的","你","我","他","一","[[:digit:]]","middot","quot")
news$content.description <- gsub(paste(my.stop.words, collapse = "|"), " ", news$content.description)
#Initialize a JiebaR worker
wk <- worker(stop_word = jiebaR::STOPPATH)
# STOPPATH is the package's built-in stop-word list
#Add customized terms
customized.terms <- c("外送員","uber eats")
new_user_word(wk, customized.terms)
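# Quick sanity check that the worker segments as expected (invented sentence):
segment("外送員發起罷工抗爭", wk)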
# Segment each document and rejoin the tokens with spaces
news.description <- tibble(news_title = news$媒體,news_id = news$website_link,
description = sapply(as.character(news$content.description), function(char) segment(char,wk) %>% paste(collapse = " ")))
# After segmentation the tokens are glued back together with single spaces
head(news.description, 5)
# Unnest every document into one long column of tokens
library(tidytext)
library(magrittr)
#tokenization
tok99 <- function(t) str_split(t, "[ ]{1,}")
tidy.description <- news.description %>% unnest_tokens(word, description, token=tok99)
# The new column is called word; description is the column being tokenized
tidy.description <- tidy.description[nchar(tidy.description$word)>1,]
# Keep only tokens longer than one character
head(tidy.description, 5)
news.words <- tidy.description %>%
group_by(news_title, word) %>%
summarise(word_frequency = n())
news.words
news.words %>%
group_by(news_title) %>%
top_n(10, word_frequency) %>%
ggplot(aes(x = reorder(word, word_frequency), y = word_frequency, fill = news_title)) + geom_col(show.legend = FALSE) + facet_wrap(~news_title, ncol = 2, scales = "free") +
coord_flip() +
theme_bw() +
theme(text = element_text(size = 10, family="sans"), axis.text.x = element_text(angle = 60, hjust =1))
# Extract keywords with TextRank
library(textrank)
textrank.model <- textrank_keywords(
tidy.description$word, p = 1/3, ngram_max = 2
)
summary(textrank.model)
head(textrank.model$keywords)
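# textrank_keywords() also reports n-gram keywords with frequencies; a peek at
# the more frequent multi-word ones (the freq threshold is arbitrary):
head(subset(textrank.model$keywords, ngram > 1 & freq > 3))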
# Get all TextRank results: run TextRank separately for each media outlet
news.names <- unique(tidy.description$news_title)
description.textrank <- NULL
for (name in news.names){
#filter this outlet's descriptions
text.data <- tidy.description %>% filter(news_title == name)
#train text rank model
textrank.model <- textrank_keywords(
text.data$word, p = 1/3, ngram_max = 2
)
#consolidate textrank results
description.textrank <- bind_rows(
description.textrank,
data.frame(
news_title = name,
word = names(textrank.model$pagerank$vector),
text_rank = textrank.model$pagerank$vector
)
)
}
# Plot top 10 keywords for each outlet
description.textrank %>%
arrange(desc(text_rank)) %>%
group_by(news_title) %>%
top_n(10, text_rank) %>%
ggplot(aes(word, text_rank, fill = news_title)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Text Rank") +
facet_wrap(~news_title, ncol = 2, scales = "free") +
coord_flip() +
theme_bw() +
theme(text=element_text(family="sans", size=14),
axis.text.x = element_text(angle = 60, hjust = 1))
description.textrank %>% group_by(news_title) %>%
top_n(15, text_rank) %>%
with(wordcloud(word, text_rank, random.order = FALSE, max.words = 150, family = "新細明體"))
# Per-outlet word clouds of the top TextRank keywords
plot_cloud <- function(outlet) {
  description.textrank[description.textrank$news_title == outlet, ] %>%
    top_n(15, text_rank) %>%
    with(wordcloud(word, text_rank, random.order = FALSE, max.words = 150,
                   colors = brewer.pal(8, "Set2"), family = "新細明體"))
}
for (outlet in c("中時新聞", "非營利組織", "公部門", "其他", "今日新聞")) plot_cloud(outlet)
# RAKE method (keyword scoring by degree / frequency)
library(widyr)
# Build word co-occurrence counts per article (news_id); note that text.data
# still holds the last outlet processed in the TextRank loop above
description.cooc <- text.data %>% pairwise_count(word, news_id, sort = TRUE, upper = FALSE)
head(description.cooc)
# Spread the word pairs into a wide matrix: cells hold co-occurrence counts,
# non-co-occurring pairs are NA
description.cooc <- description.cooc %>% spread(item2, n)
rownames(description.cooc) <- description.cooc$item1
description.cooc <- description.cooc %>% select(-item1)
# Word frequency = number of distinct articles (news_id) containing each word
description.wf <- text.data %>% group_by(word) %>% summarise(word_frequency = length(unique(news_id)))
description.wf %>% filter(word == '罷工')
description.wf %>% arrange(desc(word_frequency))
# Calculate degree and score
description.rake <- data.frame(
channel_title = name,
word = colnames(description.cooc),
degree = colSums(!is.na(description.cooc))) %>% inner_join(description.wf)
description.rake <- description.rake %>% dplyr::mutate(rake = degree / word_frequency)
names(description.rake)[5] <- "score"
#or: description.rake %>% rename(score = rake)
head(description.rake)
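# A toy illustration of the RAKE score: a word that co-occurs with many
# distinct words (high degree) in few articles (low frequency) scores high
# (numbers are made up):
toy <- data.frame(word = c("罷工", "外送"), degree = c(12, 20), word_frequency = c(3, 20))
transform(toy, score = degree / word_frequency)  # scores: 4.0 and 1.0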
# Get all RAKE results: compute degree and word frequency for every word in every outlet
description.rake <- NULL
for (name in news.names){
text.data <- tidy.description %>% filter(news_title == name)
description.cooc <- text.data %>% pairwise_count(word, news_id, sort = TRUE, upper = FALSE)
description.cooc <- description.cooc %>% spread(item2, n)
row.names(description.cooc) <- description.cooc$item1
description.cooc <- description.cooc %>% select(-item1)
description.wf <- text.data %>% group_by(word) %>% summarise(word_frequency = length(unique(news_id)))
description.rake <- bind_rows(description.rake,
data.frame(
news_title = name,
word = colnames(description.cooc),
degree = colSums(!is.na(description.cooc))) %>%
inner_join(description.wf)
)}
description.rake <- description.rake %>%
mutate(rake = degree / word_frequency)
head(description.rake)
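# Mirroring the TextRank facet plot above, a sketch of the top-10 words per
# outlet ranked by RAKE:
description.rake %>%
  group_by(news_title) %>%
  top_n(10, rake) %>%
  ggplot(aes(x = reorder(word, rake), y = rake, fill = news_title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~news_title, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw()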
Background
Around November 2020, a new pay scheme for delivery couriers took effect in Taiwan. Because it cut into many couriers' wages, it triggered large-scale protests. Food delivery is a young industry in Taiwan and its labor regulations are still taking shape, so it has spawned many social issues, including labor disputes and traffic accidents linked to the delivery system.
So starting around July last year, I used 「外送」 (food delivery) as the search keyword and collected news from the web through IFTTT and Google's RSS feeds. After my previous few exercises on extracting features from traditional Chinese paintings, it occurred to me that the same tools might help me find stories in this messy data.
Data
The raw data are the news articles I have collected online with IFTTT from July last year until now; details are as follows:
Variables
Planned future cleanup
References and tools
iThome. (n.d.). [Day 28] R語言_ggplot2長條圖的吹毛求疵(百分比累積長條圖加上文字,網路上沒找到相關的寫法,我自己來寫吧). iT 邦幫忙::一起幫忙解決難題,拯救 IT 人的一天. Retrieved June 15, 2022, from https://ithelp.ithome.com.tw/articles/10209002
Lin, 林建甫 Jeff. (n.d.). Chapter 5 資料視覺化分析 | R 資料科學與統計. Retrieved June 16, 2022, from https://bookdown.org/jefflinmd38/r4biost/dataviz.html
R 語言字串處理筆記 (string handling with stringr). Medium. https://happyjayxin.medium.com/r-%E8%AA%9E%E8%A8%80%E5%AD%97%E4%B8%B2%E8%99%95%E7%90%86-%E7%AD%86%E8%A8%98-98eff071b7d0
文字探勘 (text mining). RPubs. https://rpubs.com/JJChiou/textmining_1
如何找出文章關鍵詞(7):textrank演算法. (2020, August 25). 一健事. https://ronaldyick.wordpress.com/2020/08/25/textrank/