tpemartin / 110-2-R

The class repo for 110-2 Programming for Data Science

Cleaning Delivery News from July 2021 to May 2022 #19

Open Chious opened 2 years ago

Chious commented 2 years ago

Background

Around November 2020, a new pay scheme for delivery couriers took effect, and because it cut into many couriers' wages it set off large-scale protests. Food delivery has emerged in Taiwan only in the last few years, and the labor regulations around it are still taking shape, so the industry has given rise to a range of social issues, from labor disputes to traffic accidents caused by the delivery system.

So starting around July of last year, I used "外送" (food delivery) as a keyword and collected news items from the web through IFTTT and Google's RSS feed. After the earlier class exercises on extracting features from Chinese paintings, it occurred to me that the same tools might help me find stories in this messy pile of data.
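The collection itself happened outside R (an IFTTT applet watching a Google News RSS feed). As a minimal sketch of what such a feed looks like, assuming Google's public news RSS search endpoint and the xml2 package (both are my additions here, not part of the original pipeline):

library(xml2)

# Keyword-search feed for "外送" (URL-encoded); hl/gl/ceid select the
# Traditional Chinese, Taiwan edition of Google News
feed <- read_xml(paste0(
  "https://news.google.com/rss/search?",
  "q=%E5%A4%96%E9%80%81&hl=zh-TW&gl=TW&ceid=TW:zh-Hant"))

titles <- xml_text(xml_find_all(feed, "//item/title"))
links  <- xml_text(xml_find_all(feed, "//item/link"))
head(titles)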

Data

The raw data are the news items I collected from the web with IFTTT between July of last year and now. Details below:

library(readr)

news1 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news2 <- read_csv("https://www.dropbox.com/s/bp27iqfz2ei8psp/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%281%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news3 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news4 <- read_csv("https://www.dropbox.com/s/7vh72qwbhr05dx8/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%283%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news5 <- read_csv("https://www.dropbox.com/s/tmiwb7583f1bofd/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%284%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news6 <- read_csv("https://www.dropbox.com/s/3qjb90pwaodfqaw/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%285%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)
# Combine the six data frames into one

news <- rbind(news1,news2,news3,news4,news5,news6)
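# Note: news1 and news3 above read the same Dropbox file (identical URL), so
# rbind() keeps those rows twice; if that is unintended,
# news <- dplyr::distinct(news) would drop the duplicates.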
colnames(news) <- c("time","title","website_link","content","from","RSS_link")

#convert "time" to Date class
news$time <- lubridate::mdy_hm(news$time)
news$time <- lubridate::as_date(news$time)

Variables

  1. time: when the article was published
  2. title: article title
  3. website_link: article URL
  4. content: the text surrounding the keyword
  5. from: which scraper collected the item
  6. RSS_link: URL of the RSS feed
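A quick way to sanity-check the combined frame (glimpse and range are just convenient choices here):

dplyr::glimpse(news)             # six columns, named as listed above
range(news$time, na.rm = TRUE)   # should span roughly July 2021 to May 2022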

Planned next steps

  1. Use the link and the title to work out which outlet each news item comes from.
  2. Use notable keywords or time points (e.g., traffic accidents, strikes) to see when these social events were covered, and how different media outlets framed them; a sketch follows below.
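A minimal sketch of item 2, assuming the cleaned news data frame built above (the keyword is illustrative; "車禍" or any other term works the same way):

library(dplyr)
library(ggplot2)
library(stringr)

# Daily count of articles whose scraped context mentions "罷工" (strike)
news %>%
  filter(str_detect(content, "罷工")) %>%
  count(time) %>%
  ggplot(aes(x = time, y = n)) +
  geom_col() +
  labs(x = "date", y = "articles mentioning 罷工")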

References and tools

  1. library(ggplot2)
     - iThome. (n.d.). [Day 28] R語言_ggplot2長條圖的吹毛求疵(百分比累積長條圖加上文字,網路上沒找到相關的寫法,我自己來寫吧). iT 邦幫忙::一起幫忙解決難題,拯救 IT 人的一天. Retrieved June 15, 2022, from https://ithelp.ithome.com.tw/articles/10209002
     - Lin, 林建甫 Jeff. (n.d.). Chapter 5 資料視覺化分析 | R 資料科學與統計. Retrieved June 16, 2022, from https://bookdown.org/jefflinmd38/r4biost/dataviz.html

  2. library(stringr)
     - https://happyjayxin.medium.com/r-%E8%AA%9E%E8%A8%80%E5%AD%97%E4%B8%B2%E8%99%95%E7%90%86-%E7%AD%86%E8%A8%98-98eff071b7d0

  3. Text mining (文字探勘)
     - https://rpubs.com/JJChiou/textmining_1
     - 如何找出文章關鍵詞(7):textrank演算法. (2020, August 25). 一健事. https://ronaldyick.wordpress.com/2020/08/25/textrank/

Chious commented 2 years ago

STEP 1: Data import

library(readr)
library(dplyr)
library(magrittr)

news1 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news2 <- read_csv("https://www.dropbox.com/s/bp27iqfz2ei8psp/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%281%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news3 <- read_csv("https://www.dropbox.com/s/9f4xsiru2al26c0/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%282%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news4 <- read_csv("https://www.dropbox.com/s/7vh72qwbhr05dx8/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%283%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news5 <- read_csv("https://www.dropbox.com/s/tmiwb7583f1bofd/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%284%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

news6 <- read_csv("https://www.dropbox.com/s/3qjb90pwaodfqaw/%E5%A4%96%E9%80%81%E6%96%B0%E8%81%9E%20%285%29%20-%20%E5%B7%A5%E4%BD%9C%E8%A1%A81.csv?dl=1", 
    col_names = FALSE)

Combining the data


# Combine the six data frames into one

news <- rbind(news1,news2,news3,news4,news5,news6)
colnames(news) <- c("time","title","website_link","content","from","RSS_link")

#convert "time" to Date class
news$time <- lubridate::mdy_hm(news$time)
news$time <- lubridate::as_date(news$time)

STEP 2: Identifying the news source

library(stringr)

# 新聞來源 <- data.frame(新聞來源=c("三立新聞","聯合報","醒報","蘋果仁","ATC_Taiwan","奇摩新聞","公民橘報") , 關鍵字=c("ltn","udn","anntw","applealmond","atctwn","yahoo","buzzorange"))
# The title or website_link columns should let us identify the source

# Q: why are there missing values?
# How do we assign a label from these values?

#appledaily <-"蘋果日報"
#nownews  <-"今日新聞"
#walkerland <-"我傳媒"

#ltn <- "自由時報"
#sten <- "三立新聞"
#udn <- "聯合報"
#yahoo <- "奇摩新聞"
#ettoday <- "東森新聞"
#ebc <- "東森新聞"

#chinatime <- "中時新聞"
#ctwant <- "中時新聞"
# ctitv

#ctee <-"工商時報"
#cts <- "華視新聞"
#epochtimes <- "大紀元"
#ftwnews <- "民視新聞"
#businesstoday <- "今周刊"
#gov <- "政府機關"
#pts <- "公視"
#cna <- "中央社"
#tvbs <- "TVBS"
#.org <- "非營利組織"
#civilmedia <- "非營利組織"
#ELSE <- "小報"

# Extract the source domain from each article URL

news[['來源網址']] <- news$website_link |>
  stringr::str_extract("(?<=https://)[^/]+(?=/)")
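# e.g. "https://news.ltn.com.tw/news/life/..." -> "news.ltn.com.tw"
# (everything between "https://" and the next "/"; the URL is illustrative)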

# Google News entries aggregate stories from other sites, so for those rows
# pull the original domain out of the "url=" query parameter instead

for (i in seq_len(nrow(news))){
  if (!is.na(news$來源網址[i]) && news$來源網址[i] == "www.google.com"){
    news$來源網址[i] <- stringr::str_extract(news$website_link[i], "(?<=url=https://)[^/]+(?=/)")
  }
}
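# e.g. "https://www.google.com/url?...&url=https://udn.com/news/story/..."
# -> "udn.com" (illustrative URL, not taken from the data)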

news[is.na(news)] <- '0'
news[['來源標籤']] <- {
  # Ordered pattern -> label lookup: patterns are tried top to bottom and the
  # first match wins, so order matters (".org" is a regex, so the dot matches
  # any character)
  source_patterns <- c(
    "ltn" = "ltn", "sten" = "sten", "appledaily" = "appledaily",
    "nownews" = "nownews", "walkerland" = "walkerland", "udn" = "udn",
    "yahoo" = "yahoo", "ettoday" = "ettoday", "ebc" = "ebc",
    "chinatime" = "chinatime", "ctwant" = "ctwant", "ctitv" = "ctitv",
    "ctee" = "ctee", "cts" = "cts", "epochtimes" = "epochtimes",
    "ftwnews" = "ftwnews", "businesstoday" = "businesstoday", "pts" = "pts",
    ".org" = "org", "gov" = "gov", "cna" = "cna", "tvbs" = "tvbs",
    "civilmedia" = "org"
  )
  vapply(news$來源網址, function(url){
    hit <- which(str_detect(url, names(source_patterns)))
    if (length(hit) == 0) "other" else source_patterns[[hit[1]]]
  }, character(1), USE.NAMES = FALSE)
}

news$來源標籤 <- factor(news$來源標籤)

# Map domain labels to outlet display names
news['媒體'] <- recode_factor(news$來源標籤, appledaily="蘋果日報", ebc="東森新聞", pts="公視", businesstoday="今周刊",
                            epochtimes="大紀元", tvbs="TVBS", chinatime="中時新聞", ettoday="東森", udn="聯合報",
                            cna="中央社", gov="政府網站", walkerland="窩客島", ctee="工商時報", ltn="自由時報",
                            yahoo="奇摩新聞", ctitv="中天新聞", nownews="今日新聞", cts="華視", org="非營利組織",
                            ctwant="周刊王", other="其他")

# Collapse related outlets into broader media groups
news['媒體'] <- recode_factor(news$媒體, 蘋果日報="蘋果日報", 周刊王="中時新聞", 中天新聞="中時新聞",
  中時新聞="中時新聞", 工商時報="中時新聞", 東森新聞="東森新聞", 東森="東森新聞", 今周刊="今周刊", 大紀元="大紀元",
  TVBS="TVBS", 聯合報="聯合報", 中央社="公部門", 政府網站="公部門", 公視="公部門", 自由時報="自由時報",
  奇摩新聞="奇摩新聞", 今日新聞="今日新聞", 華視="華視", 窩客島="其他", 其他="其他")
# Count articles over time, stacked by media group

library(ggplot2)

ggplot(news, aes(x = time, fill = 媒體)) +
  geom_bar(position = "stack")

# Outlet share as a pie chart (a stacked bar in polar coordinates)
媒體.tab <- table(news$媒體)
媒體.freq <- data.frame(媒體.tab)
names(媒體.freq)[1] <- "媒體"
ggplot(data = 媒體.freq, aes(x = "", y = Freq, fill = 媒體)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0)

STEP 3: Data visualization

Word clouds (textrank)

# Text mining
library(dplyr)
library(lubridate)
library(stringr)
library(jiebaR)
library(wordcloud)   # static word clouds
library(wordcloud2)  # interactive word clouds
library(tidyverse)

# Remove markup and symbols left over from scraping
my.symbols <- c('</b>','<b>',"[[:punct:]]","\n","►",'nbsp')
news['content.description'] <- gsub(paste(my.symbols, collapse = "|"), "", news$content)

# Remove common stop words and digits

my.stop.words <- c("的","你","我","他","一","[[:digit:]]","middot","quot")
news$content.description <- gsub(paste(my.stop.words, collapse = "|"), " ", news$content.description)

# Initialize a jiebaR worker; STOPPATH is the package's built-in stop-word list
wk <- worker(stop_word = jiebaR::STOPPATH)

# Add customized terms so the segmenter keeps them whole
customized.terms <- c("外送員","uber eats")
new_user_word(wk, customized.terms)
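# A quick check that a customized term survives segmentation (illustrative):
# segment("外送員發生車禍", wk)   # "外送員" should come back as a single token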

# Segment each document with jiebaR, then re-join the tokens with spaces
news.description <- tibble(
  news_title = news$媒體,
  news_id = news$website_link,
  description = sapply(as.character(news$content.description),
                       function(char) segment(char, wk) %>% paste(collapse = " "))
)
head(news.description, 5)

# Unnest each document into one token per row (long format)
library(tidytext)
library(magrittr)

# Tokenization: split on runs of spaces
tok99 <- function(t) str_split(t, "[ ]{1,}")
tidy.description <- news.description %>%
  unnest_tokens(word, description, token = tok99)
# Keep only tokens longer than one character
tidy.description <- tidy.description[nchar(tidy.description$word) > 1, ]
head(tidy.description, 5)

news.words <- tidy.description %>%
  group_by(news_title, word) %>%
  summarise(word_frequency = n())

news.words

# Plot the 10 most frequent words for each media group
news.words %>%
  group_by(news_title) %>%
  top_n(10, word_frequency) %>%
  ggplot(aes(x = reorder(word, word_frequency), y = word_frequency, fill = news_title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~news_title, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw() +
  theme(text = element_text(size = 10, family = "sans"),
        axis.text.x = element_text(angle = 60, hjust = 1))
# Extract keywords with TextRank

library(textrank)
textrank.model <- textrank_keywords(
  tidy.description$word, p = 1/3, ngram_max = 2
)
summary(textrank.model)
head(textrank.model$keywords)
# Run TextRank separately for each media group
news.names <- unique(tidy.description$news_title)
description.textrank <- NULL

for (name in news.names){
  # Filter this group's descriptions
  text.data <- tidy.description %>% filter(news_title == name)

  # Fit the TextRank model
  textrank.model <- textrank_keywords(
    text.data$word, p = 1/3, ngram_max = 2
  )

  # Consolidate the results
  description.textrank <- bind_rows(
    description.textrank,
    data.frame(
      news_title = name,
      word = names(textrank.model$pagerank$vector),
      text_rank = textrank.model$pagerank$vector
    )
  )
}
# Plot the top 10 keywords for each media group
description.textrank %>%
  arrange(desc(text_rank)) %>%
  group_by(news_title) %>%
  top_n(10, text_rank) %>%
  ggplot(aes(word, text_rank, fill = news_title)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "Text Rank") +
  facet_wrap(~news_title, ncol = 2, scales = "free") +
  coord_flip() +
  theme_bw() +
  theme(text = element_text(family = "sans", size = 14),
        axis.text.x = element_text(angle = 60, hjust = 1))

# Overall word cloud weighted by TextRank score
description.textrank %>%
  group_by(news_title) %>%
  top_n(15, text_rank) %>%
  with(wordcloud(word, text_rank, random.order = FALSE, max.words = 150, family = "新細明體"))
# One word cloud per media group; the call is identical each time, so wrap
# it in a small helper
plot_cloud <- function(group){
  description.textrank[description.textrank$news_title == group, ] %>%
    top_n(15, text_rank) %>%
    with(wordcloud(word, text_rank, random.order = FALSE, max.words = 150,
                   colors = brewer.pal(8, "Set2"), family = "新細明體"))
}
for (g in c("中時新聞", "非營利組織", "公部門", "其他", "今日新聞")) plot_cloud(g)
# RAKE method
library(widyr)

# Co-occurrence counts of words appearing in the same article (news_id);
# note that text.data still holds the last media group from the loop above
description.cooc <- text.data %>% pairwise_count(word, news_id, sort = TRUE, upper = FALSE)
head(description.cooc)
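# Each unordered pair appears once (upper = FALSE), sorted by count; the
# columns are item1, item2, and n, the number of articles containing both words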

# Spread into a word-by-word matrix: n where a pair co-occurs, NA otherwise
description.cooc <- description.cooc %>% spread(item2, n)

rownames(description.cooc) <- description.cooc$item1
description.cooc <- description.cooc %>% select(-item1)

# Word frequency: the number of distinct articles containing each word
description.wf <- text.data %>% group_by(word) %>% summarise(word_frequency = length(unique(news_id)))
description.wf %>% filter(word == '罷工')
description.wf %>% arrange(desc(word_frequency))
# RAKE score: degree (number of distinct co-occurring words) divided by
# word frequency

description.rake <- data.frame(
  channel_title = name,
  word = colnames(description.cooc),
  degree = colSums(!is.na(description.cooc))) %>% inner_join(description.wf)

description.rake <- description.rake %>% dplyr::mutate(rake = degree / word_frequency)

names(description.rake)[5] <- "score"
# or: description.rake %>% dplyr::rename(score = rake)
head(description.rake)

RAKE method (not used)

# Compute RAKE for every word in every media group
description.rake <- NULL

for (name in news.names){
  text.data <- tidy.description %>% filter(news_title == name)

  description.cooc <- text.data %>% pairwise_count(word, news_id, sort = TRUE, upper = FALSE)

  description.cooc <- description.cooc %>% spread(item2, n)
  row.names(description.cooc) <- description.cooc$item1
  description.cooc <- description.cooc %>% select(-item1)

  description.wf <- text.data %>% group_by(word) %>% summarise(word_frequency = length(unique(news_id)))

  description.rake <- bind_rows(
    description.rake,
    data.frame(
      news_title = name,
      word = colnames(description.cooc),
      degree = colSums(!is.na(description.cooc))) %>%
      inner_join(description.wf)
  )
}

description.rake <- description.rake %>%
  mutate(rake = degree / word_frequency)
head(description.rake)