ESSECAnalytics / BDA

Course: Big Data Analytics (IDSS31165)
http://strategic-business-analytics-chair.essec.edu

How to get a free bounty in big data analytics #3

Open · eakl opened 7 years ago

eakl commented 7 years ago

Original Post by SaturnMusic

How to get a free bounty in big data analytics

eakl commented 7 years ago

Original Post by nicogla

Congratulations!

Here is another (simpler) version of the code:

library(maps)
library(twitteR)  # assumes setup_twitter_oauth() has already been run
searchtext<-'@realDonaldTrump'
# search geotagged tweets within a 1500-mile radius of roughly the centre of the contiguous US
DT.tweets=searchTwitter(searchtext,n=10000,geocode='40.375,-100,1500mi')
DT.DF=twListToDF(DT.tweets)
# keep only the tweets that carry coordinates
SelDT.DF=DT.DF[!is.na(DT.DF$longitude),]
# draw the state map and overlay the tweet locations
usmap<-map('state')
points(SelDT.DF$longitude, SelDT.DF$latitude, pch=19, col="red", cex=0.5)

And this is the resulting map: http://imgur.com/cyC1UNt
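If you want to save the figure to a file rather than plot it on screen, here is a minimal sketch using base R's png() device (my addition, not from the original post; the filename is just an example):

png("trump_tweets_map.png", width=800, height=600)
map('state')
points(SelDT.DF$longitude, SelDT.DF$latitude, pch=19, col="red", cex=0.5)
dev.off()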

eakl commented 7 years ago

Original Post by RomainB_

So I adapted the previous code; I hope it's OK.

library(devtools)
install_github("twitteR", username="geoffjentry") # We use a workaround for the connection
library(twitteR)
api_key = "YOURKEY"
api_secret = "YOURKEY"
access_token = "YOURKEY"
access_token_secret = "YOURKEY"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
library(maps)
searchtext<-'@HillaryClinton'
DT.tweets=searchTwitter(searchtext,n=5000,geocode='40.375,-100,1500mi')
DT.DF=twListToDF(DT.tweets)
SelDT.DF=DT.DF[!is.na(DT.DF$longitude),]
usmap<-map('state')
points(SelDT.DF$longitude, SelDT.DF$latitude, pch=19, col="blue", cex=0.5)
searchtext2<-'@realDonaldTrump'
DT.tweets2=searchTwitter(searchtext2,n=5000,geocode='40.375,-100,1500mi')
DT.DF2=twListToDF(DT.tweets2)
SelDT.DF2=DT.DF2[!is.na(DT.DF2$longitude),]
points(SelDT.DF2$longitude, SelDT.DF2$latitude, pch=19, col="red", cex=0.5)
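A note on rate limits: with n=5000 per query the Twitter search API can throttle you before all tweets are returned. searchTwitter() takes a retryOnRateLimit argument that waits and retries instead of stopping early; a sketch (the value 120 is illustrative):

DT.tweets=searchTwitter(searchtext, n=5000, geocode='40.375,-100,1500mi', retryOnRateLimit=120)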

eakl commented 7 years ago

Original Post by AlexGuen

Map Sentiment Analysis: Grey = Neutral, Red = Bad, SteelBlue = Good

library(maps)
library(twitteR)  # assumes setup_twitter_oauth() has already been run
usmap<-map('state')
searchtext<-'@realDonaldTrump'
DT.tweets=searchTwitter(searchtext,n=10000,geocode='40.375,-100,1500mi')
DT.DF=twListToDF(DT.tweets)

# 2.2 Load the positive and negative word lexicons
# (the .txt files are assumed to be in the working directory; ';' marks comment lines)
pos.words = scan('positive-words.txt',what='character', comment.char=';')
neg.words = scan('negative-words.txt',what='character', comment.char=';')

# 3.1.2 Write in function to score sentiment
library(plyr)

score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{ # function to score the sentiments
  require(plyr)
  require(stringr)

  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array ("a") of scores back, so we use 
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, pos.words, neg.words) {

    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)

    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)

    # compare our words to the dictionaries of positive & negative terms
    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)

    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)

    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(pos.matches) - sum(neg.matches)

    return(score)
  }, pos.words, neg.words, .progress=.progress )

  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}
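As a quick sanity check (my own toy example, assuming the standard opinion-lexicon files loaded above), you can score a few hand-written sentences before running the function on tweets:

test.sentences = c("I love this great idea", "this is a horrible bad plan", "just a tweet")
score.sentiment(test.sentences, pos.words, neg.words)$score
# expected roughly: 2 -2 0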

library(tm)       # for stopwords(), removeWords(), stripWhitespace()
library(stringr)  # for str_split() and str_replace_all()

clean.tweets <- function(tweets.df){ # Function to clean the data
  twlist<-twListToDF(tweets.df)
  # keep one element per tweet (splitting the text here would break the
  # row-for-row match with DT.DF further down)
  datatemp <- twlist$text
  # remove usernames
  datatemp<-gsub("@[[:alnum:]]*","",datatemp)
  # to ASCII
  datatemp <- iconv(datatemp, "latin1", "ASCII", sub="")
  datatemp <- str_replace_all(datatemp,"[^[:graph:]]", " ") 
  # remove punctuation
  datatemp<-gsub("[[:punct:]]", "", datatemp)
  # remove http links
  datatemp<-gsub("http[[:alnum:]]*","",datatemp)
  # remove numbers 
  datatemp<-gsub("\\d", "",datatemp)
  # remove unrecognized chars
  datatemp<-gsub("�", "",datatemp)
  # remove "stop words"
  myStopWords <-c(stopwords('english'))
  datatemp<-removeWords(datatemp,myStopWords)
  # Strip whitespace
  datatemp<-stripWhitespace(datatemp)
  # to lowercase
  datatemp <-tolower(datatemp)
  return(datatemp)
}
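To see what the cleaning pipeline actually does, here is a minimal sketch on a single made-up tweet string, applying the same steps in the same order as clean.tweets():

raw <- "@someuser Great rally!! 2016 http://t.co/abc #MAGA"
x <- gsub("@[[:alnum:]]*", "", raw)   # drop the username
x <- gsub("[[:punct:]]", "", x)       # strip punctuation
x <- gsub("http[[:alnum:]]*", "", x)  # drop the (now de-punctuated) URL
x <- gsub("\\d", "", x)               # drop digits
x <- removeWords(x, stopwords('english'))
x <- stripWhitespace(tolower(x))
x   # roughly: " great rally maga"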
# 4.1 Score tweets' sentiment
library(stringr)

DT.score=score.sentiment(clean.tweets(DT.tweets), pos.words, neg.words, .progress='text')
DT.score$Index <- 1:nrow(DT.score)
DT.DF$Index <- 1:nrow(DT.DF)
SelDT.DF=DT.DF[!is.na(DT.DF$longitude),]

SelDT.DF$score <- DT.score$score[match(SelDT.DF$Index, DT.score$Index)]
SelDT.DF$Opinion <- "Neutral"
SelDT.DF$Opinion[which(SelDT.DF$score>0)] <- "Good"
SelDT.DF$Opinion[which(SelDT.DF$score<0)] <- "Bad"

usmap<-map('state')
points(SelDT.DF$longitude[which(SelDT.DF$Opinion=="Neutral")], SelDT.DF$latitude[which(SelDT.DF$Opinion=="Neutral")], pch=19, col="grey", cex=0.5)
points(SelDT.DF$longitude[which(SelDT.DF$Opinion=="Bad")], SelDT.DF$latitude[which(SelDT.DF$Opinion=="Bad")], pch=19, col="red", cex=0.5)
points(SelDT.DF$longitude[which(SelDT.DF$Opinion=="Good")], SelDT.DF$latitude[which(SelDT.DF$Opinion=="Good")], pch=19, col="steelblue", cex=0.5)