sebastianbarfort / sds

Social Data Science, course at University of Copenhagen
http://sebastianbarfort.github.io/sds/
12 stars 17 forks source link

Group 10: Assignment 2 #53

Closed sophiewenzelhorsted closed 8 years ago

sophiewenzelhorsted commented 8 years ago

---
title: "Assignment 2 - Bribes in India"
author: "Group 10"
date: "9. nov. 2015"
output:
  html_document:
    fig_caption: yes
    fig_height: 6
    fig_width: 7
  word_document: no
---


library(dplyr)
library(XML)
library(RCurl)
library(knitr)
library(rvest)
library(readr)
library(ggplot2)
library(zoo)
library(stringr)
library(countrycode)
library(StandardizeText)
library(rex)
library(data.table)
library(ggmap)
library(maps)
library(tidyr)
library(lubridate)
library(reshape2)
options(scipen=999)

//:(#ASSIGNMENT 2) //:(#Data rename)


# Read the group's scraped bribe reports from the GitHub copy of the data set.
# Earlier local source, kept for reference:
# read.delim("~/Dropbox/Polit/E15/SDS10/data.til.ass.2.version3.csv")
data.til.ass.2.version3 <- read.delim(
  file = "https://raw.githubusercontent.com/RasmusRJensen/sds_as2_data_group10/master/data.as.2.version3.csv"
)

# Working copy that the rest of the script operates on.
df <- data.til.ass.2.version3

//:(#Generating correct variables)

# Clean the raw text columns: turn `date` into a Date and extract the numeric
# part of `paid` (stored as `paid.amount`) and `views`.

# English month names are replaced by month numbers before parsing, so that
# as.Date() does not have to match month names (which %B does per locale).
# All twelve months are mapped; previously only October and November were,
# so a report dated in any other month silently became NA.
df$date <- as.character(df$date)
df$date <- str_replace_all(df$date, ",", "")
df$date <- str_replace_all(
  df$date,
  setNames(as.character(seq_along(month.name)), month.name)
)
df$date <- as.Date(df$date, format = "%m %d %Y")

# tidyr::extract_numeric() is deprecated. The same rule is applied here with
# base R: drop every character that is not a digit, "." or "-", then coerce.
# Values without any digits become NA.
df$paid.amount <- as.numeric(gsub("[^0-9.-]+", "", as.character(df$paid)))
df$views <- as.numeric(gsub("[^0-9.-]+", "", as.character(df$views)))

//:(#Frequency by sector)

# Count reports per sector, largest group first. na.omit() drops every row
# that has an NA in any column before counting.
sector <- df %>%
  group_by(sector) %>%
  na.omit() %>%
  tally(sort = TRUE)

//:(#Very few observations in some of the sectors, so combine them to other_sector)

# Collapse sectors with 20 or fewer reports into a single "Other sector"
# category; all other sectors keep their own name in `sector_2`.

# as.character() replaces the direct call to the as.character.factor method,
# which is meant for factors only and misbehaves when the column is already
# character (the read.delim() default since R 4.0).
sector$sector <- as.character(sector$sector)

# ifelse() decides row by row. The previous two-step assignment put the
# full-length `sector$sector` vector into a subset, which only produced the
# right labels because the rows happened to be sorted by descending count.
sector$sector_2 <- ifelse(sector$n <= 20, "Other sector", sector$sector)

//:(#Frequency by issue)

# Count reports per issue, largest group first. na.omit() drops every row
# that has an NA in any column before counting.
issue <- df %>%
  group_by(issue) %>%
  na.omit() %>%
  tally(sort = TRUE)

//:(#Very few observations in some of the issues, so combine them to other_issue)

# Collapse issues with 20 or fewer reports into a single "Other issue"
# category; all other issues keep their own name in `issue_2`.

# as.character() replaces the direct call to the as.character.factor method,
# which is meant for factors only and misbehaves when the column is already
# character (the read.delim() default since R 4.0).
issue$issue <- as.character(issue$issue)

# ifelse() decides row by row. The previous two-step assignment put the
# full-length `issue$issue` vector into a subset, which only produced the
# right labels because the rows happened to be sorted by descending count.
issue$issue_2 <- ifelse(issue$n <= 20, "Other issue", issue$issue)

# Attach the collapsed categories (sector_2, issue_2) to every report.
df_1 <- df %>%
  merge(sector, by = "sector") %>%
  merge(issue, by = "issue")

# Both lookup tables carry a count column `n`; merge() suffixes the clash as
# n.x / n.y. Neither is needed from here on.
df_1$n.x <- NULL
df_1$n.y <- NULL

//:(#Delete outlier)

# Remove the single report treated as an outlier.
# NOTE(review): the row is selected by position (39), so this relies on the
# row order of df_1 at this point - confirm it still targets the intended
# report if the input data or the preceding merges change.
df_1 <- df_1[-39, ]

//:(#Group by sector and calculate avg. paid amount)

# Mean amount paid within each (collapsed) sector.
paid_sector <- aggregate(paid.amount ~ sector_2, data = df_1, FUN = mean)

//:(#Group by issue and calculate acg. paid amount )

# Mean amount paid within each (collapsed) issue.
paid_issue <- aggregate(paid.amount ~ issue_2, data = df_1, FUN = mean)

//:(#Group by sector+issue and calculate avg. paid amount)

# Mean amount paid for every observed sector / issue combination.
paid_sector_issue <- aggregate(
  paid.amount ~ sector_2 + issue_2,
  data = df_1,
  FUN = mean
)

In the following we present four data visualizations that outline the most important information about reported bribery in India. The data were collected on November 5th, 2015, from the 1000 most recent reports on www.ipaidabribe.com. In figure 1 and figure 2 we have chosen to create the categories "Other issue" and "Other sector", in which we have aggregated the groups with twenty or fewer cases of bribery. This is done to give a better overview of the data. The code for scraping is included at the end of the .Rmd file.

//:(#Cross between issue and paid amount)

# Figure 1: mean amount paid per issue, bars ordered by amount and drawn
# horizontally.
p1 <- ggplot(
  paid_issue,
  aes(x = reorder(issue_2, -paid.amount), y = paid.amount)
) +
  geom_bar(stat = "identity", fill = "black") +
  labs(x = NULL, y = "INR", title = "Figure 1. Amount paid, regarding issue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()
p1

Figure 1 shows that most of the bribes have been regarding New PAN Cards followed by Registration of Property and Traffic Violations.

//:(#Cross between sector and paid amount )

# Figure 2: mean amount paid per sector, bars ordered by amount and drawn
# horizontally.
p2 <- ggplot(
  paid_sector,
  aes(x = reorder(sector_2, -paid.amount), y = paid.amount)
) +
  geom_bar(stat = "identity", fill = "black") +
  labs(x = NULL, y = "INR", title = "Figure 2. Amount paid, regarding sector") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_flip()
p2

Figure 2 shows that Income tax is the sector where most of the bribery takes place. This is very much in line with the fact that new PAN cards were the issue for most bribes according to figure 1. The Permanent Account Number (PAN) card proves that an individual is an income tax assessee, but according to the data, taxpayers resort to paying bribes to get the PAN card faster.

# Split `location` at the comma: `city` is the text before the first comma,
# `state` is the text after the last comma.
df_1 <- df_1 %>%
  mutate(
    city = gsub(",.*", "", location),
    state = gsub(".*,", "", location)
  )

//:(#Remove whitespace - see http://stackoverflow.com/questions/2261079/how-to-trim-leading-and-trailing-whitespace-in-r)

# Strip leading and trailing whitespace from every element of `x`.
trim <- function(x) {
  edge_whitespace <- "^\\s+|\\s+$"
  gsub(pattern = edge_whitespace, replacement = "", x = x)
}
# Clean the city names so they can be matched against the city lookup table.
df_1[["city"]] <- trim(df_1[["city"]])

# Build per-city bribe statistics (`pp1`) and attach coordinates from the
# world.cities data set shipped with the maps package.

# Indian cities only, with column names aligned to df_1 (`city`).
data(world.cities, package = "maps")
india_cities <- world.cities %>%
  filter(country.etc == "India")
names(india_cities) <- c("city", "country", "pop", "lat", "long", "capital")

# Use the current names for New Delhi and Bombay.
india_cities$city <- gsub(
  pattern = "New Delhi", replacement = "Delhi", x = india_cities$city
)
india_cities$city <- gsub(
  pattern = "Bombay", replacement = "Mumbai", x = india_cities$city
)

# A city name must be unique in the lookup; otherwise left_join() repeats the
# matching pp1 row once per duplicate and the city is drawn several times.
# The rename above can itself create such a duplicate, so only the most
# populous entry per name is kept.
india_cities <- india_cities %>%
  arrange(desc(pop)) %>%
  distinct(city, .keep_all = TRUE)

# Reports per city plus mean / median amount. Missing amounts are skipped
# (na.rm = TRUE), in line with the aggregate() calls used for figures 1 and 2,
# which also drop NA; without it one missing amount makes a city's mean NA.
pp1 <- df_1 %>%
  group_by(city) %>%
  summarise(
    n = n(),
    mean_amount = mean(paid.amount, na.rm = TRUE),
    median_amount = median(paid.amount, na.rm = TRUE)
  )

# Cities without a match in the lookup keep NA coordinates.
pp1 <- left_join(pp1, india_cities, by = "city")

# Figure 3: map of India with one point per city, sized by mean amount paid.
ggmap(get_map(location = 'India', zoom = 5)) +
  geom_point(
    data = pp1,
    mapping = aes(x = long, y = lat, size = mean_amount)
  ) +
  labs(x = NULL, y = NULL, title = "Figure 3. Mean amount paid")

Figure 3 shows that the largest average amounts are paid in Vidisha, Gonda and Baran. Note that the mean bribe amount is relatively small in the two large cities Mumbai and Delhi. One explanation for this could be that the size of the city increases the 'competition' in bribes: the 'supply' of people to bribe is larger. The large number of people could also increase the pool of honest people, so that there are more opportunities to avoid paying bribes.

# Figure 4: map of India with one point per city, sized by number of reports.

# Rename the count column by name rather than by position, so the right
# column is relabelled even if the column order of pp1 changes.
names(pp1)[names(pp1) == "n"] <- "Frequency"

ggmap(get_map(location = 'India', zoom = 5)) +
  geom_point(
    data = pp1,
    mapping = aes(x = long, y = lat, size = Frequency)
  ) +
  labs(x = NULL, y = NULL, title = "Figure 4. Number of cases")

Figure 4 shows that most bribery cases are taking place in Bangalore, which is the capital of the Indian state of Karnataka. Bangalore is the third most populous city in India and is the leading IT exporter. The reason why Bangalore tops the list of bribery cases is not necessarily because it is more corrupt, but could be because of better awareness about the website, www.ipaidabribe.com.

It is also worth noticing that the number of bribes is relatively small in Mumbai and Delhi, which goes against the notion that the supply of people to bribe is large. On the other hand, this could be explained by a larger pool of honest people. It should be noted that this goes against the numbers from Bangalore.

##Code for scrape
#install.packages("XML")
#install.packages("RCurl")
#install.packages("rvest")
#install.packages("stringr")

#library(dplyr)
#library(rvest)
#library(rvest)
#library(XML)
#library(RCurl)
#library(knitr)
#library(rvest)
#library(plyr)

#subpages=c(10,20,30,40,50,60,70,80,90,100)
#subpages<-seq(from = 10, to = 1000, by = 10)

##Create url for subpages (10-1000)
#for(i in 1:100){
#  url="http://www.ipaidabribe.com/reports/paid?page="
#  del1=subpages[i]
#  pages=paste(url,subpages,sep="")}
#  links=pages

##Create function for scraping
#scrape_bribe = function(link){
#  my.link = read_html(link, encoding = "UTF-8")
#    title = my.link %>% 
#    html_nodes(".heading-3 a") %>% html_text() 
#    sector = my.link %>% 
#    html_nodes(".name a") %>% html_text() 
#    issue = my.link %>%
#    html_nodes(".transaction a") %>% html_text()
#    paid = my.link %>%
#    html_nodes(".paid-amount span") %>% html_text()
#    date = my.link %>%
#    html_nodes(".date") %>% html_text()
#    views = my.link %>%
#    html_nodes(".overview .views") %>% html_text
#    location = my.link %>%
#    html_nodes(".location") %>% html_text
#    report = my.link %>%
#    html_nodes(".unique-reference") %>% html_text

#return(cbind(title, sector, issue, paid, date, views, location, report))
#}

##Create loop for scraping
#my.bribe.data = list() # initialize empty list
#for (i in links[1:100]){
  #print(paste("http://www.econ.ku.dk/ansatte/vip/", i, sep = ""))
#  my.bribe.data[[i]] = scrape_bribe(i)
  # waiting one second between hits
#  Sys.sleep(1)
#  cat(" done!\n")
#}

#df <- ldply(my.bribe.data)
sebastianbarfort commented 8 years ago

Hi Sophie and co,

Good assignment. I like your use of ggmap!

You could perhaps discuss the low number of bribes in the large cities a bit more. Also, you're loading a lot of packages that you don't really use. These are small things though, generally the assignment is well done.

APPROVED