sebastianbarfort / sds

Social Data Science, course at University of Copenhagen
http://sebastianbarfort.github.io/sds/
12 stars 17 forks source link

Group 8: Assignment 1 #22

Closed sundgaard closed 8 years ago

sundgaard commented 8 years ago
ASSIGNMENT 1: DATA MANIPULATION AND VISUALIZATION

Group 8

Hand-in, October 8 2015

Group Members:

Dennis Hansen

Susanne Sundgaard Hansen

Oskar Harmsen

Ann-Sofie Hansen

Fix international characters (only works on MAC)

Sys.setlocale("LC_ALL", "pt_PT.UTF-8")

Load packages

library("readr") library("lubridate") library("dplyr") library("ggplot2") library("XML") library("httr") library("stringr")

Load file

df = tbl_df(read.csv("https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"))

Question 1-3: Create and plot a dataframe with stock of paintings for each month in the year

Data cleaning

#Select relevant columns and remove observations without dates
df1 <- df %>%
  select(DateAcquired, CuratorApproved, Department, Title)%>%
  filter(!is.na(DateAcquired) & DateAcquired!="")%>%
  arrange(DateAcquired)

#Correct a wrong entry in the data and rearrange
df1$DateAcquired[1:5] <- "2009-11-17"     #Note: the wrong entries show out on top
df1 <- df1 %>% arrange(DateAcquired)      #Rearrange data

#Mark class as dates
df1$DateAcquired <- as.Date(df1$DateAcquired) # Mark as date
df1 <- df1 %>% filter(!is.na(DateAcquired)) #Remove observations that are now NA (one is wrongly entered as "1941")

Creating the data frame

df2 <- df1 %>%

Map every observation to the ending date of that month

    mutate(month = ceiling_date(DateAcquired, unit = "month")) %>%
    group_by(as.factor(as.character(month)), CuratorApproved) %>%
    #Create new dataframe that counts all NEW acquisitions by month and curator status 
    summarise(count.month = n())%>%  
    ungroup()%>%
    # Add column that cumulates the total stock across time
    mutate(cum.count = cumsum(count.month))%>%
    # Add column that cumulates by curator status
    group_by(CuratorApproved)%>%
    mutate(cum.count.cur = cumsum(count.month))

Rename dataframe and set as class date

colnames(df2)[1] <- "month"
df2$month <- as.Date(df2$month)

Make a plot of the total stock

Base line plot of stock through time

p <-  ggplot(df2, aes(x=month, y=cum.count))+
  geom_line(aes(color="red"))+
  scale_x_date(breaks="10 year")+
  ggtitle("Cumulative Acquistions, MoMA, by date")+
  xlab("Date")+
  ylab("Total number of artworks acquired, to date")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))+
  guides(color=FALSE)
p

Base line plot of stock through time, by Curator Approval

p <-  ggplot(df2, aes(x=month, y=cum.count.cur, group=CuratorApproved))+
  geom_line(aes(color=CuratorApproved))+
  scale_x_date(breaks="10 year")+
  ggtitle("Cumulative Acquistions, MoMA, by date and Curator Approval")+
  xlab("Date")+
  ylab("Total number of artworks acquired, to date")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))+
  guides(color=guide_legend(reverse=TRUE, title="Approved \nby curator"))
p

Question 4-5: Plot by department

#Preparing dataframe (syntax and approach similar to question 1-3)
df3 <-  df1 %>%
  mutate(month = floor_date(DateAcquired, unit = "month")) %>%
  group_by(as.factor(as.character(month)), Department)%>%
  summarise(count.month = n())%>%
  ungroup()%>%
  mutate(cum.count = cumsum(count.month))%>%
  group_by(Department)%>%
  mutate(cum.count.dep = cumsum(count.month))

colnames(df3)[1] <- "month"
df3$month <- as.Date(df3$month)

#Plot line graph by department
p <-  ggplot(df3, aes(x=month, y=cum.count.dep, group=Department))+
  geom_line(aes(color=Department))+
  scale_x_date(breaks = "10 year")+
  ggtitle("Cumulative Acquistions, MoMA, by department and date")+
  xlab("Date")+
  ylab("Total number of artworks acquired, to date")+
  theme(plot.title = element_text(lineheight=.8, face="bold"), legend.position="bottom")
p

print("'Prints and Illustrated books' has had the largest increase.")

Question 6: List top 10 artists by number of paintings

#Summarize works by artist
df6 <-  df %>%
        filter(Artist!="") %>% #Remove rows without specified artist
        group_by(Artist) %>% #Group by artist
        summarise(n_works = n()) %>% #Count works by each artist
        ungroup()%>%
        arrange(desc((n_works))) #Arrange by most works
df6[1:10,]  #List top 10

7: List birth place of painters and color world map accordingly

#Data cleaning

  #Select relevant columns and remove observations without dates
  df7 <-  df %>%
    select(Artist, ArtistBio) %>% #Select relevant variables
    filter(Artist!="" & ArtistBio!="") %>% #Remove rows without specified artist
    #Filter to obtain only one row per artist
    group_by(Artist) %>% #Group by artist
    mutate( rank = 1:n() ) %>%  #Create a vector counting works by each artist  
    filter( rank == max(rank) ) %>% #Choose only the top observation (arbitrary method)
    ungroup() %>%
    arrange(ArtistBio) #Arrange ArtistBio

  # For some works, there are several artists from different countries.
  # This code will only list the country of the first artist
  # It will only list countries where there are at least two artists from

  #Remove everything after commas
  df7$ArtistBio2 <- tolower(df7$ArtistBio)
  df7$ArtistBio2 <- gsub(",.*", "" , df7$ArtistBio2)

  #Remove all parantheses
  df7$ArtistBio2 <- gsub("\\(", "" , df7$ArtistBio2)
  df7$ArtistBio2 <- gsub("\\)", "" , df7$ArtistBio2)

  #Remove errors in the data with "various" for multiple artists
  df7$ArtistBio2 <- gsub("various ", "" , df7$ArtistBio2)
  df7$ArtistBio2 <- gsub("various", "" , df7$ArtistBio2)

#Count artists by country to get simpler dataframe
  df7.1 <- df7 %>% 
    group_by(ArtistBio2) %>% 
    summarize(count = n()) %>% 
    arrange(desc(count))
  df7.1

#Correct simplest mistakes for top countries, and delete remaining

  #Rename rows
  df7.1$ArtistBio2[25] <- "american"
  df7.1$ArtistBio2[52] <- "german"
  df7.1$ArtistBio2[55] <- "russian"
  df7.1$ArtistBio2[59] <- "italian"
  df7.1$ArtistBio2[63] <- "british"
  df7.1$ArtistBio2[70] <- "british"
  df7.1$ArtistBio2[74] <- "french"
  df7.1$ArtistBio2[79] <- "cuban"
  df7.1$ArtistBio2[80] <- "american"
  df7.1$ArtistBio2[90] <- "dutch"
  df7.1$ArtistBio2[92] <- "zimbabwean"

  #Rows to be deleted
  delete <- c(7, 71, 77)
  df7.1 <- df7.1[-delete, ]

  #Filter away rows that have too few observations (but many errors) and regroup
  df7.1 <-  df7.1 %>% 
            filter(count>=3) %>% 
            ungroup() %>% 
            group_by(ArtistBio2) %>%
            summarize(count = sum(as.numeric(count))) %>% 
            arrange(desc(count))
  names(df7.1)[1] <- "demonymic"

  #Succes rate
  paste("Succesfully isolated the artist origins of ", sum(df7.1$count)/nrow(df6)*100,"% of total artworks")

  #Get relation between country names and what people from there are called

    #Download relation table from wikipedia
    url <- "http://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations"
    tabs <- GET(url)
    tabs <- readHTMLTable(rawToChar(tabs$content), stringsAsFactors = F, header = TRUE)
    countries <- as.data.frame(tabs[1], header=TRUE)

    #Clean table as data.frame
    #Fix headers
    names(countries) <- c("country_name", "demonymic", "Colloquial", "NA")
    countries <- countries[-1, ]
    #Filter
    countries <- select(countries, country_name, demonymic)
    #Clean
    countries$demonymic <- tolower(countries$demonymic)
    #Remove Wiki-brackets
    countries$demonymic <- gsub("\\[.*\\]", "", x = countries$demonymic) #Remove Wiki-links
    countries$country_name <- gsub("\\[.*\\]", "", x = countries$country_name) #Remove Wiki-links
    countries[ ,3:5] <- str_split_fixed(countries$demonymic, ", ", 3) #Split several options into different rows

  #Join datasets
  df7.2 <- left_join(df7.1, countries, by=c("demonymic"="V3"))

  #Fix single countries that are not correctly matched
  df7.2$country_name[1] <- "USA"
  df7.2$country_name[27] <- "USA"
  df7.2$country_name[36] <- "South Korea"
  df7.2$country_name[38] <- "Yugoslavia"
  df7.2$country_name[40] <- "Scotland"
  df7.2$country_name[53] <- "New Zealand"
  df7.2$country_name[61] <- "Japan"
  df7.2$country_name[65] <- "Taiwan"
  df7.2$country_name[73] <- "Luxembourg"

  #Get world map coordinates
  map=map_data("world")
  names(map)=c("lon", "lat", "group", "order", "Country", "subregion")

  #Join datasets
  df7.3 <- left_join(map, df7.2, by=c("Country"="country_name"))

  #Plot world map
  p <- ggplot(df7.3, aes(x=lon, y=lat, group=group, fill=df7.3$count))+
    geom_polygon()+
    scale_fill_gradient("", trans="log10")+ 
    ggtitle("World map: MoMA artworks")+
    theme(plot.title = element_text(lineheight=.8, face="bold"))
  p

8: The five largest and five smallest paintings in MoMA's collection

  df8 <- df %>%
    select(Dimensions, Title)%>%
    filter(!is.na(Dimensions) & Dimensions!="")%>%
    arrange(Dimensions)

  ##Isolates the numbers from within the parenthesis, the dimensions in cms##
  df8$dim <- sub(".*?\\(([\\d]+\\.?[\\d]*)\\s[xX\xD7]\\s([\\d]+\\.?[\\d]*).*", 
                 "\\1 \\2", df8$Dimensions, perl = TRUE)

  ##Removes irregular observations and splits string in height and width##
  df8 <- filter(df8, grepl("[\\d]+\\.?[\\d]*\\s[\\d]+\\.?[\\d]*$", df8$dim, perl = TRUE))
  df8 <- filter(df8, grepl("^[\\d.]*\\s[\\d.]+$", df8$dim, perl = TRUE))
  df8$Height <- as.numeric(str_split_fixed(df8$dim, " ", 2)[,1])
  df8$Width <- as.numeric(str_split_fixed(df8$dim, " ", 2)[,2])

  ##Creates size-vector##
  df8$size <- df8$Height * df8$Width

  ##Prints the five largest and five smallest artworks##
  df8 <- df8 %>%
    group_by(size) %>%
    arrange(-size) %>%
    ungroup %>% 
    select(Title, Height, Width, size)

  #Five smallest artworks in MoMA
  head(df8, n=5)

  #Five largest artworks in MoMA
  df8 <- df8 %>% arrange(desc(size)) 
  head(df8, n=5)
sebastianbarfort commented 8 years ago

Very very assignment!

Great use of comments and really nice use of dplyr verbs (note there's also a rename function in the package).

Keep up the good work.

APPROVED