ASSIGNMENT 1: DATA MANIPULATION AND VISUALIZATION

Group 8

Hand-in, October 8 2015

Group Members:

Dennis Hansen

Susanne Sundgaard Hansen

Oskar Harmsen

Ann-Sofie Hansen

Fix international characters (only works on MAC)

Sys.setlocale("LC_ALL", "pt_PT.UTF-8")

Load packages

library("readr") library("lubridate") library("dplyr") library("ggplot2") library("XML") library("httr") library("stringr")

Load file

df = tbl_df(read.csv("https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"))

Question 1-3: Create and plot a dataframe with stock of paintings for each month in the year

Data cleaning

#Select relevant columns and remove observations without dates
df1 <- df %>%
  select(DateAcquired, CuratorApproved, Department, Title)%>%
  filter(!is.na(DateAcquired) & DateAcquired!="")%>%
  arrange(DateAcquired)

#Correct a wrong entry in the data and rearrange
df1$DateAcquired[1:5] <- "2009-11-17"     #Note: the wrong entries show out on top
df1 <- df1 %>% arrange(DateAcquired)      #Rearrange data

#Mark class as dates
df1$DateAcquired <- as.Date(df1$DateAcquired) # Mark as date
df1 <- df1 %>% filter(!is.na(DateAcquired)) #Remove observations that are now NA (one is wrongly entered as "1941")

Creating the data frame

df2 <- df1 %>%

Map every observation to the ending date of that month

    mutate(month = ceiling_date(DateAcquired, unit = "month")) %>%
    group_by(as.factor(as.character(month)), CuratorApproved) %>%
    #Create new dataframe that counts all NEW acquisitions by month and curator status 
    summarise(count.month = n())%>%  
    ungroup()%>%
    # Add column that cumulates the total stock across time
    mutate(cum.count = cumsum(count.month))%>%
    # Add column that cumulates by curator status
    group_by(CuratorApproved)%>%
    mutate(cum.count.cur = cumsum(count.month))

Rename dataframe and set as class date

colnames(df2)[1] <- "month"
df2$month <- as.Date(df2$month)

Make a plot of the total stock

Base line plot of stock through time

p <-  ggplot(df2, aes(x=month, y=cum.count))+
  geom_line(aes(color="red"))+
  scale_x_date(breaks="10 year")+
  ggtitle("Cumulative Acquistions, MoMA, by date")+
  xlab("Date")+
  ylab("Total number of artworks acquired, to date")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))+
  guides(color=FALSE)
p

Base line plot of stock through time, by Curator Approval

p <-  ggplot(df2, aes(x=month, y=cum.count.cur, group=CuratorApproved))+
  geom_line(aes(color=CuratorApproved))+
  scale_x_date(breaks="10 year")+
  ggtitle("Cumulative Acquistions, MoMA, by date and Curator Approval")+
  xlab("Date")+
  ylab("Total number of artworks acquired, to date")+
  theme(plot.title = element_text(lineheight=.8, face="bold"))+
  guides(color=guide_legend(reverse=TRUE, title="Approved \nby curator"))
p

Question 4-5: Plot by department

#Preparing dataframe (syntax and approach similar to question 1-3)
df3 <-  df1 %>%
  mutate(month = floor_date(DateAcquired, unit = "month")) %>%
  group_by(as.factor(as.character(month)), Department)%>%
  summarise(count.month = n())%>%
  ungroup()%>%
  mutate(cum.count = cumsum(count.month))%>%
  group_by(Department)%>%
  mutate(cum.count.dep = cumsum(count.month))

colnames(df3)[1] <- "month"
df3$month <- as.Date(df3$month)

#Plot line graph by department
p <-  ggplot(df3, aes(x=month, y=cum.count.dep, group=Department))+
  geom_line(aes(color=Department))+
  scale_x_date(breaks = "10 year")+
  ggtitle("Cumulative Acquistions, MoMA, by department and date")+
  xlab("Date")+
  ylab("Total number of artworks acquired, to date")+
  theme(plot.title = element_text(lineheight=.8, face="bold"), legend.position="bottom")
p

print("'Prints and Illustrated books' has had the largest increase.")

Question 6: List top 10 artists by number of paintings

#Summarize works by artist
df6 <-  df %>%
        filter(Artist!="") %>% #Remove rows without specified artist
        group_by(Artist) %>% #Group by artist
        summarise(n_works = n()) %>% #Count works by each artist
        ungroup()%>%
        arrange(desc((n_works))) #Arrange by most works
df6[1:10,]  #List top 10

7: List birth place of painters and color world map accordingly

#Data cleaning

  #Select relevant columns and remove observations without dates
  df7 <-  df %>%
    select(Artist, ArtistBio) %>% #Select relevant variables
    filter(Artist!="" & ArtistBio!="") %>% #Remove rows without specified artist
    #Filter to obtain only one row per artist
    group_by(Artist) %>% #Group by artist
    mutate( rank = 1:n() ) %>%  #Create a vector counting works by each artist  
    filter( rank == max(rank) ) %>% #Choose only the top observation (arbitrary method)
    ungroup() %>%
    arrange(ArtistBio) #Arrange ArtistBio

  # For some works, there are several artists from different countries.
  # This code will only list the country of the first artist
  # It will only list countries where there are at least two artists from

  #Remove everything after commas
  df7$ArtistBio2 <- tolower(df7$ArtistBio)
  df7$ArtistBio2 <- gsub(",.*", "" , df7$ArtistBio2)

  #Remove all parantheses
  df7$ArtistBio2 <- gsub("\\(", "" , df7$ArtistBio2)
  df7$ArtistBio2 <- gsub("\\)", "" , df7$ArtistBio2)

  #Remove errors in the data with "various" for multiple artists
  df7$ArtistBio2 <- gsub("various ", "" , df7$ArtistBio2)
  df7$ArtistBio2 <- gsub("various", "" , df7$ArtistBio2)

#Count artists by country to get simpler dataframe
  df7.1 <- df7 %>% 
    group_by(ArtistBio2) %>% 
    summarize(count = n()) %>% 
    arrange(desc(count))
  df7.1

#Correct simplest mistakes for top countries, and delete remaining

  #Rename rows
  df7.1$ArtistBio2[25] <- "american"
  df7.1$ArtistBio2[52] <- "german"
  df7.1$ArtistBio2[55] <- "russian"
  df7.1$ArtistBio2[59] <- "italian"
  df7.1$ArtistBio2[63] <- "british"
  df7.1$ArtistBio2[70] <- "british"
  df7.1$ArtistBio2[74] <- "french"
  df7.1$ArtistBio2[79] <- "cuban"
  df7.1$ArtistBio2[80] <- "american"
  df7.1$ArtistBio2[90] <- "dutch"
  df7.1$ArtistBio2[92] <- "zimbabwean"

  #Rows to be deleted
  delete <- c(7, 71, 77)
  df7.1 <- df7.1[-delete, ]

  #Filter away rows that have too few observations (but many errors) and regroup
  df7.1 <-  df7.1 %>% 
            filter(count>=3) %>% 
            ungroup() %>% 
            group_by(ArtistBio2) %>%
            summarize(count = sum(as.numeric(count))) %>% 
            arrange(desc(count))
  names(df7.1)[1] <- "demonymic"

  #Succes rate
  paste("Succesfully isolated the artist origins of ", sum(df7.1$count)/nrow(df6)*100,"% of total artworks")

  #Get relation between country names and what people from there are called

    #Download relation table from wikipedia
    url <- "http://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations"
    tabs <- GET(url)
    tabs <- readHTMLTable(rawToChar(tabs$content), stringsAsFactors = F, header = TRUE)
    countries <- as.data.frame(tabs[1], header=TRUE)

    #Clean table as data.frame
    #Fix headers
    names(countries) <- c("country_name", "demonymic", "Colloquial", "NA")
    countries <- countries[-1, ]
    #Filter
    countries <- select(countries, country_name, demonymic)
    #Clean
    countries$demonymic <- tolower(countries$demonymic)
    #Remove Wiki-brackets
    countries$demonymic <- gsub("\\[.*\\]", "", x = countries$demonymic) #Remove Wiki-links
    countries$country_name <- gsub("\\[.*\\]", "", x = countries$country_name) #Remove Wiki-links
    countries[ ,3:5] <- str_split_fixed(countries$demonymic, ", ", 3) #Split several options into different rows

  #Join datasets
  df7.2 <- left_join(df7.1, countries, by=c("demonymic"="V3"))

  #Fix single countries that are not correctly matched
  df7.2$country_name[1] <- "USA"
  df7.2$country_name[27] <- "USA"
  df7.2$country_name[36] <- "South Korea"
  df7.2$country_name[38] <- "Yugoslavia"
  df7.2$country_name[40] <- "Scotland"
  df7.2$country_name[53] <- "New Zealand"
  df7.2$country_name[61] <- "Japan"
  df7.2$country_name[65] <- "Taiwan"
  df7.2$country_name[73] <- "Luxembourg"

  #Get world map coordinates
  map=map_data("world")
  names(map)=c("lon", "lat", "group", "order", "Country", "subregion")

  #Join datasets
  df7.3 <- left_join(map, df7.2, by=c("Country"="country_name"))

  #Plot world map
  p <- ggplot(df7.3, aes(x=lon, y=lat, group=group, fill=df7.3$count))+
    geom_polygon()+
    scale_fill_gradient("", trans="log10")+ 
    ggtitle("World map: MoMA artworks")+
    theme(plot.title = element_text(lineheight=.8, face="bold"))
  p

8: The five largest and five smallest paintings in MoMA's collection

  df8 <- df %>%
    select(Dimensions, Title)%>%
    filter(!is.na(Dimensions) & Dimensions!="")%>%
    arrange(Dimensions)

  ##Isolates the numbers from within the parenthesis, the dimensions in cms##
  df8$dim <- sub(".*?\\(([\\d]+\\.?[\\d]*)\\s[xX\xD7]\\s([\\d]+\\.?[\\d]*).*", 
                 "\\1 \\2", df8$Dimensions, perl = TRUE)

  ##Removes irregular observations and splits string in height and width##
  df8 <- filter(df8, grepl("[\\d]+\\.?[\\d]*\\s[\\d]+\\.?[\\d]*$", df8$dim, perl = TRUE))
  df8 <- filter(df8, grepl("^[\\d.]*\\s[\\d.]+$", df8$dim, perl = TRUE))
  df8$Height <- as.numeric(str_split_fixed(df8$dim, " ", 2)[,1])
  df8$Width <- as.numeric(str_split_fixed(df8$dim, " ", 2)[,2])

  ##Creates size-vector##
  df8$size <- df8$Height * df8$Width

  ##Prints the five largest and five smallest artworks##
  df8 <- df8 %>%
    group_by(size) %>%
    arrange(-size) %>%
    ungroup %>% 
    select(Title, Height, Width, size)

  #Five smallest artworks in MoMA
  head(df8, n=5)

  #Five largest artworks in MoMA
  df8 <- df8 %>% arrange(desc(size)) 
  head(df8, n=5)

sebastianbarfort / sds

Group 8: Assignment 1 #22

ASSIGNMENT 1: DATA MANIPULATION AND VISUALIZATION

Group 8

Hand-in, October 8 2015

Group Members:

Dennis Hansen

Susanne Sundgaard Hansen

Oskar Harmsen

Ann-Sofie Hansen

Fix international characters (only works on MAC)

Load packages

Load file

Question 1-3: Create and plot a dataframe with stock of paintings for each month in the year

Data cleaning

Creating the data frame

Map every observation to the ending date of that month

Rename dataframe and set as class date

Make a plot of the total stock

Base line plot of stock through time

Base line plot of stock through time, by Curator Approval

Question 4-5: Plot by department

Question 6: List top 10 artists by number of paintings

7: List birth place of painters and color world map accordingly

8: The five largest and five smallest paintings in MoMA's collection