Question 1-3: Create and plot a dataframe with stock of paintings for each month in the year
Data cleaning
#Select relevant columns and remove observations without dates
df1 <- df %>%
select(DateAcquired, CuratorApproved, Department, Title)%>%
filter(!is.na(DateAcquired) & DateAcquired!="")%>%
arrange(DateAcquired)
#Correct a wrong entry in the data and rearrange
df1$DateAcquired[1:5] <- "2009-11-17" #Note: the wrong entries show out on top
df1 <- df1 %>% arrange(DateAcquired) #Rearrange data
#Mark class as dates
df1$DateAcquired <- as.Date(df1$DateAcquired) # Mark as date
df1 <- df1 %>% filter(!is.na(DateAcquired)) #Remove observations that are now NA (one is wrongly entered as "1941")
Creating the data frame
df2 <- df1 %>%
Map every observation to the ending date of that month
mutate(month = ceiling_date(DateAcquired, unit = "month")) %>%
group_by(as.factor(as.character(month)), CuratorApproved) %>%
#Create new dataframe that counts all NEW acquisitions by month and curator status
summarise(count.month = n())%>%
ungroup()%>%
# Add column that cumulates the total stock across time
mutate(cum.count = cumsum(count.month))%>%
# Add column that cumulates by curator status
group_by(CuratorApproved)%>%
mutate(cum.count.cur = cumsum(count.month))
p <- ggplot(df2, aes(x=month, y=cum.count))+
geom_line(aes(color="red"))+
scale_x_date(breaks="10 year")+
ggtitle("Cumulative Acquistions, MoMA, by date")+
xlab("Date")+
ylab("Total number of artworks acquired, to date")+
theme(plot.title = element_text(lineheight=.8, face="bold"))+
guides(color=FALSE)
p
Base line plot of stock through time, by Curator Approval
p <- ggplot(df2, aes(x=month, y=cum.count.cur, group=CuratorApproved))+
geom_line(aes(color=CuratorApproved))+
scale_x_date(breaks="10 year")+
ggtitle("Cumulative Acquistions, MoMA, by date and Curator Approval")+
xlab("Date")+
ylab("Total number of artworks acquired, to date")+
theme(plot.title = element_text(lineheight=.8, face="bold"))+
guides(color=guide_legend(reverse=TRUE, title="Approved \nby curator"))
p
Question 4-5: Plot by department
#Preparing dataframe (syntax and approach similar to question 1-3)
df3 <- df1 %>%
mutate(month = floor_date(DateAcquired, unit = "month")) %>%
group_by(as.factor(as.character(month)), Department)%>%
summarise(count.month = n())%>%
ungroup()%>%
mutate(cum.count = cumsum(count.month))%>%
group_by(Department)%>%
mutate(cum.count.dep = cumsum(count.month))
colnames(df3)[1] <- "month"
df3$month <- as.Date(df3$month)
#Plot line graph by department
p <- ggplot(df3, aes(x=month, y=cum.count.dep, group=Department))+
geom_line(aes(color=Department))+
scale_x_date(breaks = "10 year")+
ggtitle("Cumulative Acquistions, MoMA, by department and date")+
xlab("Date")+
ylab("Total number of artworks acquired, to date")+
theme(plot.title = element_text(lineheight=.8, face="bold"), legend.position="bottom")
p
print("'Prints and Illustrated books' has had the largest increase.")
Question 6: List top 10 artists by number of paintings
#Summarize works by artist
df6 <- df %>%
filter(Artist!="") %>% #Remove rows without specified artist
group_by(Artist) %>% #Group by artist
summarise(n_works = n()) %>% #Count works by each artist
ungroup()%>%
arrange(desc((n_works))) #Arrange by most works
df6[1:10,] #List top 10
7: List birth place of painters and color world map accordingly
#Data cleaning
#Select relevant columns and remove observations without dates
df7 <- df %>%
select(Artist, ArtistBio) %>% #Select relevant variables
filter(Artist!="" & ArtistBio!="") %>% #Remove rows without specified artist
#Filter to obtain only one row per artist
group_by(Artist) %>% #Group by artist
mutate( rank = 1:n() ) %>% #Create a vector counting works by each artist
filter( rank == max(rank) ) %>% #Choose only the top observation (arbitrary method)
ungroup() %>%
arrange(ArtistBio) #Arrange ArtistBio
# For some works, there are several artists from different countries.
# This code will only list the country of the first artist
# It will only list countries where there are at least two artists from
#Remove everything after commas
df7$ArtistBio2 <- tolower(df7$ArtistBio)
df7$ArtistBio2 <- gsub(",.*", "" , df7$ArtistBio2)
#Remove all parantheses
df7$ArtistBio2 <- gsub("\\(", "" , df7$ArtistBio2)
df7$ArtistBio2 <- gsub("\\)", "" , df7$ArtistBio2)
#Remove errors in the data with "various" for multiple artists
df7$ArtistBio2 <- gsub("various ", "" , df7$ArtistBio2)
df7$ArtistBio2 <- gsub("various", "" , df7$ArtistBio2)
#Count artists by country to get simpler dataframe
df7.1 <- df7 %>%
group_by(ArtistBio2) %>%
summarize(count = n()) %>%
arrange(desc(count))
df7.1
#Correct simplest mistakes for top countries, and delete remaining
#Rename rows
df7.1$ArtistBio2[25] <- "american"
df7.1$ArtistBio2[52] <- "german"
df7.1$ArtistBio2[55] <- "russian"
df7.1$ArtistBio2[59] <- "italian"
df7.1$ArtistBio2[63] <- "british"
df7.1$ArtistBio2[70] <- "british"
df7.1$ArtistBio2[74] <- "french"
df7.1$ArtistBio2[79] <- "cuban"
df7.1$ArtistBio2[80] <- "american"
df7.1$ArtistBio2[90] <- "dutch"
df7.1$ArtistBio2[92] <- "zimbabwean"
#Rows to be deleted
delete <- c(7, 71, 77)
df7.1 <- df7.1[-delete, ]
#Filter away rows that have too few observations (but many errors) and regroup
df7.1 <- df7.1 %>%
filter(count>=3) %>%
ungroup() %>%
group_by(ArtistBio2) %>%
summarize(count = sum(as.numeric(count))) %>%
arrange(desc(count))
names(df7.1)[1] <- "demonymic"
#Succes rate
paste("Succesfully isolated the artist origins of ", sum(df7.1$count)/nrow(df6)*100,"% of total artworks")
#Get relation between country names and what people from there are called
#Download relation table from wikipedia
url <- "http://en.wikipedia.org/wiki/List_of_adjectival_and_demonymic_forms_for_countries_and_nations"
tabs <- GET(url)
tabs <- readHTMLTable(rawToChar(tabs$content), stringsAsFactors = F, header = TRUE)
countries <- as.data.frame(tabs[1], header=TRUE)
#Clean table as data.frame
#Fix headers
names(countries) <- c("country_name", "demonymic", "Colloquial", "NA")
countries <- countries[-1, ]
#Filter
countries <- select(countries, country_name, demonymic)
#Clean
countries$demonymic <- tolower(countries$demonymic)
#Remove Wiki-brackets
countries$demonymic <- gsub("\\[.*\\]", "", x = countries$demonymic) #Remove Wiki-links
countries$country_name <- gsub("\\[.*\\]", "", x = countries$country_name) #Remove Wiki-links
countries[ ,3:5] <- str_split_fixed(countries$demonymic, ", ", 3) #Split several options into different rows
#Join datasets
df7.2 <- left_join(df7.1, countries, by=c("demonymic"="V3"))
#Fix single countries that are not correctly matched
df7.2$country_name[1] <- "USA"
df7.2$country_name[27] <- "USA"
df7.2$country_name[36] <- "South Korea"
df7.2$country_name[38] <- "Yugoslavia"
df7.2$country_name[40] <- "Scotland"
df7.2$country_name[53] <- "New Zealand"
df7.2$country_name[61] <- "Japan"
df7.2$country_name[65] <- "Taiwan"
df7.2$country_name[73] <- "Luxembourg"
#Get world map coordinates
map=map_data("world")
names(map)=c("lon", "lat", "group", "order", "Country", "subregion")
#Join datasets
df7.3 <- left_join(map, df7.2, by=c("Country"="country_name"))
#Plot world map
p <- ggplot(df7.3, aes(x=lon, y=lat, group=group, fill=df7.3$count))+
geom_polygon()+
scale_fill_gradient("", trans="log10")+
ggtitle("World map: MoMA artworks")+
theme(plot.title = element_text(lineheight=.8, face="bold"))
p
8: The five largest and five smallest paintings in MoMA's collection
df8 <- df %>%
select(Dimensions, Title)%>%
filter(!is.na(Dimensions) & Dimensions!="")%>%
arrange(Dimensions)
##Isolates the numbers from within the parenthesis, the dimensions in cms##
df8$dim <- sub(".*?\\(([\\d]+\\.?[\\d]*)\\s[xX\xD7]\\s([\\d]+\\.?[\\d]*).*",
"\\1 \\2", df8$Dimensions, perl = TRUE)
##Removes irregular observations and splits string in height and width##
df8 <- filter(df8, grepl("[\\d]+\\.?[\\d]*\\s[\\d]+\\.?[\\d]*$", df8$dim, perl = TRUE))
df8 <- filter(df8, grepl("^[\\d.]*\\s[\\d.]+$", df8$dim, perl = TRUE))
df8$Height <- as.numeric(str_split_fixed(df8$dim, " ", 2)[,1])
df8$Width <- as.numeric(str_split_fixed(df8$dim, " ", 2)[,2])
##Creates size-vector##
df8$size <- df8$Height * df8$Width
##Prints the five largest and five smallest artworks##
df8 <- df8 %>%
group_by(size) %>%
arrange(-size) %>%
ungroup %>%
select(Title, Height, Width, size)
#Five smallest artworks in MoMA
head(df8, n=5)
#Five largest artworks in MoMA
df8 <- df8 %>% arrange(desc(size))
head(df8, n=5)
ASSIGNMENT 1: DATA MANIPULATION AND VISUALIZATION
Group 8
Hand-in, October 8 2015
Group Members:
Dennis Hansen
Susanne Sundgaard Hansen
Oskar Harmsen
Ann-Sofie Hansen
Fix international characters (only works on MAC)
Sys.setlocale("LC_ALL", "pt_PT.UTF-8")
Load packages
library("readr") library("lubridate") library("dplyr") library("ggplot2") library("XML") library("httr") library("stringr")
Load file
df = tbl_df(read.csv("https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"))
Question 1-3: Create and plot a dataframe with stock of paintings for each month in the year
Data cleaning
Creating the data frame
df2 <- df1 %>%
Map every observation to the ending date of that month
Rename dataframe and set as class date
Make a plot of the total stock
Base line plot of stock through time
Base line plot of stock through time, by Curator Approval
Question 4-5: Plot by department
Question 6: List top 10 artists by number of paintings
7: List birth place of painters and color world map accordingly
8: The five largest and five smallest paintings in MoMA's collection