ggplot(monthly.dat, aes(x = month, y = No.Paintings)) +
geom_line(colour = "red") + # use lines to gauge ts fluctuations. Colour red
theme_minimal() +
ggtitle("Number of new acquiries \n by month") +
labs(x = "Month", y = "New acquiries") +
scale_x_continuous(breaks = 1:12)
stock for each month every year
df$YearMonth <- paste(df$year, df$month, sep = "." ) # bind together year and month
df$YearMonth <- as.numeric(df$YearMonth) # make numeric
ggplot(YM.dat, aes(x = YearMonth, y = No.Paintings)) +
geom_line() +
theme_minimal() +
ggtitle("Number of new acquiries \n by month") +
labs(x = "Month", y = "New acquiries")
ggplot(YM_approved.dat, aes(x = YearMonth, y = No.Paintings, colour = CuratorApproved)) +
geom_line() +
theme_minimal() +
ggtitle("Number of new acquiries \n by month") +
labs(x = "Month", y = "New acquiries")
ggplot(department.dat, aes(y = No.Paintings, x = reorder(Department, No.Paintings))) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 20)) +
labs(y = "New acquiries", x = "Department") +
ggtitle(expression(atop("Number of new acquiries \n by department", atop(italic("Departments in decending order"), "")))) +
coord_flip()
across all years, Prints & Illustrated books has recevied most new acquiries
plot cumulative changes over time to gauge which department has largest increase ins stock
ggplot(cum.dep.dat, aes(y = cumulative, x = YearMonth, colour = Department)) +
geom_line() +
theme_minimal() +
labs(y = "Cumulative count \n of new acquiries", x = "Time") +
ggtitle("Cumulative count of new acquiries \n by department")
Prints & Illustrated books also has the largest increase in number of paintings
library("readr"); library(dplyr); library(stringr); library(ggplot2); library(countrycode); library(maps); library(knitr)
df <- read_csv("https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv")
QUESTION 1 ----------------------------------------------------------------------
df with stock of paintings for each month in the year
dates <- str_split(df$DateAcquired, pattern = "-") # split date of acquiry to isolate month dates <- do.call(rbind.data.frame, dates) df <- bind_cols(df, dates) names(df)[15:17] <- c("year", "month", "day") df$month <- gsub(pattern = 0, replacement = "", df$month) # remove zero before months df$month <- as.numeric(df$month)
df with stock of paintings each month (across all years)
monthly.dat <- df %>% group_by(month) %>% summarise(length(ObjectID)) names(monthly.dat)[2] <- "No.Paintings"
QUESTION 2 ----------------------------------------------------------------------
plot the number of new acquiries pr. month
ggplot(monthly.dat, aes(x = month, y = No.Paintings)) + geom_line(colour = "red") + # use lines to gauge ts fluctuations. Colour red theme_minimal() + ggtitle("Number of new acquiries \n by month") + labs(x = "Month", y = "New acquiries") + scale_x_continuous(breaks = 1:12)
stock for each month every year
df$YearMonth <- paste(df$year, df$month, sep = "." ) # bind together year and month df$YearMonth <- as.numeric(df$YearMonth) # make numeric
count of paintings by month in each year
YM.dat <- df %>% group_by(YearMonth) %>% summarise(length(ObjectID)) names(YM.dat)[2] <- "No.Paintings"
ggplot(YM.dat, aes(x = YearMonth, y = No.Paintings)) + geom_line() + theme_minimal() + ggtitle("Number of new acquiries \n by month") + labs(x = "Month", y = "New acquiries")
QUESTION 3 ---------------------------------------------------------------------------------------------------------
stock of paintings that are curator vs. non-curator approved
first the stock in each month summed across all years
MonthCur.dat <- df %>% group_by(month, CuratorApproved) %>% summarise(length(ObjectID)) names(MonthCur.dat)[3] <- "No.Paintings"
ggplot(MonthCur.dat, aes(x = month, y = No.Paintings, colour = CuratorApproved)) + geom_line() + theme_minimal() + ggtitle("Number of new acquiries \n by month and curator approval") + labs(x = "Month", y = "New acquiries") + scale_x_continuous(breaks = 1:12) + scale_fill_discrete(name = "CuratorApproved", breaks = c("N", "Y"), labels = c("No", "Yes"))
now the stock in each month across all years
YM_approved.dat <- df %>% group_by(YearMonth, CuratorApproved) %>% summarise(length(ObjectID)) names(YM_approved.dat)[3] <- "No.Paintings"
ggplot(YM_approved.dat, aes(x = YearMonth, y = No.Paintings, colour = CuratorApproved)) + geom_line() + theme_minimal() + ggtitle("Number of new acquiries \n by month") + labs(x = "Month", y = "New acquiries")
QUESTION 4 ----------------------------------------------------------------------------------------------------------
total stock of paintings by department to gauge which is largest
department.dat <- df %>% group_by(Department) %>% summarise(length(ObjectID)) names(department.dat)[2] <- "No.Paintings"
cumulative stock of paintings by department to gauge largest increase
cum.dep.dat <- df %>% count(Department, year, month) %>% group_by(Department)
cum.dep.dat$cumulative <- order_by(cum.dep.dat$Department, cumsum(cum.dep.dat$n))
paste year and month together
cum.dep.dat$YearMonth <- paste(cum.dep.dat$year, cum.dep.dat$month, sep = "." ) # bind together year and month cum.dep.dat$YearMonth <- as.numeric(cum.dep.dat$YearMonth)
QUESTION 5 ------------------------------------------------------------------------------------------------
barplot of the stock of paintings by department
ggplot(department.dat, aes(y = No.Paintings, x = reorder(Department, No.Paintings))) + geom_bar(stat = "identity") + theme_minimal() + theme(axis.text.x = element_text(angle = 20)) + labs(y = "New acquiries", x = "Department") + ggtitle(expression(atop("Number of new acquiries \n by department", atop(italic("Departments in decending order"), "")))) + coord_flip()
across all years, Prints & Illustrated books has recevied most new acquiries
plot cumulative changes over time to gauge which department has largest increase ins stock
ggplot(cum.dep.dat, aes(y = cumulative, x = YearMonth, colour = Department)) + geom_line() + theme_minimal() + labs(y = "Cumulative count \n of new acquiries", x = "Time") + ggtitle("Cumulative count of new acquiries \n by department")
Prints & Illustrated books also has the largest increase in number of paintings
QUESTION 6 --------------------------------------------------------------------------------------------------------
count of paintings pr. artist
Artist.dat <- df %>% group_by(Artist) %>% summarise(length(ObjectID)) names(Artist.dat)[2] <- "No.Paintings"
Artist.dat <- Artist.dat[order(-Artist.dat$No.Paintings), ]
Artist.dat$Artist[6] <- NA # deal with missing names
artist.plot <- na.omit(Artist.dat[1:11, ])
ggplot(artist.plot, aes(x = reorder(Artist, No.Paintings), y = No.Paintings)) + geom_bar(stat = "identity") + theme_minimal() + coord_flip() + labs(x = "Artist", y = "Number of paintings") + ggtitle(expression(atop("Paintings in stock by artist", atop(italic("Artists in decending order"), ""))))
maybe "Unknown photographer" isn't an artist name - plot with those removed
Artist.dat$Artist[4] <- NA
artist.plot2 <- na.omit(Artist.dat[1:11, ])
ggplot(artist.plot2, aes(x = reorder(Artist, No.Paintings), y = No.Paintings)) + geom_bar(stat = "identity") + theme_minimal() + coord_flip() + labs(x = "Artist", y = "Number of paintings") + ggtitle(expression(atop("Paintings in stock by artist", atop(italic("Artists in decending order"), ""))))
QUESTION 7 -------------------------------------------------------------------------------------------------------
first, extract birthplace of artist
convert to lower case
df$country <- tolower(df$ArtistBio)
remove all numbers
df$country <- gsub(x = df$country, pattern = "[^a-z]", replacement = " ")
extract everyting after 'born' to get country of birth - not nationality
df$country2 <- strextract(string = df$country, "born .[a-z]") df$country2 <- sub(x = df$country2, pattern = "born ", replacement = "\1") df$country2 <- gsub(x = df$country2, '([a-z]+) ._', '\1') # get first country mentioned after 'born'
for all artist where nationality = country of birth
df$country1 <- strextract(string = df$country, pattern = "(.[a-z, ])") df$country1 <- gsub(x = df$country1, '([a-z]+) ._', '\1')
if country of birth isn't different from nationality save nationality, otherwise country of birth
df$country3 <- ifelse(is.na(df$country2), df$country1, df$country2) df$country3 <- gsub(df$country3, pattern = " ", replacement = "") # remove whitespace
convert some nationalities to country names
df$country4 <- countrycode(df$country3, "country.name", "country.name") df$country4 <- ifelse(is.na(df$country4), df$country3, df$country4)
countries <- as.data.frame(unique(df$country4))
rest of conversion done manually. There is probably a better way - this is messy
df$country4[df$country4== "american"] <- "USA" df$country4[df$country4== "german"] <- "Germany" df$country4[df$country4== "austrian"] <- "Austria" df$country4[df$country4== "vienna"] <- "Austria" df$country4[df$country4== "french"] <- "France" df$country4[df$country4== "bolivian"] <- "Bolivia" df$country4[df$country4== "swiss"] <- "Switzerland" df$country4[df$country4== "spanish"] <- "Spain" df$country4[df$country4== "catalan"] <- "Spain" df$country4[df$country4== "dutch"] <- "Netherlands" df$country4[df$country4== "russian"] <- "Russia" df$country4[df$country4== "italian"] <- "Italy" df$country4[df$country4== "british"] <- "UK" df$country4[df$country4== "swedish"] <- "Sweden" df$country4[df$country4== "mexican"] <- "Mexico" df$country4[df$country4== "polish"] <- "Poland" df$country4[df$country4== "ukrainian"] <- "Ukraine" df$country4[df$country4== "uruguayan"] <- "Uruguay" df$country4[df$country4== "venezuelan"] <- "Venezuela" df$country4[df$country4== "indian"] <- "India" df$country4[df$country4== "vietnamese"] <- "Vietnam" df$country4[df$country4== "zimbabwean"] <- "Zimbabwe" df$country4[df$country4== "tunisian"] <- "Tunisia" df$country4[df$country4== "turkish"] <- "Turkey" df$country4[df$country4== "brazilian"] <- "Brazil" df$country4[df$country4== "japanese"] <- "Japan" df$country4[df$country4== "chinese"] <- "China" df$country4[df$country4== "belgian"] <- "Belgium" df$country4[df$country4== "argentine"] <- "Argentine" df$country4[df$country4== "alsace"] <- "France" df$country4[df$country4== "chilean"] <- "Chile" df$country4[df$country4== "cuban"] <- "Cuba" df$country4[df$country4== "danish"] <- "Denmark" df$country4[df$country4== "guatemalan"] <- "Guatemala" df$country4[df$country4== "colombian"] <- "Colombia" df$country4[df$country4== "croatian"] <- "Croatia" df$country4[df$country4== "egyptian"] <- "Egypt" df$country4[df$country4== "finnish"] <- "Finland" df$country4[df$country4== "greek"] <- "Grecce" df$country4[df$country4== "haitian"] <- "Haiti" df$country4[df$country4== "hungarian"] <- "Hungary" df$country4[df$country4== "iranian"] <- "Iran" df$country4[df$country4== "latvian"] <- "Latvia" df$country4[df$country4== "pakistani"] <- "Pakistan" df$country4[df$country4== "peruvian"] <- "Peru" df$country4[df$country4== "portuguese"] <- "Portugal" df$country4[df$country4== "romanian"] <- "Romania" df$country4[df$country4== "scottish"] <- "Scotland" df$country4[df$country4== "yugoslav"] <- "Yugoslavia" df$country4[df$country4== "canadian"] <- "Canada" df$country4[df$country4== "england"] <- "UK" df$country4[df$country4== "United States"] <- "USA" df$country4[df$country4== "maroccan"] <- "Marocco" df$country4[df$country4== "english"] <- "UK" df$country4[df$country4== "colombian"] <- "Colombia" df$country4[df$country4== "norwegian"] <- "Norway" df$country4[df$country4== "czech"] <- "Czech Republic" df$country4[df$country4== "amsterdam"] <- "Netherlands" df$country4[df$country4== "moravia"] <- "Czech Republic" df$country4[df$country4== "silesia"] <- "Germany" df$country4[df$country4== "belorussia"] <- "Belarus" df$country4[df$country4== "puerto"] <- "Puerto Rico" df$country4[df$country4== "finish"] <- "Finland" df$country4[df$country4== "transylvania"] <- "Romania" df$country4[df$country4== "irish"] <- "Ireland" df$country4[df$country4== "costa"] <- "Costa Rica" df$country4[df$country4== "lituania"] <- "Lithuania" df$country4[df$country4== "america"] <- "USA" df$country4[df$country4== "malian"] <- "Mali" df$country4[df$country4== "wales"] <- "UK" df$country4[df$country4== "hawaii"] <- "USA" df$country4[df$country4== "thai"] <- "Thailand" df$country4[df$country4== "tunis"] <- "Tunisia" df$country4[df$country4== "guyanese"] <- "Guyana" df$country4[df$country4== "constantinople"] <- "Turkey" df$country4[df$country4== "holland"] <- "Netherlands" df$country4[df$country4== "bahamian"] <- "Bahamas" df$country4[df$country4== "hong"] <- "Hong Kong" df$country4[df$country4== "lebanese"] <- "Lebanon" df$country4[df$country4== "ivorian"] <- "Ivory Coast" df$country4[df$country4== "jackson"] <- "USA" df$country4[df$country4== "nowegian"] <- "Norway" df$country4[df$country4== "barcelona"] <- "Spain" df$country4[df$country4== "filipino"] <- "Philippines" df$country4[df$country4== "ukranian"] <- "Ukraine" df$country4[df$country4== "welsh"] <- "UK" df$country4[df$country4== "zagreb"] <- "Croatia" df$country4[df$country4== "welsh"] <- "UK" df$country4[df$country4== "saudi"] <- "Saudi Arabia" df$country4[df$country4== "welsh"] <- "UK" df$country4[df$country4== "mozambican"] <- "Mozambique" df$country4[df$country4== "scotland"] <- "UK" df$country4[df$country4== "moroccan"] <- "Marocco" df$country4[df$country4== "columbian"] <- "Colombia" df$country4[df$country4== "Russian Federation"] <- "USSR"
get country level number of paintings
country.dat <- df %>% group_by(country4) %>% summarise(length(ObjectID)) names(country.dat)[2] <- "No.Paintings"
extract long, lat and country name
GIS.data <- map_data("world") names(GIS.data) <- c("long", "lat", "group", "order", "country4", "subregion")
merge painting data with GIS
country.paintings <- left_join(country.dat, GIS.data)
plot -- heavily skewed distribution, can't distinguish between many countries
ggplot(country.paintings, aes(x = long, y = lat, group=group)) + geom_polygon(aes(fill = No.Paintings)) + expand_limits() + theme_minimal()
use natural log of paintings -- comparison across countries are difficult
ggplot(country.paintings, aes(x = long, y = lat, group=group)) + geom_polygon(aes(fill = log(No.Paintings))) + expand_limits() + theme_minimal()
QUESTION 8 ------------------------------------------------------------------------------------------------------
extract dimensions in cm's
df$dimCM<- sub(x = df$Dimensions, pattern=".((.)).*", replacement="\1")
extract numbers and digit seperator
df$dimCM <- gsub(x = df$dimCM, pattern = "[^0-9, .]", replacement = "")
extract first number -- width
df$width <- str_extract(string = df$dimCM, "[0-9]{0,3}[.]{0,1}[0-9]{1,3}") df$width <- as.numeric(df$width)
delete width to isolate height
df$height <- sub(x = df$dimCM, pattern ="[0-9]{0,3}[.]{0,1}[0-9]{1,3}", replacement = "")
extract height
df$height <- str_extract(string = df$height, "[0-9]{0,3}[.]{0,3}[0-9]{1,3}") df$height <- as.numeric(df$height)
compute area
df$area <- df$width*df$height
get five largest
LargeSmall <- as.data.frame(df[order(-df$area), ]) LargeSmall <- LargeSmall[1:5, ]
get five smallest
Smallest5 <- as.data.frame(df[order(df$area), ]) Smallest5$area[1:109] <- NA #remove DVD's and other non-paintings Smallest5 <- as.data.frame(Smallest5[order(Smallest5$area), ]) Smallest5$area[1] <- Smallest5$area[1]/10 # correct painting measured in mm's LargeSmall[6:10, ] <- Smallest5[1:5, ]
names(LargeSmall)[1] <- "Title"
group indicator for largest or smallest
LargeSmall$dummy[1:5] <- "5 Largest" LargeSmall$dummy[6:10] <- "5 Smallest"
plot largest and smallest respectively
ggplot(LargeSmall, aes(x = reorder(Title, area), y = area)) + geom_bar(stat = "identity") + facet_wrap(~ dummy, scale = "free") + theme_minimal() + labs(x = NULL, y = "Area") + ggtitle("Artist behind painting \n from smallest to largest") + theme(axis.text.x = element_text(angle = 15))
title is bit messy -- use artist name instead. Only four in smallest as Lehanneur has painted two of the smallest
ggplot(LargeSmall, aes(x = reorder(Artist, area), y = area)) + geom_bar(stat = "identity") + facet_wrap(~ dummy, scale = "free") + theme_minimal() + labs(x = NULL, y = "Area") + ggtitle("Artist behind painting \n from smallest to largest") + theme(axis.text.x = element_text(angle = 15))