sebastianbarfort / sds

Social Data Science, course at University of Copenhagen
http://sebastianbarfort.github.io/sds/
12 stars 17 forks source link

Group 6: Assignment 1 #5

Closed BCEgerod closed 9 years ago

BCEgerod commented 9 years ago

library("readr"); library(dplyr); library(stringr); library(ggplot2); library(countrycode); library(maps); library(knitr)

df <- read_csv("https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv")

QUESTION 1 ----------------------------------------------------------------------

df with stock of paintings for each month in the year

dates <- str_split(df$DateAcquired, pattern = "-") # split date of acquiry to isolate month dates <- do.call(rbind.data.frame, dates) df <- bind_cols(df, dates) names(df)[15:17] <- c("year", "month", "day") df$month <- gsub(pattern = 0, replacement = "", df$month) # remove zero before months df$month <- as.numeric(df$month)

df with stock of paintings each month (across all years)

monthly.dat <- df %>% group_by(month) %>% summarise(length(ObjectID)) names(monthly.dat)[2] <- "No.Paintings"

QUESTION 2 ----------------------------------------------------------------------

plot the number of new acquiries pr. month

ggplot(monthly.dat, aes(x = month, y = No.Paintings)) + geom_line(colour = "red") + # use lines to gauge ts fluctuations. Colour red theme_minimal() + ggtitle("Number of new acquiries \n by month") + labs(x = "Month", y = "New acquiries") + scale_x_continuous(breaks = 1:12)

stock for each month every year

df$YearMonth <- paste(df$year, df$month, sep = "." ) # bind together year and month df$YearMonth <- as.numeric(df$YearMonth) # make numeric

count of paintings by month in each year

YM.dat <- df %>% group_by(YearMonth) %>% summarise(length(ObjectID)) names(YM.dat)[2] <- "No.Paintings"

ggplot(YM.dat, aes(x = YearMonth, y = No.Paintings)) + geom_line() + theme_minimal() + ggtitle("Number of new acquiries \n by month") + labs(x = "Month", y = "New acquiries")

QUESTION 3 ---------------------------------------------------------------------------------------------------------

stock of paintings that are curator vs. non-curator approved

first the stock in each month summed across all years

MonthCur.dat <- df %>% group_by(month, CuratorApproved) %>% summarise(length(ObjectID)) names(MonthCur.dat)[3] <- "No.Paintings"

ggplot(MonthCur.dat, aes(x = month, y = No.Paintings, colour = CuratorApproved)) + geom_line() + theme_minimal() + ggtitle("Number of new acquiries \n by month and curator approval") + labs(x = "Month", y = "New acquiries") + scale_x_continuous(breaks = 1:12) + scale_fill_discrete(name = "CuratorApproved", breaks = c("N", "Y"), labels = c("No", "Yes"))

now the stock in each month across all years

YM_approved.dat <- df %>% group_by(YearMonth, CuratorApproved) %>% summarise(length(ObjectID)) names(YM_approved.dat)[3] <- "No.Paintings"

ggplot(YM_approved.dat, aes(x = YearMonth, y = No.Paintings, colour = CuratorApproved)) + geom_line() + theme_minimal() + ggtitle("Number of new acquiries \n by month") + labs(x = "Month", y = "New acquiries")

QUESTION 4 ----------------------------------------------------------------------------------------------------------

total stock of paintings by department to gauge which is largest

department.dat <- df %>% group_by(Department) %>% summarise(length(ObjectID)) names(department.dat)[2] <- "No.Paintings"

cumulative stock of paintings by department to gauge largest increase

cum.dep.dat <- df %>% count(Department, year, month) %>% group_by(Department)

cum.dep.dat$cumulative <- order_by(cum.dep.dat$Department, cumsum(cum.dep.dat$n))

paste year and month together

cum.dep.dat$YearMonth <- paste(cum.dep.dat$year, cum.dep.dat$month, sep = "." ) # bind together year and month cum.dep.dat$YearMonth <- as.numeric(cum.dep.dat$YearMonth)

QUESTION 5 ------------------------------------------------------------------------------------------------

barplot of the stock of paintings by department

ggplot(department.dat, aes(y = No.Paintings, x = reorder(Department, No.Paintings))) + geom_bar(stat = "identity") + theme_minimal() + theme(axis.text.x = element_text(angle = 20)) + labs(y = "New acquiries", x = "Department") + ggtitle(expression(atop("Number of new acquiries \n by department", atop(italic("Departments in decending order"), "")))) + coord_flip()

across all years, Prints & Illustrated books has recevied most new acquiries

plot cumulative changes over time to gauge which department has largest increase ins stock

ggplot(cum.dep.dat, aes(y = cumulative, x = YearMonth, colour = Department)) + geom_line() + theme_minimal() + labs(y = "Cumulative count \n of new acquiries", x = "Time") + ggtitle("Cumulative count of new acquiries \n by department")

Prints & Illustrated books also has the largest increase in number of paintings

QUESTION 6 --------------------------------------------------------------------------------------------------------

count of paintings pr. artist

Artist.dat <- df %>% group_by(Artist) %>% summarise(length(ObjectID)) names(Artist.dat)[2] <- "No.Paintings"

Artist.dat <- Artist.dat[order(-Artist.dat$No.Paintings), ]

Artist.dat$Artist[6] <- NA # deal with missing names

artist.plot <- na.omit(Artist.dat[1:11, ])

ggplot(artist.plot, aes(x = reorder(Artist, No.Paintings), y = No.Paintings)) + geom_bar(stat = "identity") + theme_minimal() + coord_flip() + labs(x = "Artist", y = "Number of paintings") + ggtitle(expression(atop("Paintings in stock by artist", atop(italic("Artists in decending order"), ""))))

maybe "Unknown photographer" isn't an artist name - plot with those removed

Artist.dat$Artist[4] <- NA

artist.plot2 <- na.omit(Artist.dat[1:11, ])

ggplot(artist.plot2, aes(x = reorder(Artist, No.Paintings), y = No.Paintings)) + geom_bar(stat = "identity") + theme_minimal() + coord_flip() + labs(x = "Artist", y = "Number of paintings") + ggtitle(expression(atop("Paintings in stock by artist", atop(italic("Artists in decending order"), ""))))

QUESTION 7 -------------------------------------------------------------------------------------------------------

first, extract birthplace of artist

convert to lower case

df$country <- tolower(df$ArtistBio)

remove all numbers

df$country <- gsub(x = df$country, pattern = "[^a-z]", replacement = " ")

extract everyting after 'born' to get country of birth - not nationality

df$country2 <- strextract(string = df$country, "born .[a-z]") df$country2 <- sub(x = df$country2, pattern = "born ", replacement = "\1") df$country2 <- gsub(x = df$country2, '([a-z]+) ._', '\1') # get first country mentioned after 'born'

for all artist where nationality = country of birth

df$country1 <- strextract(string = df$country, pattern = "(.[a-z, ])") df$country1 <- gsub(x = df$country1, '([a-z]+) ._', '\1')

if country of birth isn't different from nationality save nationality, otherwise country of birth

df$country3 <- ifelse(is.na(df$country2), df$country1, df$country2) df$country3 <- gsub(df$country3, pattern = " ", replacement = "") # remove whitespace

convert some nationalities to country names

df$country4 <- countrycode(df$country3, "country.name", "country.name") df$country4 <- ifelse(is.na(df$country4), df$country3, df$country4)

countries <- as.data.frame(unique(df$country4))

rest of conversion done manually. There is probably a better way - this is messy

df$country4[df$country4== "american"] <- "USA" df$country4[df$country4== "german"] <- "Germany" df$country4[df$country4== "austrian"] <- "Austria" df$country4[df$country4== "vienna"] <- "Austria" df$country4[df$country4== "french"] <- "France" df$country4[df$country4== "bolivian"] <- "Bolivia" df$country4[df$country4== "swiss"] <- "Switzerland" df$country4[df$country4== "spanish"] <- "Spain" df$country4[df$country4== "catalan"] <- "Spain" df$country4[df$country4== "dutch"] <- "Netherlands" df$country4[df$country4== "russian"] <- "Russia" df$country4[df$country4== "italian"] <- "Italy" df$country4[df$country4== "british"] <- "UK" df$country4[df$country4== "swedish"] <- "Sweden" df$country4[df$country4== "mexican"] <- "Mexico" df$country4[df$country4== "polish"] <- "Poland" df$country4[df$country4== "ukrainian"] <- "Ukraine" df$country4[df$country4== "uruguayan"] <- "Uruguay" df$country4[df$country4== "venezuelan"] <- "Venezuela" df$country4[df$country4== "indian"] <- "India" df$country4[df$country4== "vietnamese"] <- "Vietnam" df$country4[df$country4== "zimbabwean"] <- "Zimbabwe" df$country4[df$country4== "tunisian"] <- "Tunisia" df$country4[df$country4== "turkish"] <- "Turkey" df$country4[df$country4== "brazilian"] <- "Brazil" df$country4[df$country4== "japanese"] <- "Japan" df$country4[df$country4== "chinese"] <- "China" df$country4[df$country4== "belgian"] <- "Belgium" df$country4[df$country4== "argentine"] <- "Argentine" df$country4[df$country4== "alsace"] <- "France" df$country4[df$country4== "chilean"] <- "Chile" df$country4[df$country4== "cuban"] <- "Cuba" df$country4[df$country4== "danish"] <- "Denmark" df$country4[df$country4== "guatemalan"] <- "Guatemala" df$country4[df$country4== "colombian"] <- "Colombia" df$country4[df$country4== "croatian"] <- "Croatia" df$country4[df$country4== "egyptian"] <- "Egypt" df$country4[df$country4== "finnish"] <- "Finland" df$country4[df$country4== "greek"] <- "Grecce" df$country4[df$country4== "haitian"] <- "Haiti" df$country4[df$country4== "hungarian"] <- "Hungary" df$country4[df$country4== "iranian"] <- "Iran" df$country4[df$country4== "latvian"] <- "Latvia" df$country4[df$country4== "pakistani"] <- "Pakistan" df$country4[df$country4== "peruvian"] <- "Peru" df$country4[df$country4== "portuguese"] <- "Portugal" df$country4[df$country4== "romanian"] <- "Romania" df$country4[df$country4== "scottish"] <- "Scotland" df$country4[df$country4== "yugoslav"] <- "Yugoslavia" df$country4[df$country4== "canadian"] <- "Canada" df$country4[df$country4== "england"] <- "UK" df$country4[df$country4== "United States"] <- "USA" df$country4[df$country4== "maroccan"] <- "Marocco" df$country4[df$country4== "english"] <- "UK" df$country4[df$country4== "colombian"] <- "Colombia" df$country4[df$country4== "norwegian"] <- "Norway" df$country4[df$country4== "czech"] <- "Czech Republic" df$country4[df$country4== "amsterdam"] <- "Netherlands" df$country4[df$country4== "moravia"] <- "Czech Republic" df$country4[df$country4== "silesia"] <- "Germany" df$country4[df$country4== "belorussia"] <- "Belarus" df$country4[df$country4== "puerto"] <- "Puerto Rico" df$country4[df$country4== "finish"] <- "Finland" df$country4[df$country4== "transylvania"] <- "Romania" df$country4[df$country4== "irish"] <- "Ireland" df$country4[df$country4== "costa"] <- "Costa Rica" df$country4[df$country4== "lituania"] <- "Lithuania" df$country4[df$country4== "america"] <- "USA" df$country4[df$country4== "malian"] <- "Mali" df$country4[df$country4== "wales"] <- "UK" df$country4[df$country4== "hawaii"] <- "USA" df$country4[df$country4== "thai"] <- "Thailand" df$country4[df$country4== "tunis"] <- "Tunisia" df$country4[df$country4== "guyanese"] <- "Guyana" df$country4[df$country4== "constantinople"] <- "Turkey" df$country4[df$country4== "holland"] <- "Netherlands" df$country4[df$country4== "bahamian"] <- "Bahamas" df$country4[df$country4== "hong"] <- "Hong Kong" df$country4[df$country4== "lebanese"] <- "Lebanon" df$country4[df$country4== "ivorian"] <- "Ivory Coast" df$country4[df$country4== "jackson"] <- "USA" df$country4[df$country4== "nowegian"] <- "Norway" df$country4[df$country4== "barcelona"] <- "Spain" df$country4[df$country4== "filipino"] <- "Philippines" df$country4[df$country4== "ukranian"] <- "Ukraine" df$country4[df$country4== "welsh"] <- "UK" df$country4[df$country4== "zagreb"] <- "Croatia" df$country4[df$country4== "welsh"] <- "UK" df$country4[df$country4== "saudi"] <- "Saudi Arabia" df$country4[df$country4== "welsh"] <- "UK" df$country4[df$country4== "mozambican"] <- "Mozambique" df$country4[df$country4== "scotland"] <- "UK" df$country4[df$country4== "moroccan"] <- "Marocco" df$country4[df$country4== "columbian"] <- "Colombia" df$country4[df$country4== "Russian Federation"] <- "USSR"

get country level number of paintings

country.dat <- df %>% group_by(country4) %>% summarise(length(ObjectID)) names(country.dat)[2] <- "No.Paintings"

extract long, lat and country name

GIS.data <- map_data("world") names(GIS.data) <- c("long", "lat", "group", "order", "country4", "subregion")

merge painting data with GIS

country.paintings <- left_join(country.dat, GIS.data)

plot -- heavily skewed distribution, can't distinguish between many countries

ggplot(country.paintings, aes(x = long, y = lat, group=group)) + geom_polygon(aes(fill = No.Paintings)) + expand_limits() + theme_minimal()

use natural log of paintings -- comparison across countries are difficult

ggplot(country.paintings, aes(x = long, y = lat, group=group)) + geom_polygon(aes(fill = log(No.Paintings))) + expand_limits() + theme_minimal()

QUESTION 8 ------------------------------------------------------------------------------------------------------

extract dimensions in cm's

df$dimCM<- sub(x = df$Dimensions, pattern=".((.)).*", replacement="\1")

extract numbers and digit seperator

df$dimCM <- gsub(x = df$dimCM, pattern = "[^0-9, .]", replacement = "")

extract first number -- width

df$width <- str_extract(string = df$dimCM, "[0-9]{0,3}[.]{0,1}[0-9]{1,3}") df$width <- as.numeric(df$width)

delete width to isolate height

df$height <- sub(x = df$dimCM, pattern ="[0-9]{0,3}[.]{0,1}[0-9]{1,3}", replacement = "")

extract height

df$height <- str_extract(string = df$height, "[0-9]{0,3}[.]{0,3}[0-9]{1,3}") df$height <- as.numeric(df$height)

compute area

df$area <- df$width*df$height

get five largest

LargeSmall <- as.data.frame(df[order(-df$area), ]) LargeSmall <- LargeSmall[1:5, ]

get five smallest

Smallest5 <- as.data.frame(df[order(df$area), ]) Smallest5$area[1:109] <- NA #remove DVD's and other non-paintings Smallest5 <- as.data.frame(Smallest5[order(Smallest5$area), ]) Smallest5$area[1] <- Smallest5$area[1]/10 # correct painting measured in mm's LargeSmall[6:10, ] <- Smallest5[1:5, ]

names(LargeSmall)[1] <- "Title"

group indicator for largest or smallest

LargeSmall$dummy[1:5] <- "5 Largest" LargeSmall$dummy[6:10] <- "5 Smallest"

plot largest and smallest respectively

ggplot(LargeSmall, aes(x = reorder(Title, area), y = area)) + geom_bar(stat = "identity") + facet_wrap(~ dummy, scale = "free") + theme_minimal() + labs(x = NULL, y = "Area") + ggtitle("Artist behind painting \n from smallest to largest") + theme(axis.text.x = element_text(angle = 15))

title is bit messy -- use artist name instead. Only four in smallest as Lehanneur has painted two of the smallest

ggplot(LargeSmall, aes(x = reorder(Artist, area), y = area)) + geom_bar(stat = "identity") + facet_wrap(~ dummy, scale = "free") + theme_minimal() + labs(x = NULL, y = "Area") + ggtitle("Artist behind painting \n from smallest to largest") + theme(axis.text.x = element_text(angle = 15))

sebastianbarfort commented 9 years ago

Very nice assignment.

Great plots and use of functions from dplyr and related packages.

I agree that your solution in question 7 is inefficient, see my solution instead.

Keep up the good work.

APPROVED