# Cumulative number of acquired artworks over time, drawn as one red line.
# The colour is set as a constant on the layer: putting "red" inside aes()
# maps it as a data value, which adds a legend and does not set the colour.
# geom_line() replaces geom_freqpoly(stat = "identity"); with the identity
# stat nothing is binned, so binwidth (and fill, which lines do not use)
# had no effect.
p <- ggplot(data = df.plot, aes(x = DateAcquired, y = cum)) +
  geom_line(colour = "red") +
  labs(
    x = "Time",
    y = "Art acquired",
    title = "Number of art acquired over time (cumulative)"
  )
print(p)
removing missing values on its own
A frequency polygon (line plot) is an appropriate choice; it is good at showing how data develop over time.
# Cumulative acquisitions over time, one line per CuratorApproved value.
# geom_line() replaces geom_freqpoly(stat = "identity"): the values in `cum`
# are already aggregated, so nothing is binned and binwidth had no effect.
p <- ggplot(df.agg.merge, aes(x = DateAcquired, y = cum, colour = CuratorApproved)) +
  geom_line() +
  labs(
    x = "Time",
    y = "Art acquired",
    title = "Number of art acquired over time (cumulative)"
  )
print(p)
# Bar chart of the number of artworks per department. The counts in `antal`
# are pre-computed, so a bar layer with the identity stat is the right tool;
# geom_histogram() with a binwidth is meant for binning a continuous x and
# the binwidth was ignored here.
p <- ggplot(df.agg, aes(x = Department, y = antal)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Department",
    y = "Art acquired",
    title = "Number of art acquired by department"
  )
print(p)
Question 6
making a data frame with number of art from each artist and removing missings by the filter condition
Question 7 -The variable ArtistBio lists the birth place of each painter. Use this information to create a world map where each country is colored according to the stock of paintings in MOMA’s collection.
We see that the dfmomamap dataset holds 2221 observations, which is 8 short of our original df.
Due to geographical changes or missing information, we have omitted 2 Yugoslavian, 2 Korean, and 1 Congolese painting, plus 3 with no information (8 in all).
Loading potentially needed packages
library("readr") library("countrycode") library("mapdata") library("ggmap") library("dplyr") library("knitr") library("lubridate") library("maps") library("ggplot2") library("stringr") library("reshape2")
Question 1 - Create a new dataframe of the stock of paintings at MOMA for each month in the year.
Load raw data and filter paintings.
# Download the full MoMA artworks table, then keep only the rows that are
# classified as paintings.
df0 <- read_csv(
  "https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"
)
df <- filter(df0, Classification == "Painting")
Create month variable.
# Sort chronologically, then drop every row that has an NA in any column.
df <- df[order(df$DateAcquired), ]
df <- na.omit(df)

# Build a "YYYYMM" key: the year followed by the zero-padded month number.
df$MonthAcquired <- paste0(
  year(df$DateAcquired),
  str_pad(month(df$DateAcquired), 2, pad = "0")
)
Create dataframe with stock of paintings for each month.
# Count the paintings acquired in each month, then accumulate the counts to
# get the stock of paintings held at the end of every month.
by_MonthAcquired <- group_by(df, MonthAcquired)
stock <- data.frame(summarise(by_MonthAcquired, NewPaintings = n()))
stock$stockofpaintings <- cumsum(stock$NewPaintings)

# Keep only the month key and the running stock.
stockbymonth <- select(stock, MonthAcquired, stockofpaintings)
Question 2
sorting by date and fixing cumulative variable
# Sort by acquisition date and give every painting a weight of one
# ("antal" = count) so that sums of it are counts.
df <- df[order(df$DateAcquired), ]
df$antal <- 1

# One row per acquisition date with the number acquired that day and the
# running total. summarise() over a single grouping variable returns an
# ungrouped result, so cumsum() runs over all dates.
df.plot <- df %>%
  group_by(DateAcquired) %>%
  summarise(antal = sum(antal)) %>%
  mutate(cum = cumsum(antal)) %>%
  data.frame()
# Cumulative number of acquired artworks over time, drawn as one red line.
# The colour is set as a constant on the layer: putting "red" inside aes()
# maps it as a data value, which adds a legend and does not set the colour.
# geom_line() replaces geom_freqpoly(stat = "identity"); with the identity
# stat nothing is binned, so binwidth (and fill, which lines do not use)
# had no effect.
p <- ggplot(data = df.plot, aes(x = DateAcquired, y = cum)) +
  geom_line(colour = "red") +
  labs(
    x = "Time",
    y = "Art acquired",
    title = "Number of art acquired over time (cumulative)"
  )
print(p)
removing missing values on its own
A frequency polygon (line plot) is an appropriate choice; it is good at showing how data develop over time.
Question 3
sorting data
# Order rows by approval flag first and acquisition date second.
df <- df[with(df, order(CuratorApproved, DateAcquired)), ]
preparing two data frames. one with CuratorApproved=N and one with CuratorApproved=Y and calculating cumulatives
Curatorapproved = N
# Paintings that are not curator approved, reduced to the columns needed
# for the plot.
df.agg0 <- data.frame(
  select(
    filter(df, CuratorApproved == "N"),
    CuratorApproved, DateAcquired, antal
  )
)

# Count per acquisition date. The cumulative sum is added after converting to
# a plain data frame: summarise() over two grouping variables still leaves the
# result grouped by DateAcquired, and the running total must span all dates.
df.plot1 <- data.frame(
  summarise(group_by(df.agg0, DateAcquired, CuratorApproved), antal = sum(antal))
)
df.plot1$cum <- cumsum(df.plot1$antal)
curatorapproved = Y
# Curator-approved paintings, reduced to the columns needed for the plot.
df.agg00 <- data.frame(
  select(
    filter(df, CuratorApproved == "Y"),
    CuratorApproved, DateAcquired, antal
  )
)

# Count per acquisition date. The cumulative sum is added after converting to
# a plain data frame: summarise() over two grouping variables still leaves the
# result grouped by DateAcquired, and the running total must span all dates.
df.plot2 <- data.frame(
  summarise(group_by(df.agg00, DateAcquired, CuratorApproved), antal = sum(antal))
)
df.plot2$cum <- cumsum(df.plot2$antal)
merging the two data frames and plotting
# Stack the "N" rows on top of the "Y" rows into one long table for plotting.
df.agg.merge <- do.call(rbind, list(df.plot1, df.plot2))
# Cumulative acquisitions over time, one line per CuratorApproved value.
# geom_line() replaces geom_freqpoly(stat = "identity"): the values in `cum`
# are already aggregated, so nothing is binned and binwidth had no effect.
p <- ggplot(df.agg.merge, aes(x = DateAcquired, y = cum, colour = CuratorApproved)) +
  geom_line() +
  labs(
    x = "Time",
    y = "Art acquired",
    title = "Number of art acquired over time (cumulative)"
  )
print(p)
Question 4
DF grouped by DEPARTMENT???
Preparing data frame..
# Number of paintings per department (antal is 1 for every painting).
df.agg <- data.frame(
  summarise(group_by(df, Department), antal = sum(antal))
)
Question 5
Plotting
# Bar chart of the number of artworks per department. The counts in `antal`
# are pre-computed, so a bar layer with the identity stat is the right tool;
# geom_histogram() with a binwidth is meant for binning a continuous x and
# the binwidth was ignored here.
p <- ggplot(df.agg, aes(x = Department, y = antal)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Department",
    y = "Art acquired",
    title = "Number of art acquired by department"
  )
print(p)
Question 6
making a data frame with number of art from each artist and removing missings by the filter condition
# Number of paintings per artist, leaving out rows without an artist name.
# The missing-name filter is explicit: the previous `Artist > 0` compared
# strings against "0", which depends on the locale's collation and also
# dropped any name that happens to sort before "0".
df.agg2 <- df %>%
  group_by(Artist) %>%
  summarise(antal = sum(antal)) %>%
  filter(!is.na(Artist), Artist != "") %>%
  data.frame()
sorting by antal to see who has the most paintings..
# Most prolific artists first.
df.agg2 <- df.agg2[with(df.agg2, order(antal, decreasing = TRUE)), ]
printing top 10
# Show the ten artists with the most paintings.
head(df.agg2, n = 10)
Question 7 -The variable ArtistBio lists the birth place of each painter. Use this information to create a world map where each country is colored according to the stock of paintings in MOMA’s collection.
Extracting artist demonym from ArtistBio
# Take the first word of ArtistBio as the artist's demonym.
# Parentheses and commas are removed with a bracket expression; a bare "("
# is not a valid regular expression and makes gsub() fail.
df$demonym <- gsub("[(),]", "", df$ArtistBio)
# [A-Za-z] instead of [A-z]: the latter range also covers the characters
# between "Z" and "a" ([ \ ] ^ _ `).
df$demonym <- str_extract(df$demonym, "[A-Za-z]*")
table(df$demonym)
Appropriate changes:
# Adjust three extracted demonyms; the replacements are applied in this
# order.
df$demonym <- local({
  fixed_demonym <- gsub("Argentine", "Argentinean", df$demonym)
  fixed_demonym <- gsub("Icelandic", "Icelander", fixed_demonym)
  gsub("South", "South African", fixed_demonym)
})
However, some artists were born in another country, which requires more extraction and some clean-up.
We create a string vector for birth country based on the characters succeeding "born " in ArtistBio
# Strip punctuation and digits from ArtistBio so that only words remain.
df$ArtistBio <- gsub("[[:punct:]]", "", df$ArtistBio)
# The digit class has to be written "\\d" inside an R string literal; a
# single backslash is an unrecognised escape and does not parse.
df$ArtistBio <- gsub("\\d+", "", df$ArtistBio)
df$ArtistBio <- str_trim(df$ArtistBio, side = "both") # trim whitespace

# Everything from "born " to the end of the bio is the raw birth-place text
# (NA when the bio has no "born " part).
df$birth <- str_extract(df$ArtistBio, "born .*$")
There are some unwanted words. The geographical information is changed to the equivalent present day country.
# Remove filler words from the birth-place text and trim the result.
# Note that "in " is removed wherever it occurs, including at the end of a
# word that is followed by a space.
df$birth <- local({
  place <- df$birth
  for (filler in c("born ", "in ", "[Tt]he ")) {
    place <- gsub(filler, "", place)
  }
  str_trim(place, side = "both") # trim whitespace
})
Appropriate changes:
# Map historical or composite birth-place strings to a present-day country.
# The rules are (pattern, replacement) pairs applied top to bottom; the
# order matters because later patterns match the output of earlier ones.
df$birth <- local({
  fixes <- matrix(
    c(
      "American born", "United States",
      "AustriaHungary", "Austria",
      "Beirut Lebanon", "Lebanon",
      "Belorussia now Belarus", "Belarus",
      # TODO(review): should this mapping be here?
      "Czechoslovakia", "Czech Republic",
      "founded American American American American American American American American American United States", "United States",
      "Germany Alsace", "France",
      "German born", "Germany",
      "Germany Austrian born", "Austria",
      "Germany To England", "Germany",
      # "Brita": an earlier clean-up step removed "in " from "Britain ".
      "Great Brita To Zimbabwe", "United Kingdom",
      "Great Britain", "United Kingdom",
      "Italy British born", "United Kingdom",
      "Russia American Russia", "Russia",
      "Russia now Latvia", "Latvia",
      "Scotland", "United Kingdom",
      # Word boundaries keep an existing "Tunisia" from becoming "Tunisiaia".
      "\\bTunis\\b", "Tunisia",
      "USA", "United States"
    ),
    ncol = 2,
    byrow = TRUE
  )

  place <- df$birth
  for (i in seq_len(nrow(fixes))) {
    place <- gsub(fixes[i, 1], fixes[i, 2], place)
  }
  place
})
An issue is that we have both country names and demonyms.
Using a dataset on countries from mledoze, we match demonyms and birthplaces, respectively, with the ISO3 code.
As demonyms can refer to several ISO3 countries, we match birthplace with demonym.
Thereby we obtain a standardised measure.
# Country reference table (semicolon separated). Only the part of `name`
# before the first comma is kept.
dfmledoze <- read.csv(
  "https://raw.githubusercontent.com/mledoze/countries/master/dist/countries.csv",
  sep = ";"
)
dfmledoze$name <- word(dfmledoze$name, start = 1, sep = fixed(","))
Nation borders can be a political discussion, but to simplify the map,
we only include a country's linked "mother" country and its ISO3 code, according to the following dataset by ilyabo.
# Comma-separated table of country names with ISO codes and linked countries.
dfilyabo <- read.csv(
  "https://raw.githubusercontent.com/ilyabo/aiddata/master/data/static/data/countries-iso2-iso3.csv",
  sep = ","
)
We make some adjustments and clean up:
# Work with plain character columns.
dfilyabo$linked_country <- as.character(dfilyabo$linkedcountry)
dfilyabo$country <- as.character(dfilyabo$country)
dfilyabo$iso3 <- as.character(dfilyabo$iso3)

# Fix a misspelled country name in the source data.
dfilyabo$country <- gsub("Afganistan", "Afghanistan", dfilyabo$country)

# Blank the ISO3 code of every entry that has a linked country so that the
# entry is dropped below.
# NOTE(review): the previous pattern "^[^]" is not a valid regular
# expression (unterminated bracket expression). "Starts with a non-space
# character" is assumed to be the intent - confirm against the data.
dfilyabo$iso3 <- ifelse(
  grepl("^[^[:space:]]", dfilyabo$linked_country),
  "",
  dfilyabo$iso3
)

# Keep only the code and the name, and only rows that still have a code.
dfilyabo <- select(dfilyabo, iso3, country)
dfilyabo <- dfilyabo[!(is.na(dfilyabo$iso3) | dfilyabo$iso3 == ""), ]
Change name "country" to "name" and merge datasets.
# Give the country column the same name as in dfmledoze, then keep every row
# of dfilyabo and attach the matching dfmledoze columns by name.
names(dfilyabo) <- sub("^country$", "name", names(dfilyabo))
dfmledoze <- dfmledoze %>%
  right_join(dfilyabo, by = "name")
From here we have a list
# Lookup table of country name, demonym and ISO3 code; rows with any missing
# value are dropped.
dfcountry <- dfmledoze %>%
  select(name, demonym, iso3)
dfcountry$demonym <- as.character(dfcountry$demonym)
dfcountry <- dfcountry[complete.cases(dfcountry), ]
# Attach the paintings to the country lookup via the demonym. A right join
# keeps every row of dfcountry.
dfmomamap <- df %>%
  right_join(dfcountry, by = "demonym")
If ther
# Prefer the extracted birth place over the demonym-based country name
# whenever a birth place is available.
dfmomamap$name <- ifelse(
  !is.na(dfmomamap$birth),
  dfmomamap$birth,
  dfmomamap$name
)

# Keep one identifier and the country, and only rows where both are present.
dfmomamap <- select(dfmomamap, MoMANumber, name)
dfmomamap <- dfmomamap[complete.cases(dfmomamap), ]
We see that the dfmomamap dataset holds 2221 observations, which is 8 short of our original df.
Due to geographical changes or missing information, we have omitted 2 Yugoslavian, 2 Korean, and 1 Congolese painting, plus 3 with no information (8 in all).
Count number of paintings for each country
# Number of paintings per country. The count column is left unnamed in
# summarise(), so it is called "n()" there and data.frame() turns that into
# the syntactic name "n..".
dfmomamap <- data.frame(
  summarise(group_by(dfmomamap, name), n())
)
Create World map
# World polygons; the country column of dfmomamap is renamed to "region".
world <- map_data("world")
names(dfmomamap) <- sub("^name$", "region", names(dfmomamap))
Some adjustments are made, in order to match country codes.
# Rename two countries to "USA" and "UK" before joining them to the map
# data.
dfmomamap$region <- gsub("United States", "USA", dfmomamap$region)
dfmomamap$region <- gsub("United Kingdom", "UK", dfmomamap$region)

# Attach the painting counts to every map polygon row. The join key is
# stated explicitly instead of relying on an implicit natural join.
worldmoma <- left_join(world, dfmomamap, by = "region")
# Bin the number of paintings per country into labelled classes for the map
# legend. The top class is open-ended (Inf): with an upper break of 1000,
# counts above 1000 fell outside all bins and became NA even though the last
# label is "500+".
worldmoma$brk <- cut(
  worldmoma$n..,
  breaks = c(0, 1, 10, 100, 500, Inf),
  labels = c("1", "2-10", "11-100", "101-500", "500+"),
  include.lowest = TRUE
)
# Choropleth: each country polygon is filled by its painting-count class;
# countries without a class are drawn in light grey.
p <- ggplot(worldmoma, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = brk), colour = "black", size = 0.2) +
  scale_fill_brewer(palette = "BuPu", na.value = "lightgrey")
print(p)
Q8
Only use art that is a painting
# Rows whose classification is exactly "Painting" (missing classifications
# are excluded).
df.painting <- df[df$Classification %in% "Painting", ]
Steps to extract the dimensions in cm from the variable "Dimensions"
Get the parenthesis and what is inside
# Extract the first parenthesised part of Dimensions, including the
# parentheses themselves ("maal" = measurements). The parentheses must be
# escaped to match literally: unescaped, "([^()]+)" is only a capture group
# and matches the text in front of the parenthesis instead.
df.painting$maal <- str_extract(df.painting$Dimensions, "\\([^()]+\\)")
Remove parenthesis
# Drop the first and the last character of each string.
df.painting$maal <- local({
  raw <- df.painting$maal
  substring(raw, 2, nchar(raw) - 1)
})
Remove unnecessary info such as cm and x
# Remove every "cm" and then every "x" from the measurement text.
df.painting$maal <- local({
  without_unit <- gsub("cm", "", df.painting$maal)
  gsub("x", "", without_unit)
})
Split into two variables, height & length, by space
# Split the measurement text at a space into two columns.
df.painting2 <- colsplit(
  string = df.painting$maal,
  pattern = " ",
  names = c("height", "length")
)
Merging into one dataframe
# Append the height and length columns to the painting table.
df.painting <- df.painting %>%
  data.frame(df.painting2)
Converting characters to numeric
# Convert both dimension columns to numeric. Going through character first
# converts the text of a factor rather than its level codes.
df.painting[c("height", "length")] <- lapply(
  df.painting[c("height", "length")],
  function(values) as.numeric(as.character(values))
)
Calculating area in cm²
# Area of each painting: height times length.
df.painting$area <- with(df.painting, height * length)
Making new datasets sorted by area either ascending or descending
# Two sorted copies of the table: largest area first and smallest area first.
df.biggest <- df.painting[with(df.painting, order(-area)), ]
df.smallest <- df.painting[with(df.painting, order(area)), ]
Only include the 5 biggest or smallest
# Keep the first five rows of each sorted table.
df.biggest <- df.biggest[seq_len(5), ]
df.smallest <- df.smallest[seq_len(5), ]
Combine into one data frame
# The five biggest paintings followed by the five smallest, in one table.
df.bigandsmal <- do.call(rbind, list(df.biggest, df.smallest))