sebastianbarfort / sds

Social Data Science, course at University of Copenhagen
http://sebastianbarfort.github.io/sds/
12 stars 17 forks source link

Group 009: Assignment 1 #6

Closed RasmusGars closed 9 years ago

RasmusGars commented 9 years ago

Loading potentially needed packages

library("readr") library("countrycode") library("mapdata") library("ggmap") library("dplyr") library("knitr") library("lubridate") library("maps") library("ggplot2") library("stringr") library("reshape2")

Question 1 - Create a new dataframe of the stock of paintings at MOMA for each month in the year.

Load raw data and filter paintings.

# Download the complete MoMA collection, then keep only the works
# classified as paintings.
df0 <- read_csv(
  "https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"
)
df <- filter(df0, Classification == "Painting")

Create month variable.

# Sort by acquisition date and drop every row that has a missing value in
# any column (na.omit works row-wise over all columns).
df <- na.omit(df[order(df$DateAcquired), ])

# Month key of the form "YYYYMM" (month zero-padded to two digits).
df$MonthAcquired <- paste0(
  year(df$DateAcquired),
  str_pad(month(df$DateAcquired), 2, pad = "0")
)

Create dataframe with stock of paintings for each month.

# Paintings grouped by acquisition month.
by_MonthAcquired <- df %>% group_by(MonthAcquired)

# Number of paintings acquired in each month ...
stock <- data.frame(summarise(by_MonthAcquired, NewPaintings = n()))

# ... and the running total, i.e. the stock at the end of each month.
stock$stockofpaintings <- cumsum(stock$NewPaintings)
stockbymonth <- stock %>% select(MonthAcquired, stockofpaintings)

Question 2

sorting by date and fixing cumulative variable

# Sort by acquisition date and add a per-row counter.
# `antal` (Danish for "count") is summed again in the later questions.
df <- df[order(df$DateAcquired), ]
df$antal <- 1

# One row per acquisition date with the number of paintings acquired that
# day, plus the running total over time.
df.plot <- df %>%
  group_by(DateAcquired) %>%
  summarise(antal = sum(antal)) %>%
  data.frame()
df.plot$cum <- cumsum(df.plot$antal)

# The data are already aggregated, so a plain line geom is the right layer.
# The colour is a constant and therefore set outside aes(); inside aes() it
# would be treated as a data value and produce a legend entry "red".
p <- ggplot(data = df.plot, aes(x = DateAcquired, y = cum)) +
  geom_line(colour = "red") +
  labs(
    x = "Time",
    y = "Art acquired",
    title = "Number of art acquired over time (cumulative)"
  )
print(p)

removing missing values on its own

A frequency polygon (line plot) is an appropriate choice; it is good at showing how data develop over time.

Question 3

sorting data

# Order rows by curator approval first and acquisition date second.
df <- df[with(df, order(CuratorApproved, DateAcquired)), ]

preparing two data frames. one with CuratorApproved=N and one with CuratorApproved=Y and calculating cumulatives

Curatorapproved = N

# Paintings that are NOT curator approved, reduced to the columns needed.
df.agg0 <- data.frame(
  select(
    filter(df, CuratorApproved == "N"),
    CuratorApproved, DateAcquired, antal
  )
)

# Acquisitions per date and their running total.
df.plot1 <- data.frame(
  summarise(group_by(df.agg0, DateAcquired, CuratorApproved), antal = sum(antal))
)
df.plot1$cum <- cumsum(df.plot1$antal)

curatorapproved = Y

# Paintings that ARE curator approved, reduced to the columns needed.
df.agg00 <- data.frame(
  select(
    filter(df, CuratorApproved == "Y"),
    CuratorApproved, DateAcquired, antal
  )
)

# Acquisitions per date and their running total.
df.plot2 <- data.frame(
  summarise(group_by(df.agg00, DateAcquired, CuratorApproved), antal = sum(antal))
)
df.plot2$cum <- cumsum(df.plot2$antal)

merging the two data frames and plotting

# Stack the two cumulative series so they can be drawn in one plot.
df.agg.merge <- rbind(df.plot1, df.plot2)

# One line per approval status. The series are pre-computed, so geom_line()
# is used instead of geom_freqpoly(stat = "identity"), which does no binning
# and only passes an unused `binwidth` along.
p <- ggplot(df.agg.merge, aes(x = DateAcquired, y = cum, colour = CuratorApproved)) +
  geom_line() +
  labs(
    x = "Time",
    y = "Art acquired",
    title = "Number of art acquired over time (cumulative)"
  )
print(p)

Question 4

DF grouped by DEPARTMENT???

Preparing data frame..

# Total number of paintings per department.
df.agg <- data.frame(
  summarise(group_by(df, Department), antal = sum(antal))
)

Question 5

Plotting

# Bar chart of paintings per department. Department is a discrete variable
# and the counts are already aggregated, so bars with stat = "identity" are
# used rather than a histogram with a (meaningless) binwidth.
p <- ggplot(df.agg, aes(x = Department, y = antal)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Department",
    y = "Art acquired",
    title = "Number of art acquired by department"
  )
print(p)

Question 6

making a data frame with number of art from each artist and removing missings by the filter condition

# Number of paintings per artist. Rows without an artist name are dropped
# with an explicit missing/empty test; comparing a character column with the
# number 0 only works through implicit coercion and locale-dependent string
# ordering.
df.agg2 <- df %>%
  group_by(Artist) %>%
  summarise(antal = sum(antal)) %>%
  filter(!is.na(Artist), Artist != "") %>%
  data.frame()

sorting by antal to see who has the most paintings..

# Most prolific artists first.
df.agg2 <- df.agg2[order(df.agg2[["antal"]], decreasing = TRUE), ]

printing top 10

# Show the ten artists with the most paintings.
print(head(df.agg2, n = 10))

Question 7 -The variable ArtistBio lists the birth place of each painter. Use this information to create a world map where each country is colored according to the stock of paintings in MOMA’s collection.

Extracting artist demonym from ArtistBio

# Strip parentheses and commas from ArtistBio. They are placed inside a
# bracket expression because a bare "(" is an invalid regular expression.
df$demonym <- gsub("[(),]", "", df$ArtistBio)

# The leading run of letters is taken as the demonym. [A-Za-z] is used
# because the range [A-z] also contains the characters [ \ ] ^ _ and `.
df$demonym <- str_extract(df$demonym, "[A-Za-z]*")
table(df$demonym)

Appropriate changes:

# Harmonise demonyms. The patterns are anchored so that only whole values
# are rewritten and an already correct value (e.g. "Argentinean") is not
# expanded a second time.
df$demonym <- sub("^Argentine$", "Argentinean", df$demonym)
df$demonym <- sub("^Icelandic$", "Icelander", df$demonym)

# Only the first word of the bio was kept as demonym, so "South" is
# ambiguous (e.g. "South African" vs. "South Korean"). Relabel it only when
# the bio actually says "South African".
df$demonym[
  df$demonym %in% "South" & grepl("South African", df$ArtistBio, fixed = TRUE)
] <- "South African"

However, some artists were born in another country, which requires more extraction and some clean-up.

We create a string vector for birth country based on the characters succeeding "born " in ArtistBio

# Clean ArtistBio in place: remove punctuation and digits, then trim.
df$ArtistBio <- gsub("[[:punct:]]", "", df$ArtistBio)
# The backslash must be doubled inside an R string; "\d" is a parse error.
df$ArtistBio <- gsub("\\d+", "", df$ArtistBio)
df$ArtistBio <- str_trim(df$ArtistBio, side = "both") # Trim for whitespaces

# Everything from "born " to the end of the bio; NA when "born " is absent.
df$birth <- str_extract(df$ArtistBio, "born .*$")

There are some unwanted words. The geographical information is changed to the equivalent present day country.

# Drop the filler words "born ", "in " and "the "/"The " from the birth
# string, then trim surrounding whitespace.
# Note: "in " is removed wherever it occurs, including at the end of a word
# followed by a space (e.g. "Britain To" becomes "Brita To").
df$birth <- gsub("in ", "", gsub("born ", "", df$birth))
df$birth <- str_trim(gsub("[Tt]he ", "", df$birth), side = "both") # Trim for whitespaces

Appropriate changes:

# Map historical or composite birth places to present-day country names.
# The entries are applied in order, because later patterns rely on the
# result of earlier ones.
birth_fixes <- c(
  "American born" = "United States",
  "AustriaHungary" = "Austria",
  "Beirut Lebanon" = "Lebanon",
  "Belorussia now Belarus" = "Belarus",
  "Czechoslovakia" = "Czech Republic", # should this one be here?
  "founded American American American American American American American American American United States" = "United States",
  "Germany Alsace" = "France",
  "German born" = "Germany",
  "Germany Austrian born" = "Austria",
  "Germany To England" = "Germany",
  "Great Brita To Zimbabwe" = "United Kingdom", # "in " was removed earlier
  "Great Britain" = "United Kingdom",
  "Italy British born" = "United Kingdom",
  "Russia American Russia" = "Russia",
  "Russia now Latvia" = "Latvia",
  "Scotland" = "United Kingdom",
  # Word boundaries keep an existing "Tunisia" from becoming "Tunisiaia".
  "\\bTunis\\b" = "Tunisia",
  "USA" = "United States"
)
for (pattern in names(birth_fixes)) {
  df$birth <- gsub(pattern, birth_fixes[[pattern]], df$birth)
}

An issue is that we have both country names and demonyms.

Using a dataset on countries from mledoze, we match demonyms and birthplaces, respectively, with the ISO3 code.

As demonyms can refer to several ISO3 countries, we match birthplace with demonym.

Thereby we get a standardised measure.

# Country reference table from mledoze (semicolon separated).
dfmledoze <- read.csv(
  "https://raw.githubusercontent.com/mledoze/countries/master/dist/countries.csv",
  sep = ";"
)
# Keep only the part of `name` before the first comma.
dfmledoze$name <- word(dfmledoze$name, start = 1, sep = fixed(","))

National borders can be a matter of political discussion, but to simplify the map,

we only include a country's linked "mother" country and its ISO3 code, according to the following dataset by ilyabo.

# ISO2/ISO3 country table from ilyabo (comma separated).
dfilyabo <- read.csv(
  "https://raw.githubusercontent.com/ilyabo/aiddata/master/data/static/data/countries-iso2-iso3.csv",
  sep = ","
)

We make some adjustments and clean up:

# Work on character columns rather than factors.
# NOTE(review): the column is read as `linked_country`, matching the name
# used further down in this block - confirm against the CSV header.
dfilyabo$linked_country <- as.character(dfilyabo$linked_country)
dfilyabo$country <- as.character(dfilyabo$country)
dfilyabo$iso3 <- as.character(dfilyabo$iso3)

# Fix a spelling error in the source data.
dfilyabo$country <- gsub("Afganistan", "Afghanistan", dfilyabo$country)

# Blank the ISO3 code of every entry that is linked to another country, so
# that it is dropped below. A direct non-empty test replaces the pattern
# "^[^]", which is not a valid regular expression.
dfilyabo$iso3 <- ifelse(
  !is.na(dfilyabo$linked_country) & nzchar(dfilyabo$linked_country),
  "",
  dfilyabo$iso3
)

# Keep only code and name, and only rows that still have a code.
dfilyabo <- select(dfilyabo, iso3, country)
dfilyabo <- dfilyabo[!(is.na(dfilyabo$iso3) | dfilyabo$iso3 == ""), ]

Change name "country" to "name" and merge datasets.

# Rename `country` to `name` so both tables share the join key, then keep
# every row of dfilyabo together with its match in dfmledoze.
colnames(dfilyabo)[colnames(dfilyabo) == "country"] <- "name"
dfmledoze <- dfmledoze %>% right_join(dfilyabo, by = "name")

From here we have a list

# Lookup table: country name, demonym and ISO3 code, complete rows only.
dfcountry <- dfmledoze %>% select(name, demonym, iso3)
dfcountry$demonym <- as.character(dfcountry$demonym)
dfcountry <- dfcountry[complete.cases(dfcountry), ]

# Attach the paintings to the lookup table through the artist's demonym.
dfmomamap <- df %>% right_join(dfcountry, by = "demonym")

If there is birthplace information, it replaces the country name matched via the demonym.

# Use the extracted birth place as country name where one exists; otherwise
# keep the name that came from the demonym match.
has_no_birth <- is.na(dfmomamap$birth)
dfmomamap$name <- ifelse(has_no_birth, dfmomamap$name, dfmomamap$birth)

# Keep one identifier plus the country, complete rows only.
dfmomamap <- dfmomamap %>% select(MoMANumber, name)
dfmomamap <- dfmomamap[complete.cases(dfmomamap), ]

We see that the dfmomamap dataset holds 2221 observations, which is 8 short of our original df.

Due to geographical changes or missing information, we have omitted 2 Yugoslavian, 2 Korean, and 1 Congolese painting and 3 with no information (8 in all).

Count number of paintings for each country

# Paintings per country. The count column is left unnamed on purpose:
# data.frame() turns the name "n()" into "n..", which later code refers to.
dfmomamap <- data.frame(summarise(group_by(dfmomamap, name), n()))

Create World map

# Polygon data for the world map; its country column is called `region`,
# so the country column of the counts is renamed to match.
world <- map_data("world")
colnames(dfmomamap)[colnames(dfmomamap) == "name"] <- "region"

Some adjustments are made, in order to match country codes.

# Use the country labels of the map data for the US and the UK.
dfmomamap$region <- gsub("United States", "USA", dfmomamap$region)
dfmomamap$region <- gsub("United Kingdom", "UK", dfmomamap$region)

# Add the painting counts to every map polygon; the join key is explicit.
worldmoma <- left_join(world, dfmomamap, by = "region")

# Bin the number of paintings per country into labelled classes. The top
# class is open-ended (Inf), so countries with more than 1000 paintings are
# shown as "500+" instead of falling outside the breaks and becoming NA.
worldmoma$brk <- cut(
  worldmoma$n..,
  breaks = c(0, 1, 10, 100, 500, Inf),
  labels = c("1", "2-10", "11-100", "101-500", "500+"),
  include.lowest = TRUE
)

# World map: one polygon per map group, filled by painting-count class,
# with thin black borders.
p <- ggplot(worldmoma, aes(x = long, y = lat, group = group))
p <- p + geom_polygon(aes(fill = brk), colour = "black", size = 0.2)

# Blue-purple palette; regions without a class are drawn light grey.
p <- p + scale_fill_brewer(palette = "BuPu", na.value = "lightgrey")
print(p)

Q8

Only use art that is a painting

# Rows classified as paintings (rows with a missing classification drop out).
df.painting <- df[which(df[["Classification"]] == "Painting"), ]

Steps to extract the dimensions in cm from the variable "Dimensions"

Get the parenthesis and what is inside

# Extract the first parenthesised part of Dimensions, parentheses included.
# The parentheses are escaped so they match literally; unescaped they form a
# capture group and the pattern returns the text before the parentheses.
# (`maal` is Danish for "measurements".)
df.painting$maal <- str_extract(df.painting$Dimensions, "\\([^()]+\\)")

Remove parenthesis

# Drop the first and the last character of the extracted text.
df.painting$maal <- str_sub(df.painting$maal, 2, -2)

Remove unnecessary info such as cm and x

# Remove the unit "cm" and the separator "x" from the measurement text.
df.painting$maal <- gsub("x", "", gsub("cm", "", df.painting$maal))

Split into two variables, height & length, by space

# Split the measurement text at the first space into two columns.
df.painting2 <- colsplit(
  df.painting$maal,
  pattern = " ",
  names = c("height", "length")
)

Merging into one dataframe

# Append the two split-out columns of df.painting2 to the paintings table.
df.painting <- data.frame(df.painting, df.painting2)

Converting characters to numeric

# Convert both dimension columns to numeric, going through character so
# that factor columns yield their labels rather than their level codes.
for (dim_col in c("height", "length")) {
  df.painting[[dim_col]] <- as.numeric(as.character(df.painting[[dim_col]]))
}

Calculating area in cm²

# Area as the product of the two dimension columns.
df.painting$area <- with(df.painting, height * length)

Making new datasets sorted by area either ascending or descending

# Two orderings of the paintings by area; rows with a missing area end up
# last in both.
painting_area <- df.painting$area
df.smallest <- df.painting[order(painting_area), ]  # ascending
df.biggest <- df.painting[order(-painting_area), ]  # descending

Only include the 5 biggest or smallest

# Keep the first five rows of each ordering. head() returns at most five
# rows, whereas indexing with 1:5 adds NA-filled rows when fewer exist.
df.biggest <- head(df.biggest, 5)
df.smallest <- head(df.smallest, 5)

Combine into one data frame

# Stack the rows of df.biggest on top of the rows of df.smallest.
df.bigandsmal <- rbind(df.biggest, df.smallest)

sebastianbarfort commented 9 years ago

I think this is a good assignment.

Good use of the cut function.

You can use the n() function in the dplyr package to count cases.

Nice customization of the color scale in the world map plot.

APPROVED