sebastianbarfort / sds

Social Data Science, course at University of Copenhagen
http://sebastianbarfort.github.io/sds/
12 stars 17 forks source link

Group 24: Assignment 1 #12

Closed ghost closed 9 years ago

ghost commented 9 years ago

Question #1 ---------------

read the Data Set

  library(readr)

  # Download the MoMA collection catalogue and take a first look at it.
  artworks_url <- "https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv"
  df <- read_csv(artworks_url)
  names(df)
  head(df)

to prepare the Data, need to extract the month from the date

  library(lubridate)

  # Month of acquisition, stored as a labelled month rather than a number.
  df$month <- month(df$DateAcquired, label = TRUE)

create a new data frame by monthly stock

  # Count the artworks acquired in each month and store the counts as a data frame.
  month_counts <- table(df$month)
  df2 <- data.frame(month_counts)

naming the variables

  # First column holds the month label, second the number of artworks.
  colnames(df2) <- c("Month", "Stock")

Question #2 --------

plot the graph

We believe that a bar graph is the most suitable choice for this data because we are plotting a single categorical variable (the month) against its frequency.

  library(ggplot2)

  # Base plot object: month on the x axis, number of artworks on the y axis.
  p <- ggplot(df2, aes(x = Month, y = Stock))

We want the graph to be drawn as bars (for the reason given above). stat = "identity" tells ggplot2 to use the values in Stock as the bar heights instead of counting rows. We set colour/fill to red, then label the axes, add a title and use the minimal theme.

  # df2 already holds one count per month, so the bars are drawn directly from
  # Stock with stat = "identity". geom_bar() is the geom for pre-computed heights;
  # geom_histogram() is meant for binning a continuous variable.
  p <- p +
    geom_bar(stat = "identity", color = "red", fill = "red") +
    theme_minimal() +
    labs(
      x = "Month",
      y = "The Stock of Painting",
      title = "The Stock of Paintings at MOMA for Each Month"
    )
  p

Question #3 ------------

  # Month of acquisition as a labelled month (same computation as in Question 1).
  df$month <- month(df$DateAcquired, label = TRUE)

  # One row per (month, curator approval) combination with its number of artworks.
  df3 <- data.frame(table(df$month, df$CuratorApproved))
  colnames(df3) <- c("Month", "CuratorApproved", "Quantity")

  # Stacked bars per month, split by approval status. The counts are already
  # computed, so geom_bar(stat = "identity") is used instead of geom_histogram().
  ggplot(df3, aes(x = Month, y = Quantity, fill = CuratorApproved)) +
    geom_bar(stat = "identity")

or

  # Same data, one panel per approval status. Bars use the pre-computed
  # Quantity values (stat = "identity") rather than a histogram binning.
  ggplot(df3, aes(x = Month, y = Quantity)) +
    geom_bar(stat = "identity", aes(fill = factor(CuratorApproved))) +
    facet_wrap(~CuratorApproved)

Question #4 -----------

creating a new data frame

   # Number of artworks for every (department, month) combination.
   department_month_counts <- table(df$Department, df$month)
   df3 <- data.frame(department_month_counts)

naming the variables

  # Columns follow the order of the table dimensions, then the count.
  colnames(df3) <- c("Department", "Month", "Stock")

Question #5 ----------

# Plotting the graphs.
# df3 already contains the counts, so every plot draws bars from Stock with
# geom_bar(stat = "identity"); geom_histogram() is meant for binning raw values.

# Monthly increase for each department:
  p3 <- ggplot(data = df3, aes(x = Month, y = Stock))
  p3 +
    geom_bar(stat = "identity", color = "red", fill = "red") +
    facet_wrap(~ Department)

# Increase by department, separated for every month:
  Ptry <- ggplot(data = df3, aes(x = Department, y = Stock))
  Ptry +
    geom_bar(stat = "identity", colour = "red", fill = "red") +
    facet_wrap(~ Month)

# General stock increase (all months stacked per department):
  Ptry2 <- ggplot(data = df3, aes(x = Department, y = Stock))
  Ptry2 +
    geom_bar(stat = "identity", colour = "red", fill = "red")

Question #6 ------------

# Count the artworks held per artist.
  df4 <- data.frame(table(df$Artist))
  names(df4) <- c("Artist", "Stock")

# Sort artists by number of artworks, largest first.
# (The index vector is not called `list`, which would mask base::list().)
  stock_order <- order(df4$Stock, decreasing = TRUE)
  df4 <- df4[stock_order, ]

# Keep the 12 most represented artists, with explicit column names and
# row names 1..12.
  df4_top12 <- data.frame(Artist = df4$Artist[1:12], Stock = df4$Stock[1:12])

# Delete the "unknown" / empty artist entries, which yields the top 10.
# NOTE(review): rows 4 and 6 are hard-coded positions in the top-12 table;
# re-check them whenever the underlying dataset changes.
  df4_top10 <- df4_top12[-c(4, 6), ]
  names(df4_top10) <- c("Artist", "Stock")

# Result (row names still carry the positions from the top-12 table).
  df4_top10

Question #7 -------------

# Clean up the data by removing special characters and renaming nationalities to countries.
# The statements below form an ordered chain: every call rewrites df$ArtistBio in
# place, so each pattern is matched against the output of all previous steps.
# Reordering or removing a step changes what later patterns see.

  library(stringr)

  # Keep only the text before the first comma.
  df$ArtistBio = gsub(",.*$", "", df$ArtistBio)
  # Drop the first character of every entry (presumably an opening parenthesis - confirm).
  df$ArtistBio = substr(df$ArtistBio, 2, nchar(df$ArtistBio))
  df$ArtistBio = gsub("and Swiss", "", df$ArtistBio)
  # NOTE(review): the pattern is an empty string as written, so this call leaves the
  # text unchanged; a special character may have been lost from the pattern - confirm.
  df$ArtistBio = gsub("", "", df$ArtistBio)
  # Remove digits and punctuation (the first call only covers 1-9; [:digit:] below covers 0 too).
  df$ArtistBio = gsub("[1-9]","",df$ArtistBio)
  df$ArtistBio = str_replace_all(df$ArtistBio, "[:punct:]", "")
  df$ArtistBio = str_replace_all(df$ArtistBio, "[:digit:]", "")
  # Remove descriptive words. Note that "est" is removed as a plain substring, which
  # is why the leftover fragment "ablished" is removed separately two lines later.
  df$ArtistBio = gsub("Various", "", df$ArtistBio)
  df$ArtistBio = gsub("est", "", df$ArtistBio)
  df$ArtistBio = gsub("founded", "", df$ArtistBio)
  df$ArtistBio = gsub("ablished", "", df$ArtistBio)
  df$ArtistBio = gsub("born", "", df$ArtistBio)
  df$ArtistBio = gsub("Nationality unknown Nationality unknown", "", df$ArtistBio)
  df$ArtistBio = gsub("Nationality unknown", "", df$ArtistBio)
  # From here on: nationality adjectives are rewritten to country names, and
  # doubled or combined entries are collapsed to a single country.
  df$ArtistBio = gsub("Russian Russian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Russia Russian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("RussiaLithuanian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Russian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Saudi Arabian", "Saudi Arabia", df$ArtistBio)
  df$ArtistBio = gsub("American", "America", df$ArtistBio)
  df$ArtistBio = gsub("French", "France", df$ArtistBio)
  df$ArtistBio = gsub("Serbian", "Serbia", df$ArtistBio)
  df$ArtistBio = gsub("Romanian", "Romania", df$ArtistBio)
  df$ArtistBio = gsub("active", "", df$ArtistBio)
  df$ArtistBio = gsub("Active", "", df$ArtistBio)
  df$ArtistBio = gsub("Scottish", "Scotland", df$ArtistBio)
  df$ArtistBio = gsub("and British", "", df$ArtistBio)
  df$ArtistBio = gsub("and Canadian", "", df$ArtistBio)
  df$ArtistBio = gsub("and Mexican", "", df$ArtistBio)
  df$ArtistBio = gsub("and Swedish", "", df$ArtistBio)
  df$ArtistBio = gsub("and Danish", "", df$ArtistBio)
  df$ArtistBio = gsub("Ghanaian", "Ghana", df$ArtistBio)
  df$ArtistBio = gsub("Italian", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Italy Italy", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Japanese", "Japan",df$ArtistBio )
  df$ArtistBio = gsub("JapaneseAmerica", "America", df$ArtistBio)
  df$ArtistBio = gsub("Japan Japan", "Japan", df$ArtistBio)
  df$ArtistBio = gsub("Korean", "Korea", df$ArtistBio)
  df$ArtistBio = gsub("Malaysian", "Malaysia", df$ArtistBio)
  df$ArtistBio = gsub("Mexican", "Mexico",df$ArtistBio )
  df$ArtistBio = gsub("Mexico Mexico", "Mexico", df$ArtistBio)
  df$ArtistBio = gsub("MexicoCanadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("nationality unknown", "", df$ArtistBio)
  # The next twelve rules repeat the patterns of the twelve rules just above.
  # "Korean" was already rewritten to "Korea" there, so the "South Korea"
  # replacement below no longer finds anything to match.
  df$ArtistBio = gsub("Ghanaian", "Ghana", df$ArtistBio)
  df$ArtistBio = gsub("Italian", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Italy Italy", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Japanese", "Japan",df$ArtistBio) 
  df$ArtistBio = gsub("JapaneseAmerica", "America", df$ArtistBio)
  df$ArtistBio = gsub("Japan Japan", "Japan", df$ArtistBio)
  df$ArtistBio = gsub("Korean", "South Korea", df$ArtistBio)
  df$ArtistBio = gsub("Malaysian", "Malaysia", df$ArtistBio)
  df$ArtistBio = gsub("Mexican", "Mexico", df$ArtistBio)
  df$ArtistBio = gsub("Mexico Mexico", "Mexico", df$ArtistBio)
  df$ArtistBio = gsub("MexicoCanadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("nationality unknown", "", df$ArtistBio)
  df$ArtistBio = gsub("Nationality Unknown", "",df$ArtistBio)
  df$ArtistBio = gsub("Spanish", "Spain", df$ArtistBio)
  df$ArtistBio = gsub("Portuguese", "Portugal",df$ArtistBio)
  # "Britian" (sic) is the spelling the following rules match on; keep it consistent.
  df$ArtistBio = gsub("British", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("USA", "America", df$ArtistBio)
  df$ArtistBio = gsub("Native America", "America", df$ArtistBio)
  df$ArtistBio = gsub("Osaka", "Japan",df$ArtistBio)
  df$ArtistBio = gsub("Norwegian", "Norway", df$ArtistBio)
  df$ArtistBio = gsub("New Zealander", "New Zealand", df$ArtistBio)
  df$ArtistBio = gsub("Australian ", "Australia", df$ArtistBio)
  df$ArtistBio = gsub("Canadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("Austrian", "Austria", df$ArtistBio)
  df$ArtistBio = gsub("Nowegian", "Norway", df$ArtistBio)
  df$ArtistBio = gsub("November", "", df$ArtistBio)
  df$ArtistBio = gsub("Pakistani", "Pakistan", df$ArtistBio)
  df$ArtistBio = gsub("Polish", "Poland",df$ArtistBio)
  df$ArtistBio = gsub("Peruvian", "Peru",df$ArtistBio)
  df$ArtistBio = gsub("Poland Poland", "Poland",df$ArtistBio)
  df$ArtistBio = gsub("MexicoCanadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("Senegalese", "Senegal", df$ArtistBio)
  df$ArtistBio = gsub("Singaporean", "Singapore", df$ArtistBio)
  df$ArtistBio = gsub("artists group", "", df$ArtistBio)
  df$ArtistBio = gsub("South African", "South Africa", df$ArtistBio)
  df$ArtistBio = gsub("Sudanese", "Sudan", df$ArtistBio)
  df$ArtistBio = gsub("Swedish", "Sweden", df$ArtistBio)
  df$ArtistBio = gsub("Swizerland Swizerland", "Swizerland", df$ArtistBio)
  df$ArtistBio = gsub("Swiss", "Swizerland", df$ArtistBio)
  df$ArtistBio = gsub("Syrian", "Syria", df$ArtistBio)
  df$ArtistBio = gsub("Taiwanese", "Taiwan", df$ArtistBio)
  df$ArtistBio = gsub("The Netherlands", "Netherlands", df$ArtistBio)
  df$ArtistBio = gsub("Thai", "Thailand", df$ArtistBio)
  df$ArtistBio = gsub("Tajik", "Tajikistan", df$ArtistBio)
  df$ArtistBio = gsub("Tanzanian", "Tanzania", df$ArtistBio)
  df$ArtistBio = gsub("Tunisian", "Tunisia", df$ArtistBio)
  df$ArtistBio = gsub("Turkish", "Turkey", df$ArtistBio)
  df$ArtistBio = gsub("and America", "", df$ArtistBio)
  df$ArtistBio = gsub("Britian Britian", "", df$ArtistBio)
  df$ArtistBio = gsub("America Germany", "America", df$ArtistBio)
  df$ArtistBio = gsub("America America", "America", df$ArtistBio)
  df$ArtistBio = gsub("Britian France", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Belgian", "Belgium", df$ArtistBio)
  df$ArtistBio = gsub("British", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Brazilian", "Brazil", df$ArtistBio)
  df$ArtistBio = gsub("Argentinian", "Argentina", df$ArtistBio)
  df$ArtistBio = gsub("FranceMoroccan", "France", df$ArtistBio)
  df$ArtistBio = gsub("Latvian", "Latvia", df$ArtistBio)
  df$ArtistBio = gsub("Venezuelan", "Venezuela", df$ArtistBio)
  df$ArtistBio = gsub("Venezuela Venezuela", "Venezuela", df$ArtistBio)
  df$ArtistBio = gsub("Vietnamese", "Vietnam", df$ArtistBio)
  df$ArtistBio = gsub("Vietnam America", "America", df$ArtistBio)
  df$ArtistBio = gsub("Welsh", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Yugoslav", "Yugoslavia", df$ArtistBio)
  # These steps belong to the same in-place cleaning chain as their neighbours, so
  # they must read from df$ArtistBio. Reading from any other object would either
  # fail (if it does not exist) or throw away every substitution made so far.
  df$ArtistBio = gsub("now Croatia", "", df$ArtistBio)
  df$ArtistBio = gsub("now Slovenia", "", df$ArtistBio)
  df$ArtistBio = gsub("Yugoslaviaian", "Yugoslavia", df$ArtistBio)
  df$ArtistBio = gsub("Zagreb", "Croatia", df$ArtistBio)
  df$ArtistBio = gsub("Zimbabwean", "Zimbabwe", df$ArtistBio)
  # Continuation of the ordered substitution chain on df$ArtistBio: every call
  # sees the result of all earlier calls, so the order of these lines matters.
  df$ArtistBio = gsub(" then Southern Rhodesia", "", df$ArtistBio)
  df$ArtistBio = gsub("Germany Germany Germany", "Germany", df$ArtistBio)
  df$ArtistBio = gsub("Moroccan", "Morocco", df$ArtistBio)
  df$ArtistBio = gsub("Mozambica", "Mozambique", df$ArtistBio)
  df$ArtistBio = gsub("Malian", "Mali", df$ArtistBio)
  df$ArtistBio = gsub("Mauritanian", "Mauritania", df$ArtistBio)
  df$ArtistBio = gsub("Egyptian", "Eqypt", df$ArtistBio)
  df$ArtistBio = gsub("English", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Dutch", "Netherland", df$ArtistBio)
  df$ArtistBio = gsub("Netherland Netherland", "Netherland", df$ArtistBio)
  df$ArtistBio = gsub("and Icelandic", "", df$ArtistBio)
  df$ArtistBio = gsub("Danish", "Denmark", df$ArtistBio)
  df$ArtistBio = gsub("Czech Czech", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Czech", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Czech Republic RepublicRepublic", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Czech Republicoslovakian", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Cuban", "Cuba", df$ArtistBio)
  df$ArtistBio = gsub("Cuba Cuba", "Cuba", df$ArtistBio)
  df$ArtistBio = gsub("Colombian", "Colombia", df$ArtistBio)
  df$ArtistBio = gsub("Columbian ", "Colombia", df$ArtistBio)
  # "Chili" is corrected to "Chile" further down in this chain.
  df$ArtistBio = gsub("Chilean", "Chili", df$ArtistBio)
  df$ArtistBio = gsub("Chinese", "China", df$ArtistBio)
  df$ArtistBio = gsub("Inuit", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("Bulgarian", "Bulgria", df$ArtistBio)
  df$ArtistBio = gsub("Britian America", "Britian",df$ArtistBio)
  df$ArtistBio = gsub("of Palinian origin", "", df$ArtistBio)
  df$ArtistBio = gsub("and Iranian", "", df$ArtistBio)
  df$ArtistBio = gsub("Barcelona", "Spain", df$ArtistBio)
  df$ArtistBio = gsub("Bolivian", "Bolivia", df$ArtistBio)
  df$ArtistBio = gsub("Azerbaijani", "Azerbaijan", df$ArtistBio)
  df$ArtistBio = gsub("Australian", "Australia", df$ArtistBio)
  df$ArtistBio = gsub("Bahamian", "Bahamas", df$ArtistBio)
  df$ArtistBio = gsub("Great Britain To Zimbabwe", "Britain",df$ArtistBio)
  df$ArtistBio = gsub("Bosnian", "Bosnia and Herzegovina", df$ArtistBio)
  df$ArtistBio = gsub("bosnia and herzegovinaCroatian", "Bosnia and Herzegovina",df$ArtistBio)
  df$ArtistBio = gsub("bosnia and herzegovina", "Bosnia and Herzegovina", df$ArtistBio)
  df$ArtistBio = gsub("Indian", "India", df$ArtistBio)
  df$ArtistBio = gsub("scotland", "Scotland", df$ArtistBio)
  df$ArtistBio = gsub("To Zimbabwe", "", df$ArtistBio)
  df$ArtistBio = gsub("Great", "", df$ArtistBio)
  # NOTE(review): this removes the substring "in" from every entry, not just the
  # word. Later patterns such as "Fnish", "Sgapore", "Philippes" and "Fland"
  # appear to target the names that were mangled by this step.
  df$ArtistBio = gsub("in", "", df$ArtistBio)
  df$ArtistBio = gsub("Iranian", "Iran", df$ArtistBio)
  # NOTE(review): after the "in" removal above, "Spain" has become "Spa", so this
  # call blanks those entries entirely - confirm that dropping them is intended.
  df$ArtistBio = gsub("Spa", "",df$ArtistBio)
  df$ArtistBio = gsub("America America", "America", df$ArtistBio)
  df$ArtistBio = gsub("America Austria", "America", df$ArtistBio)
  df$ArtistBio = gsub("America Korea", "America", df$ArtistBio)
  df$ArtistBio = gsub("AngloIrish", "Ireland", df$ArtistBio)
  df$ArtistBio = gsub("Argente", "", df$ArtistBio)
  df$ArtistBio = gsub("Britian Britian", "", df$ArtistBio)
  df$ArtistBio = gsub("Columbia", "Colombia", df$ArtistBio)
  df$ArtistBio = gsub("Catalan", "Spain", df$ArtistBio)
  df$ArtistBio = gsub("Cambodian", "Cambodia", df$ArtistBio)
  df$ArtistBio = gsub("Cameroonian", "Cameroon",df$ArtistBio)
  df$ArtistBio = gsub("Cha", "", df$ArtistBio)
  df$ArtistBio = gsub("Ecuadorian", "Ecuador", df$ArtistBio)
  df$ArtistBio = gsub("Estonian", "Estonia", df$ArtistBio)
  df$ArtistBio = gsub("Fnish", "Finland", df$ArtistBio)
  df$ArtistBio = gsub("Finland Finland", "Finland", df$ArtistBio)
  df$ArtistBio = gsub("Ethiopian", "Ethiopia", df$ArtistBio)
  df$ArtistBio = gsub("Filipo America", "Philippines", df$ArtistBio)
  df$ArtistBio = gsub("Filipo", "Philippines", df$ArtistBio)
  df$ArtistBio = gsub("Fish", "Finland", df$ArtistBio)
  df$ArtistBio = gsub("France Britian", "France", df$ArtistBio)
  df$ArtistBio = gsub("France France", "France", df$ArtistBio)
  df$ArtistBio = gsub("Fred Chich", "", df$ArtistBio)
  df$ArtistBio = gsub("Georgian", "", df$ArtistBio)
  df$ArtistBio = gsub("Jackson Center", "", df$ArtistBio)
  df$ArtistBio = gsub("JordanianLebaneseBritiann", "", df$ArtistBio)
  df$ArtistBio = gsub("Brita", "Britain", df$ArtistBio)
  df$ArtistBio = gsub("Afghan", "Afghanistan", df$ArtistBio)
  df$ArtistBio = gsub("Albanian", "Albania", df$ArtistBio)
  df$ArtistBio = gsub("Algerian", "Algeria", df$ArtistBio)
  df$ArtistBio = gsub("Angolan", "Angola", df$ArtistBio)
  df$ArtistBio = gsub("HerzegovaCroatian", "Herzegova", df$ArtistBio)
  # Pattern and replacement are identical here, so this call changes nothing.
  df$ArtistBio = gsub("Burkinabe", "Burkinabe", df$ArtistBio)
  df$ArtistBio = gsub("Chili", "Chile", df$ArtistBio)
  df$ArtistBio = gsub("Colombian", "Colombia", df$ArtistBio)
  df$ArtistBio = gsub("Congolese", "DR Congo", df$ArtistBio)
  df$ArtistBio = gsub("Costa Rican", "Costa Rica", df$ArtistBio)
  df$ArtistBio = gsub("Croatian", "Croatia", df$ArtistBio)
  df$ArtistBio = gsub("Philippes", "", df$ArtistBio)
  df$ArtistBio = gsub("Fland", "", df$ArtistBio)
  df$ArtistBio = gsub("Icelandic", "Iceland", df$ArtistBio)
  df$ArtistBio = gsub("Indonesian", "Indonesia", df$ArtistBio)
  df$ArtistBio = gsub("naturalized America", "", df$ArtistBio)
  df$ArtistBio = gsub("and Israeli", "", df$ArtistBio)
  df$ArtistBio = gsub("Israeli", "Israel", df$ArtistBio)
  df$ArtistBio = gsub("Israel America", "Israel", df$ArtistBio)
  df$ArtistBio = gsub("Ivorian", "Cote d'ivoire", df$ArtistBio)
  df$ArtistBio = gsub("JapanAmerica", "America", df$ArtistBio)
  df$ArtistBio = gsub("British", "Britain", df$ArtistBio)
  df$ArtistBio = gsub("JordanianLebaneseBritian", "British", df$ArtistBio)
  df$ArtistBio = gsub("Kenyan", "Kenya", df$ArtistBio)
  df$ArtistBio = gsub("Kazakhstani", "Kazakhstan", df$ArtistBio)
  df$ArtistBio = gsub("Kuwaiti", "Kuwait", df$ArtistBio)
  df$ArtistBio = gsub("Kyrgyzstani", "Kyrgyzstan", df$ArtistBio)
  df$ArtistBio = gsub("Lebanese", "Lebanon", df$ArtistBio)
  df$ArtistBio = gsub("LebanonPalian", "Lebanon", df$ArtistBio)
  df$ArtistBio = gsub("Lithuanian", "Lithuania", df$ArtistBio)
  df$ArtistBio = gsub("Luxembourgish", "Luxembourg", df$ArtistBio)
  df$ArtistBio = gsub("Macedonian", "Macedonia", df$ArtistBio)
  df$ArtistBio = gsub("Mozambiquen", "Mozambique", df$ArtistBio)
  df$ArtistBio = gsub("Namibian", "Namibia", df$ArtistBio)
  df$ArtistBio = gsub("Netherlands", "Netherland", df$ArtistBio)
  df$ArtistBio = gsub("RussiaLithuania", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Sgapore", "Singapore", df$ArtistBio)
  df$ArtistBio = gsub("Slovak", "Slovakia", df$ArtistBio)
  df$ArtistBio = gsub("Slovakiaian", "Slovakia", df$ArtistBio)
  df$ArtistBio = gsub("Slovenian", "Slovakia", df$ArtistBio)
  df$ArtistBio = gsub("former Yugoslaviaia", "", df$ArtistBio)
  df$ArtistBio = gsub("Swizerland Swizerland", "Swizerland", df$ArtistBio)
  df$ArtistBio = gsub("UK", "Britain", df$ArtistBio)
  df$ArtistBio = gsub("Britain Britain", "", df$ArtistBio)
  # Final step of the chain: every remaining "America" becomes "US".
  df$ArtistBio = gsub("America", "US", df$ArtistBio)

# Create a new data frame from the cleaned nationality text: one row per country
# with the number of artworks ("stock").

  df5 <- data.frame(table(df$ArtistBio))
  names(df5) <- c("country", "stock")

  # Ordered (pattern, replacement) rules that map the remaining labels onto the
  # country names used by the world map. The rules are applied top to bottom and
  # later rules rely on earlier ones (e.g. "German" -> "Germany" creates the
  # "Germanyy" that is repaired two rules later), so do not reorder them.
  country_rules <- matrix(c(
    "German", "Germany",
    "German German German German German", "Germany",
    "Germanyy", "Germany",
    "Canada Canada", "Canada",
    "Czech Republic", "Czechoslovakia",
    "Republic", "",
    "USSR", "Russia",
    " now Slovenia", "",
    "Yugoslaviaian", "Yugoslavia",
    "Zimbabwean", "Zimbabwe",
    "Ukranian", "Ukraine",
    "Ukraian", "Ukraine",
    "Slovakia", "Czechoslovakia",
    "Scotland", "UK",
    "Ukraian", "Ukraine",
    "Panamanian", "Panama",
    "Paraguayan", "Paraguay",
    "Nicaraguan", "Nicaragua",
    "Irish", "UK",
    "Ireland", "UK",
    "Hungarian", "Hungary",
    "Guatemalan", "Guatemala",
    "Germany Poland", "Germany",
    "of Israel orig", "",
    "Germany Germany", "Germany",
    "Germany France", "Germany",
    "Germany Australia", "Germany",
    "Germany and France", "Germany",
    "France US", "France",
    "DR Congo", "Congo",
    "Cuba Cuba", "Cuba",
    "Britian", "UK",
    "British", "UK",
    "Britain", "UK",
    "US", "USA",
    "USASR", "Russia",
    "USSR", "Russia"
  ), ncol = 2, byrow = TRUE)

  country <- as.character(df5$country)
  for (i in seq_len(nrow(country_rules))) {
    country <- gsub(country_rules[i, 1], country_rules[i, 2], country)
  }

  # Leading/trailing blanks left behind by the removals would stop a name from
  # matching the map's region names, so strip them.
  df5$country <- trimws(country)

  # The counts were tabulated before the renaming, so several rows can now carry
  # the same country name (e.g. every label mapped to "UK"). Add them up so each
  # country appears exactly once with its total stock.
  df5 <- aggregate(stock ~ country, data = df5, FUN = sum)

# Mapping the different countries with their stock of artworks.
# Load ggplot2, maps and dplyr.

  library("ggplot2")
  library("maps")
  library("dplyr")

  map <- map_data("world") # world map polygons, one row per polygon vertex

# Merge the country counts with the map polygons (country name = map region).

  df.map <- full_join(df5, map, by = c("country" = "region"))

# Plot the stock variable.

  stock_map <- ggplot(df.map, aes(x = long, y = lat, group = group, fill = stock))
  stock_map +
    geom_polygon() +
    expand_limits(x = df.map$long, y = df.map$lat) +
    theme_minimal()

# Check which countries cannot be drawn: names in the stock table that have no
# matching region in the map. setdiff() compares the distinct names directly,
# avoiding an index vector whose length differs from df5$country.
  setdiff(df5$country, map$region)

# Change the scale of stock to log(stock) for more colour variation
# (USA is an outlier in the original data).

  df6 <- mutate(df5, logstock = log(stock))

# Merge the log-scaled counts with the map polygons.

  df.map_log <- full_join(df6, map, by = c("country" = "region"))

# Plot the logstock variable (differences are easier to see,
# but the scale is harder to interpret). The axis limits come from the same
# data frame that is being plotted.

  stock_map <- ggplot(df.map_log, aes(x = long, y = lat, group = group, fill = logstock))
  stock_map +
    geom_polygon() +
    expand_limits(x = df.map_log$long, y = df.map_log$lat) +
    theme_minimal()
  setdiff(df6$country, map$region)

Question #8 ---------------

# Preparation: packages loaded for the painting-size analysis.
  library("readr")
  library("sqldf")
  library("caroline")
  library("plyr")
  library("stringr")

# Keep only the objects catalogued as "Painting", as a plain data frame.
  df8 <- data.frame(df[which(df$Classification == "Painting"), ])

# Clean the free-text dimensions: patch one known-bad record and normalise the
# separator characters so that every rectangle reads "<a> x <b> <unit>".
  df8$Dimensions <- ifelse(df8$ObjectID == "127893", "Framed (47.5 x 37.3 cm)", df8$Dimensions)
  df8$Dimensions <- gsub("-", "x", df8$Dimensions)
  df8$Dimensions <- gsub("X", "x", df8$Dimensions)

# Manual corrections, keyed by ObjectID.
# rect_overrides: the rectangle to use instead of whatever the text contains.
# circle_excluded: records that must not be counted as circular artworks.
  rect_overrides <- c(
    "80301"  = "(18.03 x 25.4 cm)",
    "79760"  = "(116.8 x 110.5 cm)",
    "79715"  = "(274.3 x 182.9 cm)",
    "79083"  = "(182.9 x 91.4 cm)",
    "191856" = "(108.9 x 222.9 cm)",
    "79462"  = "(185.4 x 1643.3 cm)"
  )
  circle_excluded <- c("79582", "79583", "79462")

# Size patterns. The capture groups hold the numbers; an artwork made of several
# panels simply produces several matches.
# NOTE(review): "\\d+\\.*\\d+" needs at least two digits, so a one-digit
# measure such as "(5 x 7 cm)" is not recognised - confirm whether that matters.
  rect_pattern   <- "\\((\\d+\\.*\\d+)\\s\\w\\s(\\d+\\.*\\d+)\\s+\\w\\w\\)"
  circle_pattern <- "\\((\\d+\\.*\\d+)\\s\\w\\w\\)"

# For every string in `text`, return a numeric matrix with one row per match of
# `pattern` and one column per capture group (zero rows when nothing matches).
  extract_measures <- function(text, pattern) {
    text[is.na(text)] <- ""
    lapply(str_match_all(text, pattern), function(m) {
      matrix(as.numeric(m[, -1, drop = FALSE]), nrow = nrow(m), ncol = ncol(m) - 1)
    })
  }

# Build the area table for the rows of `data` that have at least one match.
# `component_area` turns a measure matrix into one area per component; the area
# of an artwork is the sum of the areas of its components.
# Added columns: dim2 (measures as "axb-cxd"), n (number of components), area.
  area_table <- function(data, measures, component_area) {
    n_parts <- vapply(measures, nrow, integer(1))
    keep <- n_parts > 0
    out <- data[keep, ]
    out$dim2 <- vapply(
      measures[keep],
      function(m) paste(apply(m, 1, paste, collapse = "x"), collapse = "-"),
      character(1)
    )
    out$n <- n_parts[keep]
    out$area <- vapply(measures[keep], function(m) sum(component_area(m)), numeric(1))
    out
  }

  object_id <- as.character(df8$ObjectID)

# Rectangular artworks: width x height, summed over all panels.
  rect_text <- df8$Dimensions
  has_override <- object_id %in% names(rect_overrides)
  rect_text[has_override] <- rect_overrides[object_id[has_override]]
  rect_measures <- extract_measures(rect_text, rect_pattern)
  df_rect <- area_table(df8, rect_measures, function(m) m[, 1] * m[, 2])

# Circular artworks: a single measure per component.
# NOTE(review): the measure is used as the radius, as in the original analysis;
# if the catalogue value is a diameter the areas are four times too large - confirm.
  circle_measures <- extract_measures(df8$Dimensions, circle_pattern)
  circle_measures[object_id %in% circle_excluded] <- list(matrix(numeric(0), nrow = 0, ncol = 1))
  df_circle <- area_table(df8, circle_measures, function(m) pi * m[, 1]^2)

# Unify into a single table and sort by area.
  dfarea <- data.frame(rbind(df_rect, df_circle))
  dfarea <- data.frame(dfarea[order(dfarea$area), ])

# Final result: the five smallest and the five largest artworks.
  dfareah5l5 <- data.frame(rbind(head(dfarea, 5), tail(dfarea, 5)))
  dfareah5l5
sebastianbarfort commented 9 years ago

Ok assignment.

For plotting time series data, consider using line plots instead of histograms.

Your approach in question 7 is perhaps not the best way to approach the problem, see my solution for inspiration.

Your regular expressions in question 8 are very inefficient, see my solution instead :)

APPROVED