# Question #1 ---------------

# Read the data set (MoMA artworks catalogue, downloaded from GitHub).

  library("readr")

  df <- read_csv("https://raw.githubusercontent.com/MuseumofModernArt/collection/master/Artworks.csv")
  names(df)
  head(df)

# To prepare the data, extract the month from the acquisition date.
# label = TRUE returns the month as a labelled factor instead of a number.

  library("lubridate")
  df$month <- month(df$DateAcquired, label = TRUE)

# Create a new data frame with the stock per month: table() counts the rows
# for each month and data.frame() turns the counts into two columns.

  df2 <- data.frame(table(df$month))

# Name the variables.

  names(df2) <- c("Month", "Stock")

# Question #2 --------

# Plot the graph.

# A bar graph is the most suitable for this data because we are plotting one
# variable (the month) against its frequency.

  library(ggplot2)

  p <- ggplot(data = df2, aes(x = Month, y = Stock))

# df2 already holds the counts, so the bars must use the y values as their
# heights. geom_col() does exactly that; a histogram geom would normally bin
# raw observations and only worked here through stat = "identity".
# Bars are outlined and filled in red; then the axes are labelled, a title is
# added and the minimal theme is applied.

  p <- p +
    geom_col(color = "red", fill = "red") +
    theme_minimal() +
    labs(
      x = "Month",
      y = "The Stock of Painting",
      title = "The Stock of Paintings at MOMA for Each Month"
    )
  p

# Question #3 ------------

  # Acquisition month (repeats the computation from Question #1).
  df$month <- month(df$DateAcquired, label = TRUE)

  # Count the artworks for every combination of month and CuratorApproved.
  df3 <- data.frame(table(df$month, df$CuratorApproved))
  colnames(df3) <- c("Month", "CuratorApproved", "Quantity")

  # Variant 1: one stacked bar per month, coloured by CuratorApproved.
  # The counts are pre-computed, hence geom_col() rather than a histogram.
  ggplot(df3, aes(x = Month, y = Quantity, fill = CuratorApproved)) +
    geom_col()

# or

  # Variant 2: one panel per CuratorApproved value.
  ggplot(df3, aes(x = Month, y = Quantity)) +
    geom_col(aes(fill = factor(CuratorApproved))) +
    facet_wrap(~CuratorApproved)

# Question #4 -----------

# Create a new data frame: number of artworks for every combination of
# department and acquisition month (this replaces the df3 of Question #3).

  df3 <- data.frame(table(df$Department, df$month))

# Name the variables.

  names(df3) <- c("Department", "Month", "Stock")

# Question #5 ----------

# Plotting the graphs. df3 already contains the counts, so every chart uses
# geom_col(), which draws the y values as bar heights.

# Monthly increase for each department (one panel per department):
  p3 <- ggplot(data = df3, aes(x = Month, y = Stock))
  p3 + geom_col(color = "red", fill = "red") + facet_wrap(~Department)

# Increase by department, separated for every month (one panel per month):
  Ptry <- ggplot(data = df3, aes(x = Department, y = Stock))
  Ptry + geom_col(colour = "red", fill = "red") + facet_wrap(~Month)

# General stock increase: the monthly rows of each department are stacked
# into a single bar.
  Ptry2 <- ggplot(data = df3, aes(x = Department, y = Stock))
  Ptry2 + geom_col(colour = "red", fill = "red")

# Question #6 ------------

# Make a new data frame: number of artworks per artist.
  df4 <- data.frame(table(df$Artist))
  names(df4) <- c("Artist", "Stock")

# Order by stock, largest first. The index vector gets its own name so that
# base::list is not shadowed, and TRUE is spelled out because T can be
# reassigned.
  stock_order <- order(df4$Stock, decreasing = TRUE)
  df4 <- df4[stock_order, ]

# Show the top 12 artists by stock.
  df4_top12 <- data.frame(df4$Artist[1:12], df4$Stock[1:12])

# Delete "unknowns" or "empties"; yields the top 10.
# NOTE(review): rows 4 and 6 are removed by position, which is only right for
# the data snapshot the authors inspected -- verify against df4_top12 whenever
# the source CSV changes.
  df4_top10 <- df4_top12[-c(4, 6), ]
  names(df4_top10) <- c("Artist", "Stock")

# Result (keeps the row numbering of the previous top 12).
  df4_top10

# Question #7 -------------

# Clean up the data by removing special characters and renaming nationalities
# into country names.
#
# Every statement below rewrites df$ArtistBio in place. The substitutions run
# strictly in the order written and later patterns match the output of earlier
# ones (for example "Russian Russian" is collapsed before the bare "Russian"
# rule runs), so the statements must not be reordered.

  library(stringr)

  # keep only the text before the first comma
  df$ArtistBio = gsub(",.*$", "", df$ArtistBio)
  # drop the first character (presumably an opening parenthesis -- TODO confirm)
  df$ArtistBio = substr(df$ArtistBio, 2, nchar(df$ArtistBio))
  df$ArtistBio = gsub("and Swiss", "", df$ArtistBio)
  # NOTE(review): empty pattern and empty replacement leave the text unchanged;
  # presumably a special character was lost from the pattern -- confirm
  df$ArtistBio = gsub("", "", df$ArtistBio)
  # remove digits and punctuation
  df$ArtistBio = gsub("[1-9]","",df$ArtistBio)
  df$ArtistBio = str_replace_all(df$ArtistBio, "[:punct:]", "")
  df$ArtistBio = str_replace_all(df$ArtistBio, "[:digit:]", "")
  # remove filler words; "est" is removed wherever it occurs, also inside
  # longer words ("ablished" below is what is left of "established")
  df$ArtistBio = gsub("Various", "", df$ArtistBio)
  df$ArtistBio = gsub("est", "", df$ArtistBio)
  df$ArtistBio = gsub("founded", "", df$ArtistBio)
  df$ArtistBio = gsub("ablished", "", df$ArtistBio)
  df$ArtistBio = gsub("born", "", df$ArtistBio)
  df$ArtistBio = gsub("Nationality unknown Nationality unknown", "", df$ArtistBio)
  df$ArtistBio = gsub("Nationality unknown", "", df$ArtistBio)
  # nationality adjective -> country name; doubled or combined values are
  # collapsed to a single country
  df$ArtistBio = gsub("Russian Russian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Russia Russian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("RussiaLithuanian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Russian", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Saudi Arabian", "Saudi Arabia", df$ArtistBio)
  df$ArtistBio = gsub("American", "America", df$ArtistBio)
  df$ArtistBio = gsub("French", "France", df$ArtistBio)
  df$ArtistBio = gsub("Serbian", "Serbia", df$ArtistBio)
  df$ArtistBio = gsub("Romanian", "Romania", df$ArtistBio)
  df$ArtistBio = gsub("active", "", df$ArtistBio)
  df$ArtistBio = gsub("Active", "", df$ArtistBio)
  df$ArtistBio = gsub("Scottish", "Scotland", df$ArtistBio)
  df$ArtistBio = gsub("and British", "", df$ArtistBio)
  df$ArtistBio = gsub("and Canadian", "", df$ArtistBio)
  df$ArtistBio = gsub("and Mexican", "", df$ArtistBio)
  df$ArtistBio = gsub("and Swedish", "", df$ArtistBio)
  df$ArtistBio = gsub("and Danish", "", df$ArtistBio)
  df$ArtistBio = gsub("Ghanaian", "Ghana", df$ArtistBio)
  df$ArtistBio = gsub("Italian", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Italy Italy", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Japanese", "Japan",df$ArtistBio )
  df$ArtistBio = gsub("JapaneseAmerica", "America", df$ArtistBio)
  df$ArtistBio = gsub("Japan Japan", "Japan", df$ArtistBio)
  df$ArtistBio = gsub("Korean", "Korea", df$ArtistBio)
  df$ArtistBio = gsub("Malaysian", "Malaysia", df$ArtistBio)
  df$ArtistBio = gsub("Mexican", "Mexico",df$ArtistBio )
  df$ArtistBio = gsub("Mexico Mexico", "Mexico", df$ArtistBio)
  df$ArtistBio = gsub("MexicoCanadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("nationality unknown", "", df$ArtistBio)
  # the following rules repeat the Ghanaian ... "nationality unknown" rules
  # above; "Korean" has already been rewritten to "Korea" by then, so the
  # "South Korea" rule finds nothing left to replace
  df$ArtistBio = gsub("Ghanaian", "Ghana", df$ArtistBio)
  df$ArtistBio = gsub("Italian", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Italy Italy", "Italy", df$ArtistBio)
  df$ArtistBio = gsub("Japanese", "Japan",df$ArtistBio) 
  df$ArtistBio = gsub("JapaneseAmerica", "America", df$ArtistBio)
  df$ArtistBio = gsub("Japan Japan", "Japan", df$ArtistBio)
  df$ArtistBio = gsub("Korean", "South Korea", df$ArtistBio)
  df$ArtistBio = gsub("Malaysian", "Malaysia", df$ArtistBio)
  df$ArtistBio = gsub("Mexican", "Mexico", df$ArtistBio)
  df$ArtistBio = gsub("Mexico Mexico", "Mexico", df$ArtistBio)
  df$ArtistBio = gsub("MexicoCanadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("nationality unknown", "", df$ArtistBio)
  df$ArtistBio = gsub("Nationality Unknown", "",df$ArtistBio)
  df$ArtistBio = gsub("Spanish", "Spain", df$ArtistBio)
  df$ArtistBio = gsub("Portuguese", "Portugal",df$ArtistBio)
  df$ArtistBio = gsub("British", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("USA", "America", df$ArtistBio)
  df$ArtistBio = gsub("Native America", "America", df$ArtistBio)
  df$ArtistBio = gsub("Osaka", "Japan",df$ArtistBio)
  df$ArtistBio = gsub("Norwegian", "Norway", df$ArtistBio)
  df$ArtistBio = gsub("New Zealander", "New Zealand", df$ArtistBio)
  df$ArtistBio = gsub("Australian ", "Australia", df$ArtistBio)
  df$ArtistBio = gsub("Canadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("Austrian", "Austria", df$ArtistBio)
  df$ArtistBio = gsub("Nowegian", "Norway", df$ArtistBio)
  df$ArtistBio = gsub("November", "", df$ArtistBio)
  df$ArtistBio = gsub("Pakistani", "Pakistan", df$ArtistBio)
  df$ArtistBio = gsub("Polish", "Poland",df$ArtistBio)
  df$ArtistBio = gsub("Peruvian", "Peru",df$ArtistBio)
  df$ArtistBio = gsub("Poland Poland", "Poland",df$ArtistBio)
  df$ArtistBio = gsub("MexicoCanadian", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("Senegalese", "Senegal", df$ArtistBio)
  df$ArtistBio = gsub("Singaporean", "Singapore", df$ArtistBio)
  df$ArtistBio = gsub("artists group", "", df$ArtistBio)
  df$ArtistBio = gsub("South African", "South Africa", df$ArtistBio)
  df$ArtistBio = gsub("Sudanese", "Sudan", df$ArtistBio)
  df$ArtistBio = gsub("Swedish", "Sweden", df$ArtistBio)
  df$ArtistBio = gsub("Swizerland Swizerland", "Swizerland", df$ArtistBio)
  df$ArtistBio = gsub("Swiss", "Swizerland", df$ArtistBio)
  df$ArtistBio = gsub("Syrian", "Syria", df$ArtistBio)
  df$ArtistBio = gsub("Taiwanese", "Taiwan", df$ArtistBio)
  df$ArtistBio = gsub("The Netherlands", "Netherlands", df$ArtistBio)
  df$ArtistBio = gsub("Thai", "Thailand", df$ArtistBio)
  df$ArtistBio = gsub("Tajik", "Tajikistan", df$ArtistBio)
  df$ArtistBio = gsub("Tanzanian", "Tanzania", df$ArtistBio)
  df$ArtistBio = gsub("Tunisian", "Tunisia", df$ArtistBio)
  df$ArtistBio = gsub("Turkish", "Turkey", df$ArtistBio)
  df$ArtistBio = gsub("and America", "", df$ArtistBio)
  df$ArtistBio = gsub("Britian Britian", "", df$ArtistBio)
  df$ArtistBio = gsub("America Germany", "America", df$ArtistBio)
  df$ArtistBio = gsub("America America", "America", df$ArtistBio)
  df$ArtistBio = gsub("Britian France", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Belgian", "Belgium", df$ArtistBio)
  df$ArtistBio = gsub("British", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Brazilian", "Brazil", df$ArtistBio)
  df$ArtistBio = gsub("Argentinian", "Argentina", df$ArtistBio)
  df$ArtistBio = gsub("FranceMoroccan", "France", df$ArtistBio)
  df$ArtistBio = gsub("Latvian", "Latvia", df$ArtistBio)
  df$ArtistBio = gsub("Venezuelan", "Venezuela", df$ArtistBio)
  df$ArtistBio = gsub("Venezuela Venezuela", "Venezuela", df$ArtistBio)
  df$ArtistBio = gsub("Vietnamese", "Vietnam", df$ArtistBio)
  df$ArtistBio = gsub("Vietnam America", "America", df$ArtistBio)
  df$ArtistBio = gsub("Welsh", "Britian", df$ArtistBio)
  # "Yugoslav" also matches inside "Yugoslavia"/"Yugoslavian", which therefore
  # become "Yugoslaviaia"/"Yugoslaviaian"; later rules target those forms
  df$ArtistBio = gsub("Yugoslav", "Yugoslavia", df$ArtistBio)
  # These five rules originally read from `dfartistbio$country`, an object that
  # is never created in this script, so each call stopped with
  # "object 'dfartistbio' not found". They now read from df$ArtistBio like
  # every other rule in the chain.
  df$ArtistBio <- gsub("now Croatia", "", df$ArtistBio)
  df$ArtistBio <- gsub("now Slovenia", "", df$ArtistBio)
  # "Yugoslaviaian" is what the preceding "Yugoslav" -> "Yugoslavia" rule
  # leaves of "Yugoslavian"
  df$ArtistBio <- gsub("Yugoslaviaian", "Yugoslavia", df$ArtistBio)
  df$ArtistBio <- gsub("Zagreb", "Croatia", df$ArtistBio)
  df$ArtistBio <- gsub("Zimbabwean", "Zimbabwe", df$ArtistBio)
  # Second half of the ordered substitution chain on df$ArtistBio. As before,
  # each rule works on the output of the rules before it, so the order of the
  # statements is part of the logic.
  df$ArtistBio = gsub(" then Southern Rhodesia", "", df$ArtistBio)
  df$ArtistBio = gsub("Germany Germany Germany", "Germany", df$ArtistBio)
  df$ArtistBio = gsub("Moroccan", "Morocco", df$ArtistBio)
  df$ArtistBio = gsub("Mozambica", "Mozambique", df$ArtistBio)
  df$ArtistBio = gsub("Malian", "Mali", df$ArtistBio)
  df$ArtistBio = gsub("Mauritanian", "Mauritania", df$ArtistBio)
  df$ArtistBio = gsub("Egyptian", "Eqypt", df$ArtistBio)
  df$ArtistBio = gsub("English", "Britian", df$ArtistBio)
  df$ArtistBio = gsub("Dutch", "Netherland", df$ArtistBio)
  df$ArtistBio = gsub("Netherland Netherland", "Netherland", df$ArtistBio)
  df$ArtistBio = gsub("and Icelandic", "", df$ArtistBio)
  df$ArtistBio = gsub("Danish", "Denmark", df$ArtistBio)
  df$ArtistBio = gsub("Czech Czech", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Czech", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Czech Republic RepublicRepublic", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Czech Republicoslovakian", "Czech Republic", df$ArtistBio)
  df$ArtistBio = gsub("Cuban", "Cuba", df$ArtistBio)
  df$ArtistBio = gsub("Cuba Cuba", "Cuba", df$ArtistBio)
  df$ArtistBio = gsub("Colombian", "Colombia", df$ArtistBio)
  df$ArtistBio = gsub("Columbian ", "Colombia", df$ArtistBio)
  # "Chili" is rewritten to "Chile" further down
  df$ArtistBio = gsub("Chilean", "Chili", df$ArtistBio)
  df$ArtistBio = gsub("Chinese", "China", df$ArtistBio)
  df$ArtistBio = gsub("Inuit", "Canada", df$ArtistBio)
  df$ArtistBio = gsub("Bulgarian", "Bulgria", df$ArtistBio)
  df$ArtistBio = gsub("Britian America", "Britian",df$ArtistBio)
  df$ArtistBio = gsub("of Palinian origin", "", df$ArtistBio)
  df$ArtistBio = gsub("and Iranian", "", df$ArtistBio)
  df$ArtistBio = gsub("Barcelona", "Spain", df$ArtistBio)
  df$ArtistBio = gsub("Bolivian", "Bolivia", df$ArtistBio)
  df$ArtistBio = gsub("Azerbaijani", "Azerbaijan", df$ArtistBio)
  df$ArtistBio = gsub("Australian", "Australia", df$ArtistBio)
  df$ArtistBio = gsub("Bahamian", "Bahamas", df$ArtistBio)
  df$ArtistBio = gsub("Great Britain To Zimbabwe", "Britain",df$ArtistBio)
  df$ArtistBio = gsub("Bosnian", "Bosnia and Herzegovina", df$ArtistBio)
  df$ArtistBio = gsub("bosnia and herzegovinaCroatian", "Bosnia and Herzegovina",df$ArtistBio)
  df$ArtistBio = gsub("bosnia and herzegovina", "Bosnia and Herzegovina", df$ArtistBio)
  df$ArtistBio = gsub("Indian", "India", df$ArtistBio)
  df$ArtistBio = gsub("scotland", "Scotland", df$ArtistBio)
  df$ArtistBio = gsub("To Zimbabwe", "", df$ArtistBio)
  df$ArtistBio = gsub("Great", "", df$ArtistBio)
  # NOTE(review): this deletes the two letters "in" wherever they occur, also
  # inside names produced by earlier rules ("Spain" becomes "Spa", "Britain"
  # becomes "Brita"); several later patterns ("Fnish", "Sgapore", "Brita", ...)
  # appear to exist only to repair those fragments -- confirm before changing
  df$ArtistBio = gsub("in", "", df$ArtistBio)
  df$ArtistBio = gsub("Iranian", "Iran", df$ArtistBio)
  # NOTE(review): together with the "in" rule above this erases every "Spain"
  # value -- confirm that this is intended
  df$ArtistBio = gsub("Spa", "",df$ArtistBio)
  df$ArtistBio = gsub("America America", "America", df$ArtistBio)
  df$ArtistBio = gsub("America Austria", "America", df$ArtistBio)
  df$ArtistBio = gsub("America Korea", "America", df$ArtistBio)
  df$ArtistBio = gsub("AngloIrish", "Ireland", df$ArtistBio)
  df$ArtistBio = gsub("Argente", "", df$ArtistBio)
  df$ArtistBio = gsub("Britian Britian", "", df$ArtistBio)
  df$ArtistBio = gsub("Columbia", "Colombia", df$ArtistBio)
  df$ArtistBio = gsub("Catalan", "Spain", df$ArtistBio)
  df$ArtistBio = gsub("Cambodian", "Cambodia", df$ArtistBio)
  df$ArtistBio = gsub("Cameroonian", "Cameroon",df$ArtistBio)
  df$ArtistBio = gsub("Cha", "", df$ArtistBio)
  df$ArtistBio = gsub("Ecuadorian", "Ecuador", df$ArtistBio)
  df$ArtistBio = gsub("Estonian", "Estonia", df$ArtistBio)
  df$ArtistBio = gsub("Fnish", "Finland", df$ArtistBio)
  df$ArtistBio = gsub("Finland Finland", "Finland", df$ArtistBio)
  df$ArtistBio = gsub("Ethiopian", "Ethiopia", df$ArtistBio)
  df$ArtistBio = gsub("Filipo America", "Philippines", df$ArtistBio)
  df$ArtistBio = gsub("Filipo", "Philippines", df$ArtistBio)
  df$ArtistBio = gsub("Fish", "Finland", df$ArtistBio)
  df$ArtistBio = gsub("France Britian", "France", df$ArtistBio)
  df$ArtistBio = gsub("France France", "France", df$ArtistBio)
  df$ArtistBio = gsub("Fred Chich", "", df$ArtistBio)
  df$ArtistBio = gsub("Georgian", "", df$ArtistBio)
  df$ArtistBio = gsub("Jackson Center", "", df$ArtistBio)
  df$ArtistBio = gsub("JordanianLebaneseBritiann", "", df$ArtistBio)
  df$ArtistBio = gsub("Brita", "Britain", df$ArtistBio)
  df$ArtistBio = gsub("Afghan", "Afghanistan", df$ArtistBio)
  df$ArtistBio = gsub("Albanian", "Albania", df$ArtistBio)
  df$ArtistBio = gsub("Algerian", "Algeria", df$ArtistBio)
  df$ArtistBio = gsub("Angolan", "Angola", df$ArtistBio)
  df$ArtistBio = gsub("HerzegovaCroatian", "Herzegova", df$ArtistBio)
  # pattern and replacement are identical, so this rule changes nothing
  df$ArtistBio = gsub("Burkinabe", "Burkinabe", df$ArtistBio)
  df$ArtistBio = gsub("Chili", "Chile", df$ArtistBio)
  df$ArtistBio = gsub("Colombian", "Colombia", df$ArtistBio)
  df$ArtistBio = gsub("Congolese", "DR Congo", df$ArtistBio)
  df$ArtistBio = gsub("Costa Rican", "Costa Rica", df$ArtistBio)
  df$ArtistBio = gsub("Croatian", "Croatia", df$ArtistBio)
  df$ArtistBio = gsub("Philippes", "", df$ArtistBio)
  df$ArtistBio = gsub("Fland", "", df$ArtistBio)
  df$ArtistBio = gsub("Icelandic", "Iceland", df$ArtistBio)
  df$ArtistBio = gsub("Indonesian", "Indonesia", df$ArtistBio)
  df$ArtistBio = gsub("naturalized America", "", df$ArtistBio)
  df$ArtistBio = gsub("and Israeli", "", df$ArtistBio)
  df$ArtistBio = gsub("Israeli", "Israel", df$ArtistBio)
  df$ArtistBio = gsub("Israel America", "Israel", df$ArtistBio)
  df$ArtistBio = gsub("Ivorian", "Cote d'ivoire", df$ArtistBio)
  df$ArtistBio = gsub("JapanAmerica", "America", df$ArtistBio)
  df$ArtistBio = gsub("British", "Britain", df$ArtistBio)
  df$ArtistBio = gsub("JordanianLebaneseBritian", "British", df$ArtistBio)
  df$ArtistBio = gsub("Kenyan", "Kenya", df$ArtistBio)
  df$ArtistBio = gsub("Kazakhstani", "Kazakhstan", df$ArtistBio)
  df$ArtistBio = gsub("Kuwaiti", "Kuwait", df$ArtistBio)
  df$ArtistBio = gsub("Kyrgyzstani", "Kyrgyzstan", df$ArtistBio)
  df$ArtistBio = gsub("Lebanese", "Lebanon", df$ArtistBio)
  df$ArtistBio = gsub("LebanonPalian", "Lebanon", df$ArtistBio)
  df$ArtistBio = gsub("Lithuanian", "Lithuania", df$ArtistBio)
  df$ArtistBio = gsub("Luxembourgish", "Luxembourg", df$ArtistBio)
  df$ArtistBio = gsub("Macedonian", "Macedonia", df$ArtistBio)
  df$ArtistBio = gsub("Mozambiquen", "Mozambique", df$ArtistBio)
  df$ArtistBio = gsub("Namibian", "Namibia", df$ArtistBio)
  df$ArtistBio = gsub("Netherlands", "Netherland", df$ArtistBio)
  df$ArtistBio = gsub("RussiaLithuania", "Russia", df$ArtistBio)
  df$ArtistBio = gsub("Sgapore", "Singapore", df$ArtistBio)
  df$ArtistBio = gsub("Slovak", "Slovakia", df$ArtistBio)
  df$ArtistBio = gsub("Slovakiaian", "Slovakia", df$ArtistBio)
  # NOTE(review): Slovenian is mapped to "Slovakia", not "Slovenia" -- confirm
  df$ArtistBio = gsub("Slovenian", "Slovakia", df$ArtistBio)
  df$ArtistBio = gsub("former Yugoslaviaia", "", df$ArtistBio)
  df$ArtistBio = gsub("Swizerland Swizerland", "Swizerland", df$ArtistBio)
  df$ArtistBio = gsub("UK", "Britain", df$ArtistBio)
  df$ArtistBio = gsub("Britain Britain", "", df$ArtistBio)
  # final spelling used for the United States at this stage is "US"
  df$ArtistBio = gsub("America", "US", df$ArtistBio)

# Create a new data frame from the cleaned data: one row per country with the
# number of artworks ("stock").

  df5 <- data.frame(table(df$ArtistBio))
  names(df5) <- c("country", "stock")

# Ordered rename rules, written as pattern / replacement pairs. They are
# applied top to bottom and later rules rely on the output of earlier ones
# (e.g. "German" -> "Germany" turns an existing "Germany" into "Germanyy",
# which the third rule repairs), so the order must be kept.
  country_rules <- matrix(
    c(
      "German", "Germany",
      "German German German German German", "Germany",
      "Germanyy", "Germany",
      "Canada Canada", "Canada",
      "Czech Republic", "Czechoslovakia",
      "Republic", "",
      "USSR", "Russia",
      " now Slovenia", "",
      "Yugoslaviaian", "Yugoslavia",
      "Zimbabwean", "Zimbabwe",
      "Ukranian", "Ukraine",
      "Ukraian", "Ukraine",
      "Slovakia", "Czechoslovakia",
      "Scotland", "UK",
      "Ukraian", "Ukraine",
      "Panamanian", "Panama",
      "Paraguayan", "Paraguay",
      "Nicaraguan", "Nicaragua",
      "Irish", "UK",
      "Ireland", "UK",
      "Hungarian", "Hungary",
      "Guatemalan", "Guatemala",
      "Germany Poland", "Germany",
      "of Israel orig", "",
      "Germany Germany", "Germany",
      "Germany France", "Germany",
      "Germany Australia", "Germany",
      "Germany and France", "Germany",
      "France US", "France",
      "DR Congo", "Congo",
      "Cuba Cuba", "Cuba",
      "Britian", "UK",
      "British", "UK",
      "Britain", "UK",
      "US", "USA",
      "USASR", "Russia",
      "USSR", "Russia"
    ),
    ncol = 2,
    byrow = TRUE,
    dimnames = list(NULL, c("pattern", "replacement"))
  )

  for (i in seq_len(nrow(country_rules))) {
    df5$country <- gsub(
      country_rules[i, "pattern"],
      country_rules[i, "replacement"],
      df5$country
    )
  }

# The renaming maps several original labels onto the same country (and can
# leave stray blanks around a name), so df5 would contain the same country on
# more than one row, each with only part of its stock. Trim the names and sum
# the stock so that every country appears exactly once; otherwise the join
# with the map data duplicates the polygons of such a country with
# conflicting fill values.
  df5$country <- trimws(df5$country)
  df5 <- aggregate(stock ~ country, data = df5, FUN = sum)

# Mapping the different countries with their stock of artworks.
# Load ggplot2 and maps (plus dplyr for the join and mutate).

  library("ggplot2")
  library("maps")
  library("dplyr")

  map <- map_data("world") # world map polygons, one row per polygon vertex

# Merge the two data sets; full_join keeps map regions without stock as well
# as countries without a matching map region.

  df.map <- full_join(df5, map, by = c("country" = "region"))

# Plotting or mapping the stock variable.

  stock_map <- ggplot(df.map, aes(x = long, y = lat, group = group, fill = stock))
  stock_map +
    geom_polygon() +
    expand_limits(x = df.map$long, y = df.map$lat) +
    theme_minimal()

# Check which countries cannot be graphed, i.e. have no region of the same
# name in the map data. setdiff() compares the two sets of names directly; the
# previous code indexed the full country column with a logical vector that was
# computed from the unique names only, so positions did not line up.
  setdiff(unique(df5$country), unique(map$region))

# Change the scale of stock to log(stock) for more colour variation
# (USA is an outlier in the original data).

  df6 <- mutate(df5, logstock = log(stock))

# Merge the two data sets.

  df.map_log <- full_join(df6, map, by = c("country" = "region"))

# Plotting or mapping the logstock variable (differences are easier to see,
# but the scale is harder to interpret). The limits are taken from the data
# frame that is actually plotted.

  stock_map <- ggplot(df.map_log, aes(x = long, y = lat, group = group, fill = logstock))
  stock_map +
    geom_polygon() +
    expand_limits(x = df.map_log$long, y = df.map_log$lat) +
    theme_minimal()
  setdiff(unique(df6$country), unique(map$region))

# Question #8 ---------------

# Preparatory step: load packages.
# NOTE(review): plyr is attached after dplyr (loaded in the mapping section),
# so functions that exist in both packages, such as mutate(), resolve to the
# plyr version from here on.
  library("readr")
  library("sqldf")
  library("caroline")
  library("plyr")
  library("stringr")

# Objects catalogued as "Painting" are selected.
  df8 = df[which(df$Classification == "Painting"),]

# Cleaning the database: one manual correction by ObjectID, then "-" and "X"
# are replaced by "x".
# NOTE(review): the two "-" replacements are identical, so the second one has
# nothing left to replace -- presumably one of them targeted a different dash
# character originally; confirm.
  df8$Dimensions = ifelse(df8$ObjectID == "127893", "Framed (47.5 x 37.3 cm)", df8$Dimensions)
  df8$Dimensions = gsub("-", "x", df8$Dimensions)
  df8$Dimensions = gsub("-", "x", df8$Dimensions)
  df8$Dimensions = gsub("X", "x", df8$Dimensions)

# Extracting the size in centimeters ("square" artworks): every parenthesised
# group of the form "(<number> <char> <number> <two chars>)". Each number must
# consist of at least two digits because of the \\d+\\.*\\d+ sub-pattern.
# str_extract_all() returns a list with one vector of matches per row.
  df8$dim2 = str_extract_all(df8$Dimensions, "\\(\\d+\\.*\\d+\\s\\w\\s\\d+\\.*\\d+\\s+\\w\\w\\)")

# Correcting the database: hard-coded sizes for individual ObjectIDs.
  df8$dim2 = ifelse(df8$ObjectID == "80301", "(18.03 x 25.4 cm)", df8$dim2)
  df8$dim2 = ifelse(df8$ObjectID == "79760", "(116.8 x 110.5 cm)", df8$dim2)
  df8$dim2 = ifelse(df8$ObjectID == "79715", "(274.3 x 182.9 cm)", df8$dim2)
  df8$dim2 = ifelse(df8$ObjectID == "79083", "(182.9 x 91.4 cm)", df8$dim2)
  df8$dim2 = ifelse(df8$ObjectID == "191856", "(108.9 x 222.9 cm)", df8$dim2)
  df8$dim2 = ifelse(df8$ObjectID == "79462", "(185.4 x 1643.3 cm)", df8$dim2)
  # gsub() works on the character form of dim2.
  # NOTE(review): presumably this removes the text "character(0)" that stands
  # for rows without any match -- confirm
  df8$dim2 = gsub("\\w+\\(\\d\\)", "", df8$dim2)

# Counting the number of paintings that compose the artwork: n is the number
# of "(" characters in dim2. obs: for n>2, the number of paintings is n-1
  q = df8$dim2
  w = "\\("
  q2 = gsub(w, "", q)
  df8$n = nchar(q) - nchar(q2)

# Homogenizing the patterns: keep only digits, "x", "." and parentheses, drop
# whitespace, turn ")(" into "-" and remove the remaining parentheses, which
# leaves strings such as "AxB" or "AxB-CxD".
  df8$dim2 = gsub("[^[:digit:]x\\(\\)\\.]", " ", df8$dim2)
  df8$dim2 = gsub("\\s", "", df8$dim2)
  df8$dim2 = gsub("\\)\\(", "-", df8$dim2)
  df8$dim2 = gsub("\\(", "", df8$dim2)
  df8$dim2 = gsub("\\)", "", df8$dim2)
  # sort by n and show how many artworks there are for each n
  df8 = data.frame(df8[order(df8$n), ])
  table(df8$n)

# Calculating areas.
# 1. separate the artworks into data frames by the number of paintings (n).
# 2. extract each measure of each painting.
# 3. calculate the area of each painting.
# 4. the area of an artwork is the sum of the areas of its paintings.
#
# The dimension strings have the form "AxB" (one painting) or
# "AxB-CxD-..." (several paintings). Instead of one hand-written regular
# expression per measure and per number of paintings, a single helper splits
# the string and sums the products.

# Sum of rectangle areas encoded in each element of `dims`.
#   dims:    character vector of strings such as "47.5x37.3" or "10.5x20-30x40".
#   returns: numeric vector of the same length; NA where an element is empty
#            or a piece does not consist of exactly two numbers.
rect_area_sum <- function(dims) {
  vapply(
    strsplit(as.character(dims), "-", fixed = TRUE),
    function(pieces) {
      sides <- strsplit(pieces, "x", fixed = TRUE)
      if (length(sides) == 0L || any(lengths(sides) != 2L)) {
        return(NA_real_)
      }
      sum(vapply(sides, function(s) prod(as.numeric(s)), numeric(1)))
    },
    numeric(1)
  )
}

  # n == 1: a single painting
  df81 <- df8[which(df8$n == 1), ]
  df81$area <- rect_area_sum(df81$dim2)

  # n == 3: two paintings (for n > 2 the number of paintings is n - 1)
  df82 <- df8[which(df8$n == 3), ]
  df82$area <- rect_area_sum(df82$dim2)

  # n == 4: three paintings
  df83 <- df8[which(df8$n == 4), ]
  df83$area <- rect_area_sum(df83$dim2)

  # n == 5: four paintings
  df84 <- df8[which(df8$n == 5), ]
  df84$area <- rect_area_sum(df84$dim2)

  # n == 6: five paintings
  df85 <- df8[which(df8$n == 6), ]
  df85$area <- rect_area_sum(df85$dim2)

  # n == 7: six paintings
  df86 <- df8[which(df8$n == 7), ]
  df86$area <- rect_area_sum(df86$dim2)

  # Extracting the size in centimeters ("circular" artworks): parenthesised
  # groups that hold a single number, e.g. "(50.8 cm)". This overwrites the
  # dim2 column that was used for the rectangular artworks.
    df8$dim2 <- str_extract_all(df8$Dimensions, "\\(\\d+\\.*\\d+\\s\\w\\w\\)")

  # Cleaning data (gsub() works on the character form of dim2).
    df8$dim2 <- gsub("\\w+\\(\\d\\)", "", df8$dim2)

  # Counting the number of paintings that compose the artwork: n is the number
  # of "(" characters in dim2. obs: for n>2, the number of paintings is n-1
    with_parens <- df8$dim2
    without_parens <- gsub("\\(", "", with_parens)
    df8$n <- nchar(with_parens) - nchar(without_parens)

  # Correcting data: these artworks get n = 0 so that they are not picked up
  # by the selections below.
    df8$n[df8$ObjectID %in% c("79582", "79583", "79462")] <- 0

  # Homogenizing the patterns: keep digits, "x", "." and parentheses, drop
  # whitespace, turn ")(" into "-" and remove the remaining parentheses.
    df8$dim2 <- gsub("[^[:digit:]x\\(\\)\\.]", " ", df8$dim2)
    df8$dim2 <- gsub("\\s", "", df8$dim2)
    df8$dim2 <- gsub("\\)\\(", "-", df8$dim2)
    df8$dim2 <- gsub("\\(", "", df8$dim2)
    df8$dim2 <- gsub("\\)", "", df8$dim2)

  # Calculating areas.
  # 1. separate the artworks by the number of paintings (n).
  # 2. extract the measure of each painting.
  # 3. calculate the area of each painting.
  # 4. the area of an artwork is the sum of the areas of its paintings.

  # Sum of circle areas for strings such as "50.8" or "50.8-30.5"; each number
  # is used as a radius and the constant pi replaces the former 3.14.
  # Returns NA for empty or non-numeric elements.
  # NOTE(review): the extracted value is treated as the radius; if the
  # catalogue lists diameters the areas are four times too large -- confirm.
    circle_area_sum <- function(dims) {
      vapply(
        strsplit(as.character(dims), "-", fixed = TRUE),
        function(radii) {
          if (length(radii) == 0L) {
            return(NA_real_)
          }
          sum(pi * as.numeric(radii)^2)
        },
        numeric(1)
      )
    }

    # n == 1: a single circular painting
    dfc81 <- df8[which(df8$n == 1), ]
    dfc81$area <- circle_area_sum(dfc81$dim2)

    # n == 3: two circular paintings
    dfc82 <- df8[which(df8$n == 3), ]
    dfc82$area <- circle_area_sum(dfc82$dim2)

 # Unifying the rectangular and circular results into a single data frame.
    dfarea <- data.frame(rbind(df81, df82, df83, df84, df85, df86, dfc81, dfc82))

  # Correcting "n" so that it indicates the number of paintings per artwork
  # (for n > 2 the count of "(" is one higher than the number of paintings).
    dfarea$n <- ifelse(dfarea$n > 2, dfarea$n - 1, dfarea$n)

  # Sort the data by area. na.last = NA drops artworks whose area could not be
  # computed; with the default they would be sorted to the end and reported
  # as the "largest" artworks by tail().
    dfarea <- data.frame(dfarea[order(dfarea$area, na.last = NA), ])

  # Final result: the five smallest and the five largest artworks.
    dfareah5l5 <- data.frame(rbind(head(dfarea, 5), tail(dfarea, 5)))
    dfareah5l5

sebastianbarfort / sds

Group 24: Assignment 1 #12

Question #1 ---------------

read the Data Set

to prepare the Data, need to extract the month from the date

create a new data frame by monthly stock

naming the variables

Question #2 --------

plot the graph

We believe that a bar graph is the most suitable for this data because we are plotting one variable against its frequency

We want the graph to be a histogram (for the reason explained above), with stat = "identity" to let R know that the values are already numerical counts and colour/fill set to red; then we label the axes, give the plot a title and set the theme to minimal

Question #3 ------------

or

Question #4 -----------

creating a new data frame

naming the variables

Question #5 ----------

Question #6 ------------

Question #7 -------------

Question #8 ---------------