jlin47 / mp4

0 stars 0 forks source link

Asking #7

Closed dasom1348 closed 6 years ago

dasom1348 commented 6 years ago
library(mdsr)
library(RMySQL)
db <- dbConnect_scidb(dbname = "imdb")
sql <- "
SELECT t.id, t.title, t.production_year, mii.info AS votes, mii2.info AS rating
FROM title t
JOIN movie_info_idx mii ON mii.movie_id = t.id
JOIN movie_info_idx mii2 ON mii2.movie_id = t.id
WHERE t.production_year >= 2008
AND t.kind_id = 1
AND mii.info_type_id = 100
AND mii2.info_type_id = 101
AND mii.info > 100000
ORDER BY mii2.info desc;
"
best_movies <- db %>%
  dbGetQuery(sql)
head(best_movies)

best_movies[!duplicated(best_movies[,c("title")]), ]

best_movies100 <- best_movies %>%
  head(100)
sql_2 <- "SELECT n.name, ci.role_id, person_id, ci.id, movie_id
FROM cast_info ci
JOIN name n ON n.id = ci.person_id
WHERE role_id = '8'
AND gender = 'f';
"
peopleF_info <- db %>%
  dbGetQuery(sql_2)
head(peopleF_info)
sql_3 <- "SELECT n.name, ci.role_id, person_id, ci.id, movie_id 
FROM cast_info ci
JOIN name n ON n.id = ci.person_id
WHERE role_id = '8'
AND gender = 'm';
"
peopleM_info <- db %>%
  dbGetQuery(sql_3)
head(peopleM_info)
Female_info<- peopleF_info %>%
  select(name, person_id, id, movie_id)%>%
  group_by(name, person_id, id, movie_id)%>%
  summarise(num=n()) 

Male_info<- peopleM_info %>%
  select(name, person_id, id, movie_id)%>%
  group_by(name, person_id, id, movie_id)%>%
  summarise(num=n()) 
# Join movie_id with best_movies100 for their movie_id

sql_4 <- "
SELECT movie_id, title, production_year, id 
FROM aka_title;
"
movie_id <- db %>%
  dbGetQuery(sql_4)

movie_id1<- movie_id %>%
  filter(production_year>=2008)
 group_by (movie_id, title) %>%
  summarise (num = n())

head(movie_id)

# best 100 movies with their movie_id
movie_id100 <- movie_id1 %>%
  inner_join(best_movies100, by = "title", "production_year") %>%
  group_by(title, movie_id, production_year, votes, rating) %>%
  summarize(num_1=n())

movie_id[!duplicated(movie_id[,c("title")]), ]

#Join people info data frames with best movies data frame
Movie_F<- movie_id %>%
  inner_join(Female_info, by = "movie_id")

Movie_M <- movie_id100 %>%
   inner_join(Male_info, by= "movie_id")

# Removing repeatition 
Movie_F <- Movie_F %>%
  group_by(movie_id, title, name, production_year, votes, rating, person_id) 
dasom1348 commented 6 years ago

Also, we might not figure out the revenue data in our database.. we can ask to professor but,, Even if we couldn't find it, since our goal is to check whether there is still gender bias in a film field like 'the films by female directors would have low quality' through real data, I think the high ratings and votes would mean high popularity and quality and maybe also cause high revenue. ratings and votes would be enough to prove high quality of the movie. So, it would be fine, if we are not able to find revenue data. How do u think about this??

jlin47 commented 6 years ago
library(mdsr)
library(RMySQL)
db <- dbConnect_scidb(dbname = "imdb")

db %>%
  dbGetQuery("SELECT * FROM kind_type;")

sql <- "
SELECT t.id, t.title, t.production_year, mii.info AS votes, mii2.info AS rating
FROM title t
JOIN movie_info_idx mii ON mii.movie_id = t.id
JOIN movie_info_idx mii2 ON mii2.movie_id = t.id
WHERE t.production_year >= 2008
AND t.kind_id = 1
AND mii.info_type_id = 100
AND mii2.info_type_id = 101
AND mii.info > 100000
ORDER BY mii2.info desc;
"
best_movies <- db %>%
  dbGetQuery(sql)
head(best_movies)

best_movies[!duplicated(best_movies[,c("title")]), ]

best_movies100 <- best_movies %>%
  head(100)
sql_2 <- "SELECT n.name, ci.role_id, person_id, ci.id, movie_id
FROM cast_info ci
JOIN name n ON n.id = ci.person_id
WHERE role_id = '8'
AND gender = 'f';
"
peopleF_info <- db %>%
  dbGetQuery(sql_2)
head(peopleF_info)

sql_3 <- "SELECT n.name, ci.role_id, person_id, ci.id, movie_id 
FROM cast_info ci
JOIN name n ON n.id = ci.person_id
WHERE role_id = '8'
AND gender = 'm';
"
peopleM_info <- db %>%
  dbGetQuery(sql_3)
head(peopleM_info)
Female_info<- peopleF_info %>%
  select(name, person_id, id, movie_id)%>%
  group_by(name, person_id, id, movie_id)%>%
  summarise(num=n()) 

Male_info<- peopleM_info %>%
  select(name, person_id, id, movie_id)%>%
  group_by(name, person_id, id, movie_id)%>%
  summarise(num=n()) 

# Join movie_id with best_movies100 for their movie_id

sql_4 <- "
SELECT movie_id, title, production_year, id 
FROM aka_title;
"
movie_id <- db %>%
  dbGetQuery(sql_4)

movie_id1<- movie_id %>%
  filter(production_year>=2008)
 group_by (movie_id, title) %>%
  summarise (num = n())

head(movie_id)

# best 100 movies with their movie_id
movie_id4 <- movie_id1 %>%
  inner_join(best_movies, by = c("title"= "title", "production_year" = "production_year")) 

movie_id100<-movie_id4 %>%
  head (100)

movie_id100[!duplicated(movie_id100[,c("title")]), ]

#Join people info data frames with best movies data frame
Movie_F<- movie_id100 %>%
  inner_join(Female_info, by = "movie_id")

movie_id100[!duplicated(Movie_F[,c("title")]), ]

Movie_M <- movie_id100 %>%
   inner_join(Male_info, by= "movie_id")

movie_id100[!duplicated(Movie_M[,c("title")]), ]

# Removing repeatition 
Movie_F <- Movie_F %>%
  group_by(movie_id, title, name, production_year, votes, rating, person_id) 
sql_gross <- "
SELECT t.id, t.title, mi.info AS gross_info
FROM title t
JOIN movie_info mi ON mi.movie_id = t.id;
"

gross <- db %>%
dbGetQuery(sql_gross)
head(gross)

dbGetQuery(db, "SELECT * FROM info_type WHERE info = 'gross';")

dbGetQuery(db, "
SELECT * FROM info_type 
WHERE info LIKE '%gross%';
")

movie_gross<- dbGetQuery(db, "
SELECT t.id, t.title, mi.info AS gross_info, movie_id, production_year
FROM title t
JOIN movie_info mi ON mi.movie_id = t.id
WHERE mi.info_type_id = 107
")

movie_gross_1 <- movie_gross %>%
  group_by (movie_id)%>%
  filter(grepl("USA", gross_info))%>%
  summarise(max(gross_info))

movie_gross100<- movie_gross_1%>%
  inner_join(movie_id4, by= c("movie_id"="movie_id"))%>%
  head(100)