jkhamilton / MP3

0 stars 0 forks source link

removing duplicates #4

Open zrubinstein opened 7 years ago

zrubinstein commented 7 years ago

df <- db %>% dbGetQuery("SELECT DISTINCT t.id, t.title, t.kind_id, t.production_year, ml.link_type_id FROM imdb.title AS t JOIN imdb.movie_link AS ml ON ml.movie_id=t.id WHERE t.kind_id=1 AND ml.link_type_id=1 OR ml.link_type_id=3")

jkhamilton commented 7 years ago

sequels <- df %>% group_by(production_year) %>% filter(link_type_id == 2) %>% summarize(num_sequels = n_distinct(title, na.rm = TRUE))

remakes <- df %>% group_by(production_year) %>% filter(link_type_id == 4) %>% summarize(num_remakes = n_distinct(title, na.rm = TRUE))

total <- db %>% dbGetQuery("SELECT t.production_year AS year2, count(t.id) AS num_films FROM title AS t WHERE t.kind_id = 1 GROUP BY t.production_year;")

all <- inner_join(sequels, remakes, by = "production_year") %>% inner_join(total, by = "production_year") %>% mutate(percent_sequels = num_sequels/num_films100, percent_remakes = num_remakes/num_films100)