juliasilge / widyr

Widen, process, and re-tidy a dataset
http://juliasilge.github.io/widyr/
Other
327 stars 29 forks source link

pairwise_similarity has an undocumented minimum #23

Open DataStrategist opened 5 years ago

DataStrategist commented 5 years ago

A bit of a long-winded reprex, but:

library(gutenbergr)
#> Warning: package 'gutenbergr' was built under R version 3.5.3
library(tidytext)
library(tidyverse)
#> -- Attaching packages ------------------------------------------------------------------- tidyverse 1.2.1 --
#> v ggplot2 3.1.0       v purrr   0.3.1  
#> v tibble  2.0.1       v dplyr   0.8.0.1
#> v tidyr   0.8.3       v stringr 1.3.1  
#> v readr   1.3.1       v forcats 0.3.0
#> -- Conflicts ---------------------------------------------------------------------- tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()
library(widyr)
#> Warning: package 'widyr' was built under R version 3.5.3

# Per-word counts for "Treasure Island", tagged with source "T.I.".
TI <- gutenberg_works(title == "Treasure Island") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "T.I.")
#> Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
#> Using mirror http://aleph.gutenberg.org

# Per-word counts for "The Wonderful Wizard of Oz", tagged with source "Wiz".
Wi <- gutenberg_works(title == "The Wonderful Wizard of Oz") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "Wiz")

# Per-word counts for "The United States Constitution", tagged with
# source "Con".
Co <- gutenberg_works(title == "The United States Constitution") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "Con")

# Per-word counts for "John F. Kennedy's Inaugural Address", tagged with
# source "JFK".
JFK <- gutenberg_works(title == "John F. Kennedy's Inaugural Address") %>%
  pull(gutenberg_id) %>%
  gutenberg_download() %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  mutate(source = "JFK")

## Stack the four per-book word counts into one long table
df <- list(TI, Wi, Co, JFK) %>%
  bind_rows()

## Add tf-idf per (word, source), then score each pair of sources on it
df %>%
  bind_tf_idf(word, source, n) %>%
  arrange(desc(tf_idf)) %>%
  pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE)
#> # A tibble: 6 x 3
#>   item1 item2 similarity
#>   <chr> <chr>      <dbl>
#> 1 Wiz   T.I.      0.349 
#> 2 Con   JFK       0.0513
#> 3 T.I.  JFK       0.0483
#> 4 Con   T.I.      0.0314
#> 5 Wiz   JFK       0.0301
#> 6 Con   Wiz       0.0155

## So far so good. Now, which book is most likely to say "I love you"?
## Build a synthetic "TEST" source from that phrase repeated 10 times,
## tokenised and counted per word.
Love <- tibble(word = rep("I love you", 10), source = "TEST") %>%
  unnest_tokens(word, word) %>%
  count(source, word, sort = TRUE)

## With all four books alongside TEST, the comparison works:
Love %>%
  bind_rows(df) %>%
  bind_tf_idf(word, source, n) %>%
  arrange(desc(tf_idf)) %>%
  pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE) %>%
  filter(item1 == "TEST") %>%
  select(-item1)
#> # A tibble: 4 x 2
#>   item2 similarity
#>   <chr>      <dbl>
#> 1 T.I.      0.0654
#> 2 Wiz       0.0526
#> 3 JFK       0.0267
#> 4 Con       0

## But with only two books alongside TEST, the same pipeline errors out:
df2 <- list(TI, Wi) %>%
  bind_rows()

Love %>%
  bind_rows(df2) %>%
  bind_tf_idf(word, source, n) %>%
  arrange(desc(tf_idf)) %>%
  pairwise_similarity(source, word, tf_idf, upper = FALSE, sort = TRUE) %>%
  filter(item1 == "TEST") %>%
  select(-item1)
#> Error in `colnames<-`(`*tmp*`, value = c("item1", "item2", "value")): attempt to set 'colnames' on an object with less than two dimensions

## How come?

The limit should probably be documented, no?