DiskFrame / disk.frame

Fast Disk-Based Parallelized Data Manipulation Framework for Larger-than-RAM Data
https://diskframe.com
Other
594 stars 40 forks source link

debug this! bloomfilter feature #247

Open xiaodaigh opened 4 years ago

xiaodaigh commented 4 years ago
# Reproduction script: bloom filter on an integer key column (`id`).
library(disk.frame)
# 1000 rows: `id` is drawn with replacement from 1:100, `values` is uniform random.
# NOTE(review): no set.seed() is called, so the generated data differ between runs.
a = data.frame(id = sample(1:100, 1000, replace=TRUE), values = runif(1000))
# Convert the in-memory data.frame to a disk.frame, requesting 6 chunks.
adf = as.disk.frame(a, nchunks = 6)

# Add a random 1-or-2 column and shard on the (id, rand_chunk) pair.
# NOTE(review): presumably this spreads each `id` over up to two shards -- confirm against shard().
adf_sharded = adf %>% 
  mutate(rand_chunk = sample(1:2,n(), replace=TRUE)) %>% # new column used as a second sharding key (sub-shards)
  shard(shardby =  c("id", "rand_chunk")) 

# Build a bloom filter for the `id` column and keep the returned object.
adf_with_bloomfilter = adf_sharded %>% 
  make_bloomfilter("id")

# Ask the bloom filter about the value 1 in `id`.
# NOTE(review): presumably returns the chunks that may contain id == 1 -- confirm.
adf_with_bloomfilter %>% 
  bf_likely_in_chunks("id", 1)

# Apply the bloom filter for id == 1, then collect the result.
adf_with_bloomfilter %>% 
  use_bloom_filter("id", 1) %>% 
  collect

# Reproduction script: bloom filter on a character key column (`id3`).
# `a`, `adf` and `adf_sharded` are reassigned here, replacing any earlier values.
# 1000 rows: `id3` is drawn with replacement from `letters`, `values` is uniform random.
a = data.frame(id3 = sample(letters, 1000, replace=TRUE), values = runif(1000))
# Convert to a disk.frame, requesting 6 chunks.
adf = as.disk.frame(a, nchunks = 6)

# Shard on `id3` only; the two-key (id3, rand_chunk) variant is left commented out below.
adf_sharded = adf %>% 
  #mutate(rand_chunk = sample(1:2,n(), replace=TRUE)) %>% # create a new column to sharding into sub-shards
  #shard(shardby =  c("id3", "rand_chunk"))
  shard(shardby =  c("id3"))

# Build a bloom filter for the `id3` column and keep the returned object.
df = adf_sharded %>% 
  make_bloomfilter("id3")

# Ask the bloom filter about the value "a" in `id3`.
# NOTE(review): presumably returns the chunks that may contain id3 == "a" -- confirm.
df %>% 
  bf_likely_in_chunks("id3", "a")

# Apply the bloom filter for id3 == "a", then collect the result.
df %>% 
  use_bloom_filter("id3", "a") %>% 
  collect
xiaodaigh commented 4 years ago

This works:


# Variant reported as working: nycflights13::flights converted to a
# disk.frame sharded by the single key `carrier`.
df <- nycflights13::flights %>%
  as.disk.frame(shardby = c("carrier"))

# Build the bloom filter on `carrier`; the return value is not captured.
# NOTE(review): presumably the filter is stored with `df` on disk -- confirm.
make_bloomfilter(df, "carrier")

# testthat is never attached in this snippet, so the expectation is
# namespace-qualified (same style as `nycflights13::flights` above);
# an unqualified expect_true() fails with "could not find function".
# Checks that exactly one chunk is reported for carrier "UA".
testthat::expect_true(length(bf_likely_in_chunks(df, "carrier", "UA")) == 1)

# Apply the bloom filter for carrier == "UA", then collect the result.
use_bloom_filter(df, "carrier", "UA") %>%
  collect()