richfitz / storr

:package: Object cacher for R
http://richfitz.github.io/storr
Other
116 stars 10 forks source link

Some hashes change when importing one storr into another. #93

Open wlandau opened 5 years ago

wlandau commented 5 years ago

Portability and collaboration in drake would greatly improve if we could convert caches among different storr backends (ref: https://github.com/ropensci/unconf18/issues/30). Currently, I am having trouble with $import(). It appears to change the values of some of the hashes.

# Generate a cache and a flat log of hashes.
library(drake) # https://github.com/ropensci/drake/commit/6f0012a17c05fa16981d7e027240c40862790fa9
library(storr) # https://github.com/richfitz/storr/commit/27508231b3c061afed9bb243d4422f81291f2e94
load_mtcars_example()
make(my_plan, verbose = FALSE)
cache1 <- storr_rds(".drake")

# Try to convert the RDS cache into a DBI cache.
mydb <- DBI::dbConnect(RSQLite::SQLite(), "database-file.sqlite")
cache2 <- storr::storr_dbi(
  tbl_data = "datatable",
  tbl_keys = "keystable",
  con = mydb,
  hash_algorithm = cache1$driver$hash_algorithm
)
cache2$import(cache1, namespace = cache1$list_namespaces())

# Some targets have different hashes.
cache1$get_hash("regression1_large", namespace = "kernels")
#> [1] "e1501ed9d62b846e"
cache2$get_hash("regression1_large", namespace = "kernels")
#> [1] "378d49d1626bdd6f"

# Show more differences.
log1 <- drake_cache_log(cache = cache1)
log2 <- drake_cache_log(cache = cache2)
diff <- which(log1$hash != log2$hash)
log1[diff, ]
#> # A tibble: 2 x 3
#>   hash             type   name             
#>   <chr>            <chr>  <chr>            
#> 1 e1501ed9d62b846e target regression1_large
#> 2 2a400716e73eac8f target regression1_small
log2[diff, ]
#> # A tibble: 2 x 3
#>   hash             type   name             
#>   <chr>            <chr>  <chr>            
#> 1 378d49d1626bdd6f target regression1_large
#> 2 8c0111e4bce91e86 target regression1_small

Created on 2018-12-12 by the reprex package (v0.2.1)

richfitz commented 5 years ago

A shorter reproduction:

> library(drake)
> library(storr)
> load_mtcars_example()
> make(my_plan, verbose = FALSE)
> cache1 <- storr_rds(".drake")
> cache1$get_hash("regression1_large", namespace = "kernels")
[1] "e1501ed9d62b846e"
> cache1$hash_object(cache1$get("regression1_large", namespace = "kernels"))
[1] "378d49d1626bdd6f"

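and, storing the same object in a fresh storr (here `obj` is presumably the value retrieved above, i.e. `obj <- cache1$get("regression1_large", namespace = "kernels")`):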

> cache2 <- storr::storr_rds(tempfile(), hash_algorithm = "xxhash64")
> cache2$set("obj", obj)
> cache2$get_hash("obj")
[1] "378d49d1626bdd6f"

richfitz commented 5 years ago

It looks an awful lot like this is an environment serialisation issue:

h <- setdiff(cache1$list_hashes(), cache2$list_hashes())
changed <- setNames(lapply(h, cache1$get_value), h)

which shows

$`031df699ec5b0faf`
<storr>
  Public:
    archive_export: function (path, names = NULL, namespace = NULL) 
    archive_import: function (path, names = NULL, namespace = NULL) 
    check: function (full = TRUE, quiet = FALSE, progress = !quiet) 
    clear: function (namespace = self$default_namespace) 
    clone: function (deep = FALSE) 
    default_namespace: objects
    del: function (key, namespace = self$default_namespace) 
    destroy: function () 
    driver: driver_rds, R6
    duplicate: function (key_src, key_dest, namespace = self$default_namespace, 
    envir: environment
    exists: function (key, namespace = self$default_namespace) 
    exists_object: function (hash) 
    export: function (dest, list = NULL, namespace = self$default_namespace, 
    fill: function (key, value, namespace = self$default_namespace, use_cache = TRUE) 
    flush_cache: function () 
    gc: function () 
    get: function (key, namespace = self$default_namespace, use_cache = TRUE) 
    get_hash: function (key, namespace = self$default_namespace) 
    get_value: function (hash, use_cache = TRUE) 
    hash_object: function (object) 
    hash_raw: function (x) 
    import: function (src, list = NULL, namespace = self$default_namespace, 
    index_export: function (namespace = NULL) 
    index_import: function (index) 
    initialize: function (driver, default_namespace) 
    list: function (namespace = self$default_namespace) 
    list_hashes: function () 
    list_namespaces: function () 
    mget: function (key, namespace = self$default_namespace, use_cache = TRUE, 
    mget_hash: function (key, namespace = self$default_namespace) 
    mget_value: function (hash, use_cache = TRUE, missing = NULL) 
    mset: function (key, value, namespace = self$default_namespace, use_cache = TRUE) 
    mset_by_value: function (value, namespace = self$default_namespace, use_cache = TRUE) 
    mset_value: function (values, use_cache = TRUE) 
    repair: function (storr_check_results = NULL, quiet = FALSE, ..., force = FALSE) 
    serialize_object: function (object) 
    set: function (key, value, namespace = self$default_namespace, use_cache = TRUE) 
    set_by_value: function (value, namespace = self$default_namespace, use_cache = TRUE) 
    set_value: function (value, use_cache = TRUE) 
    traits: list

$`2a400716e73eac8f`

Call:
lm(formula = y ~ +x, data = d)

Coefficients:
(Intercept)            x  
     36.663       -5.008  

$`410f8f336035ee86`
function (i) 
0.01
<environment: 0x51aac60>

$`5b6556b5a3ba478b`
IGRAPH b492388 DN-- 20 23 -- 
+ attr: name (v/c)
+ edges from b492388 (vertex names):
 [1] random_rows           ->simulate         
 [2] reg1                  ->regression1_small
 [3] reg1                  ->regression1_large
 [4] reg2                  ->regression2_small
 [5] reg2                  ->regression2_large
 [6] "report.Rmd"          ->report           
 [7] coef_regression2_small->report           
 [8] large                 ->report           
+ ... omitted several edges

$`901f3af361be4d69`
[1] ‘6.2.1’

$e1501ed9d62b846e

Call:
lm(formula = y ~ +x, data = d)

Coefficients:
(Intercept)            x  
     36.291       -5.108  

$f81d15390b8fc36e
IGRAPH b492388 DN-- 20 23 -- 
+ attr: name (v/c)
+ edges from b492388 (vertex names):
 [1] random_rows           ->simulate         
 [2] reg1                  ->regression1_small
 [3] reg1                  ->regression1_large
 [4] reg2                  ->regression2_small
 [5] reg2                  ->regression2_large
 [6] "report.Rmd"          ->report           
 [7] coef_regression2_small->report           
 [8] large                 ->report           
+ ... omitted several edges

All of these objects contain environments, with the possible exception of the version number (`901f3af361be4d69`).

richfitz commented 5 years ago

Ah, the version number is simply missing from the second cache, for reasons that also look suspicious.