hafen / trelliscopejs

TrelliscopeJS R Package
https://hafen.github.io/trelliscopejs
Other
263 stars 37 forks source link

Error in hist.default(x, breaks = brks, plot = FALSE) : some 'x' not counted; maybe 'breaks' do not span range of 'x' #64

Closed razsultana closed 5 years ago

razsultana commented 6 years ago

Hi! When running trelliscope(), creating plots with a factor variable on the x axis and a continuous variable on the y axis, I am receiving the error similar, but not identical with the one in issue #55 :

Error in hist.default(x, breaks = brks, plot = FALSE) : 
 some 'x' not counted; maybe 'breaks' do not span range of 'x'

I have posted below a reproducible example, showing that the function works fine for some subsets of the data, but the error is not clearly related to any particular row in the data.

traceback() points to the same function as issue #55 , get_cog_distributions:

> traceback()
17: stop("some 'x' not counted; maybe 'breaks' do not span range of 'x'")
16: hist.default(x, breaks = brks, plot = FALSE)
15: hist(x, breaks = brks, plot = FALSE)
14: FUN(X[[i]], ...)
13: lapply(cogdf, function(x) {
        type <- attr(x, "cog_attrs")$type
        if (inherits(x, c("Date", "POSIXct"))) {
            type <- "factor"
            x <- as.character(x)
        }
        res <- list(type = type, dist = NULL)
        if (type == "factor") {
            res$has_dist <- FALSE
            if (length(unique(x)) <= cat_cutoff) {
                x[is.na(x)] <- "NA"
                tmp <- table(x)
                tmp <- tmp[order(tmp, decreasing = TRUE)]
                res$dist <- as.list(tmp)
                res$has_dist <- TRUE
                res$max <- as.numeric(max(tmp))
            }
        }
        else if (type == "numeric") {
            skw <- DistributionUtils::skewness(x, na.rm = TRUE)
            hst <- graphics::hist(x, plot = FALSE)
            res <- list(type = type, dist = list(raw = list(breaks = hst$breaks, 
                freq = hst$counts)))
            res$log_default <- FALSE
            if (!is.nan(skw) && skw > 1.5 && all(x >= 0, na.rm = TRUE) && 
                length(x[x > 0]) > 1) {
                x <- x[x > 0]
                x2 <- log10(x)
                rng <- range(x2, na.rm = TRUE)
                brks <- 10^seq(rng[1], rng[2], length = grDevices::nclass.Sturges(x))
                lhst <- hist(x, breaks = brks, plot = FALSE)
                res$dist$log <- list(breaks = lhst$breaks, freq = lhst$counts)
                res$log_default <- TRUE
            }
        }
        res
    })
12: get_cog_distributions(cogdf)
11: write_display_obj(cog_df, panel_example = panels[[1]], panel_img_col = panel_img_col, 
        base_path = params$path, id = params$id, name = params$name, 
        group = params$group, desc = desc, height = height, width = width, 
        md_desc = md_desc, state = params$state, jsonp = params$jsonp, 
        self_contained = params$self_contained, thumb = params$thumb, 
        pb = pb)
10: trelliscope.data.frame(., name = "counts_vs_groups", panel_col = "panel", 
        nrow = 2, ncol = 5, path = "mydisplay", state = list(sort = list(sort_spec("padj")), 
            labels = c("symbol", "description", "padj")))
9: trelliscope(., name = "counts_vs_groups", panel_col = "panel", 
       nrow = 2, ncol = 5, path = "mydisplay", state = list(sort = list(sort_spec("padj")), 
           labels = c("symbol", "description", "padj")))
8: function_list[[k]](value)
7: withVisible(function_list[[k]](value))
6: freduce(value, `_function_list`)
5: `_fseq`(`_lhs`)
4: eval(quote(`_fseq`(`_lhs`)), env, env)
3: eval(quote(`_fseq`(`_lhs`)), env, env)
2: withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
1: by_target_id %>% trelliscope(name = "counts_vs_groups", panel_col = "panel", 
       nrow = 2, ncol = 5, path = "mydisplay", state = list(sort = list(sort_spec("padj")), 
           labels = c("symbol", "description", "padj")))

Here is the reproducible example, with cases that work and cases that don't:

library('tidyverse')
library('rbokeh')
library('trelliscopejs')

obs <- read_csv(url('http://jabsom.from-hi.com/core/proj-YdT0i1NZ5gxJwNDB/test_trajectories.csv'))
#> Parsed with column specification:
#> cols(
#>   grp = col_character(),
#>   target_id = col_character(),
#>   mean_est_counts = col_double(),
#>   symbol = col_character(),
#>   description = col_character(),
#>   padj = col_double()
#> )
obs <- obs %>% mutate(grp = fct_recode(grp, "1-FetalControl" = "FetalControl", "2-ChildAutism" = "ChildAutism", "3-ChildControl" = "ChildControl"))
by_target_id <- obs %>% 
  ungroup() %>%
  group_by(target_id, symbol, description, padj) %>%
  nest()

# add in a plot column with map_plot

by_target_id <- by_target_id %>% mutate(
  panel = map_plot(data,
                   ~ figure(ylim = c(min(log(obs$mean_est_counts)) - 1, max(log(obs$mean_est_counts)) + 1), 
                            xlab = "Group", ylab = "log(counts)", tools = NULL) %>%
                     ly_points(grp, log(mean_est_counts), data = .x, 
                               hover = data_frame(Group = .x$grp,
                                                  Counts = .x$mean_est_counts)) %>% 
                     ly_lines(grp, log(mean_est_counts), data = .x)
                   )
  )

# plot it - fails with
## Error in hist.default(x, breaks = brks, plot = FALSE) : 
## some 'x' not counted; maybe 'breaks' do not span range of 'x'
try({ 
  t1 <- by_target_id %>%
    trelliscope(name = "counts_vs_groups", panel_col = 'panel', nrow = 2, ncol = 5, path = 'mydisplay', 
                state = list(sort = list(sort_spec("padj")), 
                             labels = c('symbol', 'description', 'padj')) 
    )
})

# plot excluding the last 3 rows - works!
t2 <- by_target_id %>% head(122) %>%
  trelliscope(name = "counts_vs_groups", panel_col = 'panel', nrow = 2, ncol = 5, path = 'mydisplay', 
              state = list(sort = list(sort_spec("padj")), 
                           labels = c('symbol', 'description', 'padj')) 
  )

# plot excluding the last 2 rows - fails with
## Error in hist.default(x, breaks = brks, plot = FALSE) : 
## some 'x' not counted; maybe 'breaks' do not span range of 'x'
try({
  t3 <- by_target_id %>% head(122) %>%
    trelliscope(name = "counts_vs_groups", panel_col = 'panel', nrow = 2, ncol = 5, path = 'mydisplay', 
                state = list(sort = list(sort_spec("padj")), 
                             labels = c('symbol', 'description', 'padj')) 
    )
})

# plot the last 3 rows - works!
t4 <- by_target_id %>% tail(3) %>%
  trelliscope(name = "counts_vs_groups", panel_col = 'panel', nrow = 2, ncol = 5, path = 'mydisplay', 
              state = list(sort = list(sort_spec("padj")), 
                           labels = c('symbol', 'description', 'padj')) 
  )

# plot the last 83 rows - works!
t5 <- by_target_id %>% tail(83) %>%
  trelliscope(name = "counts_vs_groups", panel_col = 'panel', nrow = 2, ncol = 5, path = 'mydisplay', 
              state = list(sort = list(sort_spec("padj")), 
                           labels = c('symbol', 'description', 'padj')) 
  )

# plot the last 84 rows - fails with
## Error in hist.default(x, breaks = brks, plot = FALSE) : 
## some 'x' not counted; maybe 'breaks' do not span range of 'x'
try({
  t6 <- by_target_id %>% tail(84) %>%
    trelliscope(name = "counts_vs_groups", panel_col = 'panel', nrow = 2, ncol = 5, path = 'mydisplay', 
                state = list(sort = list(sort_spec("padj")), 
                             labels = c('symbol', 'description', 'padj')) 
    )
})

Created on 2018-07-19 by the reprex package (v0.2.0).

Session info ``` r devtools::session_info() #> Session info ------------------------------------------------------------- #> setting value #> version R version 3.4.4 (2018-03-15) #> system x86_64, linux-gnu #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> tz Pacific/Honolulu #> date 2018-07-19 #> Packages ----------------------------------------------------------------- #> package * version date #> assertthat 0.2.0 2017-04-11 #> autocogs 0.0.1 2018-07-19 #> backports 1.1.2 2017-12-13 #> base * 3.4.4 2018-03-15 #> base64enc 0.1-3 2015-07-28 #> bindr 0.1.1 2018-03-13 #> bindrcpp * 0.2.2 2018-03-29 #> broom 0.5.0 2018-07-17 #> cellranger 1.1.0 2016-07-27 #> checkmate 1.8.5 2017-10-24 #> cli 1.0.0 2017-11-05 #> codetools 0.2-15 2016-10-05 #> colorspace 1.3-2 2016-12-14 #> compiler 3.4.4 2018-03-15 #> crayon 1.3.4 2017-09-16 #> datasets * 3.4.4 2018-03-15 #> devtools 1.13.6 2018-06-27 #> digest 0.6.15 2018-01-28 #> DistributionUtils 0.5-1 2015-01-05 #> dplyr * 0.7.6 2018-06-29 #> evaluate 0.10.1 2017-06-24 #> forcats * 0.3.0 2018-02-19 #> ggplot2 * 3.0.0 2018-07-03 #> gistr 0.4.2 2018-06-28 #> glue 1.3.0 2018-07-17 #> graphics * 3.4.4 2018-03-15 #> grDevices * 3.4.4 2018-03-15 #> grid 3.4.4 2018-03-15 #> gtable 0.2.0 2016-02-26 #> haven 1.1.2 2018-06-27 #> hexbin 1.27.2 2018-01-15 #> hms 0.4.2 2018-03-10 #> htmltools 0.3.6 2017-04-28 #> htmlwidgets 1.2 2018-04-19 #> httr 1.3.1 2017-08-20 #> jsonlite 1.5 2017-06-01 #> knitr 1.20 2018-02-20 #> labeling 0.3 2014-08-23 #> lattice 0.20-35 2017-03-25 #> lazyeval 0.2.1 2017-10-29 #> lubridate 1.7.4 2018-04-11 #> magrittr 1.5 2014-11-22 #> maps 3.3.0 2018-04-03 #> mclust 5.4.1 2018-06-27 #> memoise 1.1.0 2017-04-21 #> methods * 3.4.4 2018-03-15 #> modelr 0.1.2 2018-05-11 #> munsell 0.5.0 2018-06-12 #> nlme 3.1-137 2018-04-07 #> pillar 1.3.0 2018-07-14 #> pkgconfig 2.0.1 2017-03-21 #> plyr 1.8.4 2016-06-08 #> prettyunits 1.0.2 2015-07-13 #> progress 1.2.0 2018-06-14 #> pryr 0.1.4 2018-02-18 #> purrr * 0.2.5 2018-05-29 #> R6 2.2.2 2017-06-17 #> rbokeh * 0.5.1 2018-07-20 #> Rcpp 0.12.17 2018-05-18 #> readr * 1.1.1 2017-05-16 #> readxl 1.1.0 2018-04-20 #> rlang 0.2.1 2018-05-30 #> rmarkdown 1.10 2018-06-11 #> rprojroot 1.3-2 2018-01-03 #> rvest 0.3.2 2016-06-17 #> scales 0.5.0 2017-08-24 #> stats * 3.4.4 2018-03-15 #> stringi 1.2.3 2018-06-12 #> stringr * 1.3.1 2018-05-10 #> tibble * 1.4.2 2018-01-22 #> tidyr * 0.8.1 2018-05-18 #> tidyselect 0.2.4 2018-02-26 #> tidyverse * 1.2.1 2017-11-14 #> tools 3.4.4 2018-03-15 #> trelliscopejs * 0.1.13 2018-07-20 #> utils * 3.4.4 2018-03-15 #> webshot 0.5.0 2017-11-29 #> withr 2.1.2 2018-03-15 #> xml2 1.2.0 2018-01-24 #> yaml 2.1.19 2018-05-01 #> source #> CRAN (R 3.4.3) reproducible example #> Github (hafen/autocogs@5159173) #> CRAN (R 3.4.0) #> local #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> CRAN (R 3.4.0) #> CRAN (R 3.4.0) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> local #> CRAN (R 3.4.0) #> local #> cran (@1.13.6) #> CRAN (R 3.4.0) #> cran (@0.5-1) #> cran (@0.7.6) #> CRAN (R 3.4.0) #> CRAN (R 3.4.0) #> CRAN (R 3.4.4) #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> local #> local #> local #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> cran (@1.2) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> CRAN (R 3.4.0) #> local #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.0) #> CRAN (R 3.4.4) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> Github (hafen/rbokeh@78e38a4) #> CRAN (R 3.4.3) #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.4) #> CRAN (R 3.4.0) #> CRAN (R 3.4.0) #> CRAN (R 3.4.0) #> local #> cran (@1.2.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.3) #> CRAN (R 3.4.0) #> CRAN (R 3.4.0) #> local #> Github (hafen/trelliscopejs@4be901e) #> local #> CRAN (R 3.4.0) #> CRAN (R 3.4.3) #> CRAN (R 3.4.0) #> cran (@2.1.19) ```

Any thoughts ? Thank you, Razvan

razsultana commented 6 years ago

It looks like the offending code is in the get_cog_distributions function in helpers.R when the cognostic variable is numeric:

x <- x[x > 0]
x2 <- log10(x)
rng <- range(x2, na.rm = TRUE)
brks <- 10^seq(rng[1], rng[2], length = grDevices::nclass.Sturges(x))
lhst <- hist(x, breaks = brks, plot = FALSE)

In my example the problem is with the cognostic variable padj which contains p-values. I think the problem is the rounding error generated by taking log10 and then exponentiating back. In some situations, you get the first break that is bigger than the minimum of the original variable and what is mind boggling is that unless you change the default number of digits shown for floating point numbers you can't tell they are different! - see here:

> x <- by_target_id$padj
> x <- x[x > 0]
> x2 <- log10(x)
> rng <- range(x2, na.rm = TRUE)
> brks <- 10^seq(rng[1], rng[2], length = grDevices::nclass.Sturges(x))
> lhst <- hist(x, breaks = brks, plot = FALSE)
Error in hist.default(x, breaks = brks, plot = FALSE) : 
  some 'x' not counted; maybe 'breaks' do not span range of 'x'
> 
> min(x)
[1] 4.895243e-56
> brks[1]
[1] 4.895243e-56
> options(digits = 15)
> min(x)
[1] 4.89524328779376e-56
> brks[1]
[1] 4.89524328779379e-56

This can be easily (albeit not so elegantly) fixed by adding these 2 lines to the code:

brks[1] <- min(min(x), brks[1])
brks[length(brks)] <- max(max(x), brks[length(brks)])

BTW, this is a very common problem when using floating point numbers, so common that Oracle included in an appendix of their numerical computation guide this 1991 paper: https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html

Do you want me to create a PR adding the above 2 lines? This bug has been hounding me for a month now.