IMPALA-Consortium / ctas

Time Series Outliers and Anomalies
https://impala-consortium.github.io/ctas/
Other
3 stars 2 forks source link

! NAs not allowed in dist for LOF! #35

Closed erblast closed 10 months ago

erblast commented 11 months ago
library(tsoa)
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

# Load the example dataset shipped with the tsoa package.
data("tsoa_data", package = "tsoa")

# Temporarily attach each subject's site so rows can be filtered per site:
# rows from sites other than "1" are all kept, while for site "1" only rows
# whose timepoint_rank percent rank (computed within the site via `.by`) is
# <= 0.25 are kept. The helper column `site` is dropped again afterwards.
tsoa_data$data <- tsoa_data$data %>%
  left_join(
    tsoa_data$subjects %>%
      select(subject_id, site),
    by = "subject_id"
  ) %>%
  filter(
    site != "1" | percent_rank(timepoint_rank) <= 0.25,
    .by = "site"
  ) %>%
  select(-site)

# Process the study with "lof" as the only requested time-series feature.
# This is the call that raises the "NAs not allowed in dist for LOF!" error
# shown in the output below.
ls_tsoa_opt <- process_a_study(
  data = tsoa_data$data,
  subjects = tsoa_data$subjects,
  parameters = tsoa_data$parameters,
  custom_timeseries = tsoa_data$custom_timeseries,
  timeseries_features_to_calculate = c(
    "lof"
  ),
  default_minimum_timepoints_per_series = 3,
  default_minimum_subjects_per_series = 3,
  default_max_share_missing_timepoints_per_series = 0.5,
  default_generate_change_from_baseline = FALSE,
  autogenerate_timeseries = TRUE,
  optimize_sites_and_patients = TRUE
)
#> Error in `mutate()`:
#> ℹ In argument: `ts_features = list(...)`.
#> ℹ In row 19.
#> Caused by error in `lof()`:
#> ! NAs not allowed in dist for LOF!
#> Backtrace:
#>      ▆
#>   1. ├─tsoa::process_a_study(...)
#>   2. │ └─... %>% ungroup() at tsoa/R/tsoa.R:130:2
#>   3. ├─dplyr::ungroup(.)
#>   4. ├─dplyr::mutate(...)
#>   5. ├─dplyr:::mutate.data.frame(...)
#>   6. │ └─dplyr:::mutate_cols(.data, dplyr_quosures(...), by)
#>   7. │   ├─base::withCallingHandlers(...)
#>   8. │   └─dplyr:::mutate_col(dots[[i]], data, mask, new_columns)
#>   9. │     └─mask$eval_all_mutate(quo)
#>  10. │       └─dplyr (local) eval()
#>  11. ├─tsoa:::calculate_ts_features(...)
#>  12. │ └─tsoa:::calculate_lof(origvalues_distances) at tsoa/R/tsoa.R:535:4
#>  13. │   └─dbscan::lof(...) at tsoa/R/tsoa.R:431:2
#>  14. │     └─base::stop("NAs not allowed in dist for LOF!")
#>  15. └─base::.handleSimpleError(...)
#>  16.   └─dplyr (local) h(simpleError(msg, call))
#>  17.     └─rlang::abort(message, class = error_class, parent = parent, call = error_call)

Created on 2023-11-16 with reprex v2.0.2

Proposed solution: the dist object causing the error satisfies `anyNA(this_dist) == TRUE`, but the subject names can still be extracted from it. Add an `anyNA()` check to `calculate_lof()` and return NA in that case.

I am not sure, though, because I do not understand when this happens. Evidently, some of the dist objects being created contain NA values without causing this error.

erblast commented 11 months ago

@pekkatii what do you think?

erblast commented 10 months ago

It seems like this issue is fixed now? I do not see a change in the code that this can be attributed to, so let's leave it open for now and check whether the error is also absent when run on real data.

The following code now executes without error:

suppressPackageStartupMessages(library(tsoa))

# Load the example dataset shipped with the tsoa package.
data("tsoa_data", package = "tsoa")

# Temporarily attach each subject's site so rows can be filtered per site:
# rows from sites other than "1" are all kept, while for site "1" only rows
# whose timepoint_rank percent rank (computed within the site via `.by`) is
# <= 0.25 are kept. The helper column `site` is dropped again afterwards.
tsoa_data$data <- tsoa_data$data %>%
  left_join(
    tsoa_data$subjects %>%
      select(subject_id, site),
    by = "subject_id"
  ) %>%
  filter(
    site != "1" | percent_rank(timepoint_rank) <= 0.25,
    .by = "site"
  ) %>%
  select(-site)

# Process the study with "lof" as the only requested time-series feature.
# Compared with the first reprex in this thread, this call additionally passes
# custom_reference_groups and names the feature argument
# default_timeseries_features_to_calculate.
ls_tsoa_opt <- process_a_study(
  data = tsoa_data$data,
  subjects = tsoa_data$subjects,
  parameters = tsoa_data$parameters,
  custom_timeseries = tsoa_data$custom_timeseries,
  custom_reference_groups = tsoa_data$custom_reference_groups,
  default_timeseries_features_to_calculate = c(
    "lof"
  ),
  default_minimum_timepoints_per_series = 3,
  default_minimum_subjects_per_series = 3,
  default_max_share_missing_timepoints_per_series = 0.5,
  default_generate_change_from_baseline = FALSE,
  autogenerate_timeseries = TRUE,
  optimize_sites_and_patients = TRUE
)

Created on 2023-11-27 with reprex v2.0.2

pekkatii commented 10 months ago

I didn't address this in the latest version but no objections if it works nevertheless :) At least I haven't noticed the error in any of the real data runs I have made.

erblast commented 10 months ago

The current simulated test data was generated without setting the seed, so it's not reproducible. If we set the seed, the error reappears:

suppressPackageStartupMessages(library(tsoa))

## code to prepare `DATASET` dataset goes here

library(dplyr)
library(tibble)
library(purrr)
library(tidyr)

# Fix the RNG seed so that the simulated dataset below is reproducible.
set.seed(1)

# All two-letter combinations of LETTERS, pasted together ("AB", "AC", ...).
# Indexed by timepoint_rank further down to name each time point.
timepoint_names <- combn(LETTERS, 2, FUN = paste, collapse = "")

# Number of regions to simulate.
region_count <- 3

# Build the subject/time-point skeleton. At each level a Poisson count is
# drawn per row and the row is expanded into that many child rows via
# map() + unnest():
#   region -> country (lambda = 3) -> site (lambda = 4)
#          -> subject (lambda = 5) -> time point (lambda = 20)
# NOTE(review): `seq(1, .)` yields c(1, 0) when a Poisson draw is 0, so such a
# row would be expanded into two children (one with index 0) instead of none.
# `seq_len()` would avoid this, but changing it may alter the simulated data
# and therefore this reproduction -- confirm before touching.
ts <- tibble::tibble(
  region = seq(1, region_count)
) %>%
  dplyr::mutate(
    region = LETTERS[region],
    country = rpois(nrow(.), lambda = 3),
    country = purrr::map(country, ~ seq(1, .))
  ) %>%
  tidyr::unnest(country)  %>%
  dplyr::mutate(
    # Country label = region letter followed by a country letter.
    country = paste0(region, LETTERS[country]),
    site = rpois(nrow(.), lambda = 4),
    site = purrr::map(site, ~ seq(1, .))
  ) %>%
  tidyr::unnest(site)  %>%
  dplyr::mutate(
    # Site label = country label followed by a site letter.
    site = paste0(country, LETTERS[site]),
    subject_id = rpois(nrow(.), lambda = 5),
    subject_id = purrr::map(subject_id, ~ seq(1, .))
  ) %>%
  tidyr::unnest(subject_id)  %>%
  dplyr::mutate(
    # Replace the within-site index by the row number (one row per subject at
    # this point), stored as character.
    subject_id = row_number(),
    subject_id = as.character(subject_id)
  ) %>%
  dplyr::mutate(
    timepoint_rank = rpois(nrow(.), lambda = 20),
    timepoint_rank = purrr::map(timepoint_rank, ~ seq(1, .))
  ) %>%
  tidyr::unnest(timepoint_rank) %>%
  dplyr::mutate(
    # Look up the two-letter time point name by its rank.
    timepoint_1_name = timepoint_names[timepoint_rank],
  )

# Simulate measurements of one parameter for every row of `ts`.
#
# Arguments:
#   ts   - data frame with (at least) a `subject_id` column; one row per
#          subject and time point.
#   name - value written to the `parameter_id` column of the output.
#
# Returns `ts` with two extra columns: `result` (a normal draw using the
# subject-specific mean and standard deviation, set to NA where a uniform
# draw is <= 0.3) and `parameter_id`.
get_parameter <- function(ts, name) {
  # One row per subject, each with its own mean (normal around 30, sd 5) and
  # standard deviation (uniform between 1 and 10). The means are drawn before
  # the standard deviations.
  subject_ids <- distinct(ts, subject_id)
  subject_dist <- mutate(
    subject_ids,
    avg = rnorm(nrow(subject_ids), 30, 5),
    sd = runif(nrow(subject_ids), min = 1, max = 10)
  )

  # Attach the subject-level distribution parameters to every observation.
  joined <- left_join(ts, subject_dist, by = "subject_id")

  # Draw one value per row, then blank out values where a uniform draw is
  # <= 0.3, and label the rows with the parameter id.
  simulated <- mutate(
    joined,
    result = purrr::map2_dbl(avg, sd, function(mu, sigma) rnorm(1, mu, sigma)),
    result = ifelse(runif(nrow(joined), 0, 1) <= 0.3, NA, result),
    parameter_id = name
  )

  # The distribution parameters were only needed for the simulation.
  select(simulated, -avg, -sd)
}

# Simulate two parameters on the same subject/time-point skeleton.
param1 <- get_parameter(ts, "param1")
param2 <- get_parameter(ts, "param2")

# Stack both parameters into one long table, add a constant second time point
# name and an all-NA baseline column, and drop the site/country/region
# columns (they are kept per subject in `subjects` below).
data <- bind_rows(
  param1,
  param2
) %>%
  mutate(
    timepoint_2_name = "timepoint name 2",
    baseline = NA
  ) %>%
  select(- site, - country, -region)

# One row per parameter with descriptive metadata. The per-parameter settings
# are left NA -- presumably so that the defaults passed to process_a_study()
# apply; TODO confirm against the package documentation.
parameters <- data %>%
  distinct(parameter_id) %>%
  mutate(
    parameter_name = parameter_id,
    parameter_category_1 = "category 1",
    parameter_category_2 = "category 2",
    parameter_category_3 = "category 3",
    time_point_count_min = NA,
    subject_count_min = NA,
    max_share_missing = NA,
    generate_change_from_baseline = NA,
    timeseries_features_to_calculate = NA,
    use_only_custom_timeseries = FALSE
  )

# One row per subject with its site, country and region.
subjects <- ts %>%
  distinct(subject_id, site, country, region)

# Empty (zero-row) table of custom time series with character columns.
custom_timeseries <- tibble(
  timeseries_id = character(),
  parameter_id = character(),
  timepoint_combo = character()
)

# Empty (zero-row) table of custom reference groups with character columns.
custom_reference_groups <- tibble(
  parameter_id = character(),
  feature = character(),
  ref_group = character()
)

# Bundle all tables into one list, using the element names that the
# process_a_study() call in this script reads from.
tsoa_data <- list(
  data = data,
  parameters = parameters,
  subjects = subjects,
  custom_timeseries = custom_timeseries,
  custom_reference_groups = custom_reference_groups
)

# Temporarily attach each subject's site so rows can be filtered per site:
# rows from sites other than "1" are all kept, while for site "1" only rows
# whose timepoint_rank percent rank (computed within the site via `.by`) is
# <= 0.25 are kept. The helper column `site` is dropped again afterwards.
# NOTE(review): the site labels simulated above are letter codes built with
# paste0(country, LETTERS[site]), so no site equals "1" and this filter
# appears to keep every row here -- confirm that this is intended.
tsoa_data$data <- tsoa_data$data %>%
  left_join(
    tsoa_data$subjects %>%
      select(subject_id, site),
    by = "subject_id"
  ) %>%
  filter(
    site != "1" | percent_rank(timepoint_rank) <= 0.25,
    .by = "site"
  ) %>%
  select(-site)

# Process the seeded, simulated study with "lof" as the only requested
# time-series feature. With set.seed(1) this call raises the
# "NAs not allowed in dist for LOF!" error shown in the output below.
ls_tsoa_opt <- process_a_study(
  data = tsoa_data$data,
  subjects = tsoa_data$subjects,
  parameters = tsoa_data$parameters,
  custom_timeseries = tsoa_data$custom_timeseries,
  custom_reference_groups = tsoa_data$custom_reference_groups,
  default_timeseries_features_to_calculate = c(
    "lof"
  ),
  default_minimum_timepoints_per_series = 3,
  default_minimum_subjects_per_series = 3,
  default_max_share_missing_timepoints_per_series = 0.5,
  default_generate_change_from_baseline = FALSE,
  autogenerate_timeseries = TRUE,
  optimize_sites_and_patients = TRUE
)
#> Error in `mutate()`:
#> ℹ In argument: `ts_features = list(...)`.
#> ℹ In row 6.
#> Caused by error in `lof()`:
#> ! NAs not allowed in dist for LOF!
#> Backtrace:
#>      ▆
#>   1. ├─tsoa::process_a_study(...)
#>   2. │ └─... %>% ungroup() at tsoa/R/tsoa.R:134:2
#>   3. ├─dplyr::ungroup(.)
#>   4. ├─dplyr::mutate(...)
#>   5. ├─dplyr:::mutate.data.frame(...)
#>   6. │ └─dplyr:::mutate_cols(.data, dplyr_quosures(...), by)
#>   7. │   ├─base::withCallingHandlers(...)
#>   8. │   └─dplyr:::mutate_col(dots[[i]], data, mask, new_columns)
#>   9. │     └─mask$eval_all_mutate(quo)
#>  10. │       └─dplyr (local) eval()
#>  11. ├─tsoa:::calculate_ts_features(...)
#>  12. │ └─tsoa:::calculate_lof(origvalues_distances) at tsoa/R/tsoa.R:567:4
#>  13. │   └─dbscan::lof(...) at tsoa/R/tsoa.R:461:2
#>  14. │     └─base::stop("NAs not allowed in dist for LOF!")
#>  15. └─base::.handleSimpleError(...)
#>  16.   └─dplyr (local) h(simpleError(msg, call))
#>  17.     └─rlang::abort(message, class = error_class, parent = parent, call = error_call)

Created on 2023-11-30 with reprex v2.0.2

pekkatii commented 10 months ago

Fixed with the latest update by Björn