Closed erblast closed 10 months ago
@pekkatii what do you think?
It seems this issue is fixed now, although I do not see a change in the code that this can be attributed to. So let's leave it open for now so we can check whether this error is also absent when run on real data.
The following code now executes without error:
# Reprex: run the study pipeline on the example data bundled with tsoa after
# truncating the time series of site "1".
suppressPackageStartupMessages(library(tsoa))
data("tsoa_data", package = "tsoa")

# Attach each subject's site, then filter per site: rows of every other site
# are kept as they are, rows of site "1" are kept only when their timepoint
# rank lies in the lowest quarter (percent rank <= 0.25). The helper column
# `site` is dropped again afterwards.
tsoa_data$data <- tsoa_data$data %>%
  left_join(
    select(tsoa_data$subjects, subject_id, site),
    by = "subject_id"
  ) %>%
  filter(
    site != "1" | percent_rank(timepoint_rank) <= 0.25,
    .by = "site"
  ) %>%
  select(-site)

# Process the study with LOF as the only time-series feature and with site
# and patient optimization switched on.
ls_tsoa_opt <- process_a_study(
  data = tsoa_data$data,
  subjects = tsoa_data$subjects,
  parameters = tsoa_data$parameters,
  custom_timeseries = tsoa_data$custom_timeseries,
  custom_reference_groups = tsoa_data$custom_reference_groups,
  default_timeseries_features_to_calculate = "lof",
  default_minimum_timepoints_per_series = 3,
  default_minimum_subjects_per_series = 3,
  default_max_share_missing_timepoints_per_series = 0.5,
  default_generate_change_from_baseline = FALSE,
  autogenerate_timeseries = TRUE,
  optimize_sites_and_patients = TRUE
)
Created on 2023-11-27 with reprex v2.0.2
I didn't address this in the latest version but no objections if it works nevertheless :) At least I haven't noticed the error in any of the real data runs I have made.
The current simulated test data was generated without setting the seed, so it is not reproducible. If we set the seed, the error reappears:
suppressPackageStartupMessages(library(tsoa))
## code to prepare `DATASET` dataset goes here
library(dplyr)
library(tibble)
library(purrr)
library(tidyr)
# Simulate a region > country > site > subject > timepoint hierarchy.
# The seed is fixed so that the generated data set is reproducible.
set.seed(1)

# All 325 two-letter combinations ("AB", "AC", ...); a row's timepoint rank is
# used as the index into this vector to obtain its timepoint name.
timepoint_names <- combn(LETTERS, 2, FUN = paste, collapse = "")
region_count <- 3

# Every level draws a Poisson-distributed number of children per parent row
# and expands that count into one row per child via map() + unnest().
# seq_len() is used for the expansion so that a draw of 0 yields an empty
# vector (unnest() then drops the childless parent row). seq(1, 0) would
# instead count down to c(1, 0), inventing two children and an index of 0
# that LETTERS[...] silently discards, misaligning the generated labels.
ts <- tibble::tibble(
  region = seq_len(region_count)
) %>%
  dplyr::mutate(
    region = LETTERS[region],
    country = rpois(nrow(.), lambda = 3),
    country = purrr::map(country, seq_len)
  ) %>%
  tidyr::unnest(country) %>%
  dplyr::mutate(
    # Country label = region letter + country letter, e.g. "AB".
    country = paste0(region, LETTERS[country]),
    site = rpois(nrow(.), lambda = 4),
    site = purrr::map(site, seq_len)
  ) %>%
  tidyr::unnest(site) %>%
  dplyr::mutate(
    # Site label = country label + site letter, e.g. "ABC".
    site = paste0(country, LETTERS[site]),
    subject_id = rpois(nrow(.), lambda = 5),
    subject_id = purrr::map(subject_id, seq_len)
  ) %>%
  tidyr::unnest(subject_id) %>%
  dplyr::mutate(
    # Replace the within-site counter by an id that is unique across the
    # whole data set, stored as character.
    subject_id = row_number(),
    subject_id = as.character(subject_id)
  ) %>%
  dplyr::mutate(
    timepoint_rank = rpois(nrow(.), lambda = 20),
    timepoint_rank = purrr::map(timepoint_rank, seq_len)
  ) %>%
  tidyr::unnest(timepoint_rank) %>%
  dplyr::mutate(
    timepoint_1_name = timepoint_names[timepoint_rank]
  )
# Simulate the measurements of one parameter for every row of `ts`.
#
# Each distinct subject gets its own mean (drawn from N(30, 5)) and its own
# standard deviation (drawn from U(1, 10)). Every row then receives a single
# normal draw from its subject's distribution, and each result is
# independently replaced by NA when a uniform draw is <= 0.3.
#
# ts:   data frame with a `subject_id` column.
# name: value written to the `parameter_id` column of the output.
#
# Returns `ts` with the additional columns `result` and `parameter_id`.
get_parameter <- function(ts, name) {
  # Per-subject distribution parameters.
  subject_params <- dplyr::distinct(ts, subject_id)
  n_subjects <- nrow(subject_params)
  subject_params <- dplyr::mutate(
    subject_params,
    avg = rnorm(n_subjects, 30, 5),
    sd = runif(n_subjects, min = 1, max = 10)
  )

  simulated <- dplyr::left_join(ts, subject_params, by = "subject_id")
  n_rows <- nrow(simulated)

  # One draw per row first, then the missingness mask, so the random number
  # stream is consumed in a fixed order.
  simulated <- dplyr::mutate(
    simulated,
    result = purrr::map2_dbl(avg, sd, ~ rnorm(1, .x, .y)),
    result = ifelse(runif(n_rows, 0, 1) <= 0.3, NA, result),
    parameter_id = name
  )

  # The distribution parameters are helper columns only.
  dplyr::select(simulated, -avg, -sd)
}
# Simulate two parameters on the same subject/timepoint skeleton.
param1 <- get_parameter(ts, "param1")
param2 <- get_parameter(ts, "param2")

# Measurement table: both parameters stacked, a constant second timepoint
# name and an empty baseline added, and the site hierarchy columns removed
# (they are kept in `subjects` instead).
data <- bind_rows(param1, param2) %>%
  mutate(
    timepoint_2_name = "timepoint name 2",
    baseline = NA
  ) %>%
  select(-c(site, country, region))

# One row per parameter; all per-parameter overrides are left as NA.
parameters <- distinct(data, parameter_id) %>%
  mutate(
    parameter_name = parameter_id,
    parameter_category_1 = "category 1",
    parameter_category_2 = "category 2",
    parameter_category_3 = "category 3",
    time_point_count_min = NA,
    subject_count_min = NA,
    max_share_missing = NA,
    generate_change_from_baseline = NA,
    timeseries_features_to_calculate = NA,
    use_only_custom_timeseries = FALSE
  )

# One row per subject together with its site, country and region.
subjects <- distinct(ts, subject_id, site, country, region)

# Zero-row tables: no custom time series and no custom reference groups.
custom_timeseries <- tibble(
  timeseries_id = character(0),
  parameter_id = character(0),
  timepoint_combo = character(0)
)

custom_reference_groups <- tibble(
  parameter_id = character(0),
  feature = character(0),
  ref_group = character(0)
)

# Bundle everything into a single list.
tsoa_data <- list(
  data = data,
  parameters = parameters,
  subjects = subjects,
  custom_timeseries = custom_timeseries,
  custom_reference_groups = custom_reference_groups
)
# Attach each subject's site, then filter per site: rows of every other site
# are kept as they are, rows of site "1" are kept only when their timepoint
# rank lies in the lowest quarter (percent rank <= 0.25). The helper column
# `site` is dropped again afterwards.
tsoa_data$data <- tsoa_data$data %>%
  left_join(
    select(tsoa_data$subjects, subject_id, site),
    by = "subject_id"
  ) %>%
  filter(
    site != "1" | percent_rank(timepoint_rank) <= 0.25,
    .by = "site"
  ) %>%
  select(-site)

# Process the study with LOF as the only time-series feature and with site
# and patient optimization switched on.
ls_tsoa_opt <- process_a_study(
  data = tsoa_data$data,
  subjects = tsoa_data$subjects,
  parameters = tsoa_data$parameters,
  custom_timeseries = tsoa_data$custom_timeseries,
  custom_reference_groups = tsoa_data$custom_reference_groups,
  default_timeseries_features_to_calculate = "lof",
  default_minimum_timepoints_per_series = 3,
  default_minimum_subjects_per_series = 3,
  default_max_share_missing_timepoints_per_series = 0.5,
  default_generate_change_from_baseline = FALSE,
  autogenerate_timeseries = TRUE,
  optimize_sites_and_patients = TRUE
)
#> Error in `mutate()`:
#> ℹ In argument: `ts_features = list(...)`.
#> ℹ In row 6.
#> Caused by error in `lof()`:
#> ! NAs not allowed in dist for LOF!
#> Backtrace:
#> ▆
#> 1. ├─tsoa::process_a_study(...)
#> 2. │ └─... %>% ungroup() at tsoa/R/tsoa.R:134:2
#> 3. ├─dplyr::ungroup(.)
#> 4. ├─dplyr::mutate(...)
#> 5. ├─dplyr:::mutate.data.frame(...)
#> 6. │ └─dplyr:::mutate_cols(.data, dplyr_quosures(...), by)
#> 7. │ ├─base::withCallingHandlers(...)
#> 8. │ └─dplyr:::mutate_col(dots[[i]], data, mask, new_columns)
#> 9. │ └─mask$eval_all_mutate(quo)
#> 10. │ └─dplyr (local) eval()
#> 11. ├─tsoa:::calculate_ts_features(...)
#> 12. │ └─tsoa:::calculate_lof(origvalues_distances) at tsoa/R/tsoa.R:567:4
#> 13. │ └─dbscan::lof(...) at tsoa/R/tsoa.R:461:2
#> 14. │ └─base::stop("NAs not allowed in dist for LOF!")
#> 15. └─base::.handleSimpleError(...)
#> 16. └─dplyr (local) h(simpleError(msg, call))
#> 17. └─rlang::abort(message, class = error_class, parent = parent, call = error_call)
Created on 2023-11-30 with reprex v2.0.2
Fixed with the latest update by Björn
Created on 2023-11-16 with reprex v2.0.2
Proposed solution: for the dist object causing the error, `anyNA(this_dist) == TRUE`, but the subject names can still be extracted. Add a check for `anyNA` to `calculate_lof` and return NA.
I am not sure, though, because I do not understand when this happens. Evidently there are also dist objects being created that contain NA values but do not cause this error.