Open pepbioalerts opened 2 years ago
I'm getting errors at all my cd_round()
attempts too, and I think it's the same issue (maybe also related to #66). Here's a small reproducible example:
occs <- rgbif::occ_search(scientificName = "Daboia mauritanica")
# this runs (with ds="species", to simulate a single dataset):
CoordinateCleaner::cd_round(occs$data, lon = "decimalLongitude", lat = "decimalLatitude", ds = "species", value = "dataset")
# but this fails (with ds="datasetKey", which is what we need to do):
CoordinateCleaner::cd_round(occs$data, lon = "decimalLongitude", lat = "decimalLatitude", ds = "datasetKey", value = "dataset")
# Error in rbind.data.frame(`1c5c3e48-7fc0-4d4f-96e6-c4df2c747f34` = list( :
# numbers of columns of arguments do not match
# In addition: There were 15 warnings (use warnings() to see them)
warnings()
# Warning messages:
# 1: In FUN(X[[i]], ...) : Dataset smaller than minimum test size
# 2: In FUN(X[[i]], ...) : Dataset smaller than minimum test size
[...]
I found an error when the number of datasets to test is >1, and the columns do not match when some datasets cannot run the tests, while for some others you can. I attach here some bug fix hoping it helps
cc_round_v2<- function (x, lon = "decimallongitude", lat = "decimallatitude", ds = "dataset", T1 = 7, reg_out_thresh = 2, reg_dist_min = 0.1, reg_dist_max = 2, min_unique_ds_size = 4, graphs = F, test = "both", value = "flagged", verbose = TRUE) {
window_size <- 10 detection_rounding <- 2 detection_threshold <- 6 digit_round <- 0 nc <- 3000 rarefy <- FALSE match.arg(value, choices = c("flagged", "clean", "dataset","flagged2")) if (verbose) { message("Testing for rasterized collection") } if (length(unique(x[[ds]])) > 1) { dat <- split(x, f = x[[ds]]) out <- lapply(dat, function(k) {
} else { if (nrow(x[!duplicated(x[, c(lon, lat)]), ]) < min_unique_ds_size) { warning("Dataset smaller than minimum test size") out <- data.frame(dataset = unique(x[[ds]]), n.outliers = NA, n.regular.outliers = NA, regular.distance = NA, summary = NA) } else { if (test == "lon") { gvec <- CoordinateCleaner:::.CalcACT(data = x[[lon]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl$flag <- !all(n_outl$n.outliers > 0, n_outl$regular.distance >= reg_dist_min, n_outl$regular.distance <= reg_dist_max, n_outl$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl) names(n_outl) <- c("dataset", "lon.n.outliers", "lon.n.regular.distance", "lon.regular.distance", "summary") } if (test == "lat") { gvec <- CoordinateCleaner:::.CalcACT(data = x[[lat]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl$flag <- !all(n_outl$n.outliers > 0, n_outl$regular.distance >= reg_dist_min, n_outl$regular.distance <= reg_dist_max, n_outl$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl) names(n_outl) <- c("dataset", "lat.n.outliers", "lat.n.regular.distance", "lat.regular.distance", "summary") } if (test == "both") { gvec1 <- CoordinateCleaner:::.CalcACT(data = x[[lon]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl_lon <- CoordinateCleaner:::.OutDetect(gvec1, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl_lon$flag <- !all(n_outl_lon$n.outliers > 0, n_outl_lon$regular.distance >= reg_dist_min, n_outl_lon$regular.distance <= reg_dist_max, n_outl_lon$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl_lon$flag, sep = " - ")) } gvec2 <- CoordinateCleaner:::.CalcACT(data = x[[lat]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl_lat <- CoordinateCleaner:::.OutDetect(gvec2, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl_lat$flag <- !all(n_outl_lat$n.outliers > 0, n_outl_lat$regular.distance >= reg_dist_min, n_outl_lat$regular.distance <= reg_dist_max, n_outl_lat$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl_lat$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl_lon, n_outl_lat) names(n_outl) <- c("dataset", "lon.n.outliers", "lon.n.regular.distance", "lon.regular.distance", "lon.flag", "lat.n.outliers", "lat.n.regular.distance", "lat.regular.distance", "lat.flag") n_outl$summary <- n_outl$lon.flag | n_outl$lat.flag } out <- n_outl } }
adding an option flagged2 as output value when a test has not been run (NA) , instead of having an F
out2 = merge (x,out[c('dataset','summary')], by.x= ds,by.y='dataset',all.x=T) switch(value, dataset = return(out), clean = return({ test <- x[x[[ds]] %in% out[out$summary, "dataset"], ] if (length(test) > 0) { test } else { NULL } }), flagged = return(x[[ds]] %in% out[out$summary, "dataset"]), flagged2 = return(out2$summary) ) }