ropensci / CoordinateCleaner

Automated flagging of common spatial and temporal errors in biological and palaeontological collection data, for the use in conservation, ecology and palaeontology.
https://docs.ropensci.org/CoordinateCleaner/
79 stars 21 forks source link

Issue in cd_round when ds >1 #71

Open pepbioalerts opened 2 years ago

pepbioalerts commented 2 years ago

I found an error when the number of datasets to test is >1, and the columns do not match when some datasets cannot run the tests, while for some others you can. I attach here some bug fix hoping it helps

cc_round_v2<- function (x, lon = "decimallongitude", lat = "decimallatitude", ds = "dataset", T1 = 7, reg_out_thresh = 2, reg_dist_min = 0.1, reg_dist_max = 2, min_unique_ds_size = 4, graphs = F, test = "both", value = "flagged", verbose = TRUE) {

window_size <- 10 detection_rounding <- 2 detection_threshold <- 6 digit_round <- 0 nc <- 3000 rarefy <- FALSE match.arg(value, choices = c("flagged", "clean", "dataset","flagged2")) if (verbose) { message("Testing for rasterized collection") } if (length(unique(x[[ds]])) > 1) { dat <- split(x, f = x[[ds]]) out <- lapply(dat, function(k) {

  tester <- k[complete.cases(k[, c(lon, lat)]), ]
  if (nrow(tester[!duplicated(tester[, c(lon, lat)]), 
  ]) < min_unique_ds_size) {
    #adding info on the dataset with a problem
    warning(paste0 (unique(k[[ds]])," :Dataset smaller than minimum test size"))
    # out <- data.frame(dataset = unique(x[[ds]]), 
    #                   n.outliers = NA, n.regular.outliers = NA, 
    #                   regular.distance = NA, summary = NA)
    out <- data.frame(dataset = unique(k[[ds]]),
                      n.outliers = NA, n.regular.outliers = NA,
                      regular.distance = NA, summary = NA)

  }
  else {
    if (test == "lon") {
      gvec <- CoordinateCleaner:::.CalcACT(data = k[[lon]], digit_round = digit_round, 
                                           nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
      n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, 
                                               detection_rounding = detection_rounding, 
                                               detection_threshold = detection_threshold, 
                                               graphs = graphs)
      n_outl$flag <- !all(n_outl$n.outliers > 0, 
                          n_outl$regular.distance >= reg_dist_min, 
                          n_outl$regular.distance <= reg_dist_max, 
                          n_outl$n.regular.outliers >= reg_out_thresh)
      if (graphs) {
        title(paste(unique(k[[ds]]), n_outl$flag, 
                    sep = " - "))
      }
      n_outl <- data.frame(unique(k[[ds]]), n_outl)
      names(n_outl) <- c("dataset", "lon.n.outliers", 
                         "lon.n.regular.distance", "lon.regular.distance", 
                         "summary")
    }
    if (test == "lat") {
      gvec <- CoordinateCleaner:::.CalcACT(data = k[[lat]], digit_round = digit_round, 
                                           nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
      n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, 
                                               detection_rounding = detection_rounding, 
                                               detection_threshold = detection_threshold, 
                                               graphs = graphs)
      n_outl$flag <- !all(n_outl$n.outliers > 0, 
                          n_outl$regular.distance >= reg_dist_min, 
                          n_outl$regular.distance <= reg_dist_max, 
                          n_outl$n.regular.outliers >= reg_out_thresh)
      if (graphs) {
        title(paste(unique(k[[ds]]), n_outl$flag, 
                    sep = " - "))
      }
      n_outl <- data.frame(unique(k[[ds]]), n_outl)
      names(n_outl) <- c("dataset", "lat.n.outliers", 
                         "lat.n.regular.distance", "lat.regular.distance", 
                         "summary")
    }
    if (test == "both") {
      gvec1 <- CoordinateCleaner:::.CalcACT(data = k[[lon]], digit_round = digit_round, 
                                            nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
      n_outl_lon <- CoordinateCleaner:::.OutDetect(gvec1, T1 = T1, window_size = window_size, 
                                                   detection_rounding = detection_rounding, 
                                                   detection_threshold = detection_threshold, 
                                                   graphs = graphs)
      n_outl_lon$flag <- !all(n_outl_lon$n.outliers > 
                                0, n_outl_lon$regular.distance >= reg_dist_min, 
                              n_outl_lon$regular.distance <= reg_dist_max, 
                              n_outl_lon$n.regular.outliers >= reg_out_thresh)
      if (graphs) {
        title(paste(unique(k[[ds]]), n_outl_lon$flag, 
                    sep = " - "))
      }
      gvec2 <- CoordinateCleaner:::.CalcACT(data = k[[lat]], digit_round = digit_round, 
                                            nc = nc, graphs = graphs, graph_title = unique(k[[ds]]))
      n_outl_lat <- CoordinateCleaner:::.OutDetect(gvec2, T1 = T1, window_size = window_size, 
                                                   detection_rounding = detection_rounding, 
                                                   detection_threshold = detection_threshold, 
                                                   graphs = graphs)
      n_outl_lat$flag <- !all(n_outl_lat$n.outliers > 
                                0, n_outl_lat$regular.distance >= reg_dist_min, 
                              n_outl_lat$regular.distance <= reg_dist_max, 
                              n_outl_lat$n.regular.outliers >= reg_out_thresh)
      if (graphs) {
        title(paste(unique(k[[ds]]), n_outl_lat$flag, 
                    sep = " - "))
      }
      n_outl <- cbind(unique(k[[ds]]), n_outl_lon, 
                      n_outl_lat)
      names(n_outl) <- c("dataset", "lon.n.outliers", 
                         "lon.n.regular.outliers", "lon.regular.distance", 
                         "lon.flag", "lat.n.outliers", "lat.n.regular.outliers", 
                         "lat.regular.distance", "lat.flag")
      n_outl$summary <- n_outl$lon.flag | n_outl$lat.flag
    }
    return(n_outl)
  }
})
#out <- do.call("rbind.data.frame", out)
out <- do.call("bind_rows", out)

} else { if (nrow(x[!duplicated(x[, c(lon, lat)]), ]) < min_unique_ds_size) { warning("Dataset smaller than minimum test size") out <- data.frame(dataset = unique(x[[ds]]), n.outliers = NA, n.regular.outliers = NA, regular.distance = NA, summary = NA) } else { if (test == "lon") { gvec <- CoordinateCleaner:::.CalcACT(data = x[[lon]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl$flag <- !all(n_outl$n.outliers > 0, n_outl$regular.distance >= reg_dist_min, n_outl$regular.distance <= reg_dist_max, n_outl$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl) names(n_outl) <- c("dataset", "lon.n.outliers", "lon.n.regular.distance", "lon.regular.distance", "summary") } if (test == "lat") { gvec <- CoordinateCleaner:::.CalcACT(data = x[[lat]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl <- CoordinateCleaner:::.OutDetect(gvec, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl$flag <- !all(n_outl$n.outliers > 0, n_outl$regular.distance >= reg_dist_min, n_outl$regular.distance <= reg_dist_max, n_outl$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl) names(n_outl) <- c("dataset", "lat.n.outliers", "lat.n.regular.distance", "lat.regular.distance", "summary") } if (test == "both") { gvec1 <- CoordinateCleaner:::.CalcACT(data = x[[lon]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl_lon <- CoordinateCleaner:::.OutDetect(gvec1, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl_lon$flag <- !all(n_outl_lon$n.outliers > 0, n_outl_lon$regular.distance >= reg_dist_min, n_outl_lon$regular.distance <= reg_dist_max, n_outl_lon$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl_lon$flag, sep = " - ")) } gvec2 <- CoordinateCleaner:::.CalcACT(data = x[[lat]], digit_round = digit_round, nc = nc, graphs = graphs, graph_title = unique(x[[ds]])) n_outl_lat <- CoordinateCleaner:::.OutDetect(gvec2, T1 = T1, window_size = window_size, detection_rounding = detection_rounding, detection_threshold = detection_threshold, graphs = graphs) n_outl_lat$flag <- !all(n_outl_lat$n.outliers > 0, n_outl_lat$regular.distance >= reg_dist_min, n_outl_lat$regular.distance <= reg_dist_max, n_outl_lat$n.regular.outliers >= reg_out_thresh) if (graphs) { title(paste(unique(x[[ds]]), n_outl_lat$flag, sep = " - ")) } n_outl <- data.frame(unique(x[[ds]]), n_outl_lon, n_outl_lat) names(n_outl) <- c("dataset", "lon.n.outliers", "lon.n.regular.distance", "lon.regular.distance", "lon.flag", "lat.n.outliers", "lat.n.regular.distance", "lat.regular.distance", "lat.flag") n_outl$summary <- n_outl$lon.flag | n_outl$lat.flag } out <- n_outl } }

adding an option flagged2 as output value when a test has not been run (NA) , instead of having an F

out2 = merge (x,out[c('dataset','summary')], by.x= ds,by.y='dataset',all.x=T) switch(value, dataset = return(out), clean = return({ test <- x[x[[ds]] %in% out[out$summary, "dataset"], ] if (length(test) > 0) { test } else { NULL } }), flagged = return(x[[ds]] %in% out[out$summary, "dataset"]), flagged2 = return(out2$summary) ) }

AMBarbosa commented 4 months ago

I'm getting errors at all my cd_round() attempts too, and I think it's the same issue (maybe also related to #66). Here's a small reproducible example:

occs <- rgbif::occ_search(scientificName = "Daboia mauritanica")

# this runs (with ds="species", to simulate a single dataset):

CoordinateCleaner::cd_round(occs$data, lon = "decimalLongitude", lat = "decimalLatitude", ds = "species", value = "dataset")

# but this fails (with ds="datasetKey", which is what we need to do):

CoordinateCleaner::cd_round(occs$data, lon = "decimalLongitude", lat = "decimalLatitude", ds = "datasetKey", value = "dataset")

# Error in rbind.data.frame(`1c5c3e48-7fc0-4d4f-96e6-c4df2c747f34` = list( : 
#  numbers of columns of arguments do not match
# In addition: There were 15 warnings (use warnings() to see them)

warnings()

# Warning messages:
# 1: In FUN(X[[i]], ...) : Dataset smaller than minimum test size
# 2: In FUN(X[[i]], ...) : Dataset smaller than minimum test size
[...]