Open pieterprovoost opened 7 months ago
Yes, I have some code used in the MPA Europe project. The original worked well for our purposes, but was not very efficient. I tried to make some improvements in the code below, but note that there is still a big bottleneck: verifying if the species name exists on WoRMS (and thus, if it's marine or not). Maybe if you use the full export of WoRMS you can make it much faster.
### Check percentage of marine data on GBIF datasets ###
# Load packages
library(rgbif)
library(tidyverse)
library(worrms)
library(terra)
library(storr)
# Establish an area to identify potential datasets
target <- vect(ext(0, 10, 50, 60))
# Retrieve all datasets in that area
ds_list <- occ_count(facet="datasetKey",
geometry = geom(target, wkt = T),
facetLimit=200000)
# Create a storr to hold results
st <- storr_rds("temp_count")
st_names <- storr_rds("temp_name")
# Create a function to get the species name from worms and check if is marine
# THIS COULD BE MUCH FASTER IF USING THE WORMS FULL EXPORT
ret_name <- function(all_names) {
done <- st_names$exists(all_names)
sci_name <- all_names[!done]
if (length(sci_name) > 0) {
worms_names <- try(wm_records_names(sci_name), silent = T)
if (class(worms_names)[1] == "try-error") {
temp <- sapply(1:length(sci_name), function(x) {st_names$set(sci_name[x], NA); return(invisible(NULL))})
} else {
temp <- sapply(1:length(sci_name), function(x) {
if (nrow(worms_names[[x]]) > 0) {
value <- worms_names[[x]]$valid_AphiaID[1]
} else {
value <- NA
}
st_names$set(sci_name[x], value)
return(invisible(NULL))
})
}
}
return(sapply(all_names, function(x) st_names$get(x)))
}
# Retrieve species list + count per dataset
for (i in cli::cli_progress_along(1:nrow(ds_list))) {
dataset_species <- occ_count(facet="scientificName",
# or scientificName, which will return names
# in that case, you need to change the name retriever function
datasetKey = ds_list[i,1],
facetLimit=200000)
# Get only genus + species for matching with worms
dataset_species$species <- sapply(dataset_species$scientificName, function(x){
string <- paste(str_split_1(x, pattern = " ")[1:2], collapse = " ")
trimws(gsub("NA|,|\\.", "", string))
})
dataset_species$species <- unname(dataset_species$species)
st$set(ds_list[i,1], dataset_species)
}
# Verify if it's marine
for (i in 1:nrow(ds_list)) {
cli::cli_alert("Retrieving names for dataset {cli::bg_cyan(paste0('#', i, ': ', ds_list[i,1]))}")
ds_data <- st$get(as.character(ds_list[i,1]))
batches <- split(ds_data$species, ceiling(seq_along(ds_data$species)/100))
batches_results <- list()
for (z in cli::cli_progress_along(1:length(batches))) {
batches_results[[z]] <- unname(ret_name(batches[[z]]))
}
ds_data$aphiaid <- unlist(batches_results)
st$set(ds_list[i,1], ds_data)
}
# Get percentage of marine
ds_marine_perc <- lapply(1:nrow(ds_list), function(ind){
ds_count <- st$get(as.character(ds_list[ind,1]))
ds_count <- ds_count %>%
mutate(is_marine = ifelse(is.na(ds_count$aphiaid), 0, 1)) %>%
group_by(is_marine) %>%
summarise(total_count = sum(count)) %>%
ungroup() %>%
mutate(percent_count = (total_count * 100) / sum(total_count))
ds_count$dataset <- as.character(ds_list[ind,1])
ds_count
})
# Results
ds_marine_perc <- bind_rows(ds_marine_perc)
ds_marine_perc
This is related to #6. There's some old work on identifying marine datasets in GBIF here. @silasprincipe I believe you also have something?