iobis / dto-bioflow-3.5

0 stars 0 forks source link

Marine data in GBIF #12

Open pieterprovoost opened 7 months ago

pieterprovoost commented 7 months ago

This is related to #6. There's some old work on identifying marine datasets in GBIF here. @silasprincipe I believe you also have something?

silasprincipe commented 6 months ago

Yes, I have some code used in the MPA Europe project. The original worked well for our purposes, but was not very efficient. I tried to make some improvements in the code below, but note that there is still a big bottleneck: verifying if the species name exists on WoRMS (and thus, if it's marine or not). Maybe if you use the full export of WoRMS you can make it much faster.

### Check percentage of marine data on GBIF datasets ###

# Load packages
library(rgbif)
library(tidyverse)
library(worrms)
library(terra)
library(storr)

# Establish an area to identify potential datasets
target <- vect(ext(0, 10, 50, 60))

# Retrieve all datasets in that area
ds_list <- occ_count(facet="datasetKey", 
                     geometry = geom(target, wkt = T),
                     facetLimit=200000)

# Create a storr to hold results
st <- storr_rds("temp_count")
st_names <- storr_rds("temp_name")

# Create a function to get the species name from worms and check if is marine
# THIS COULD BE MUCH FASTER IF USING THE WORMS FULL EXPORT
ret_name <- function(all_names) {

  done <- st_names$exists(all_names)

  sci_name <- all_names[!done]

  if (length(sci_name) > 0) {
    worms_names <- try(wm_records_names(sci_name), silent = T)
    if (class(worms_names)[1] == "try-error") {
      temp <- sapply(1:length(sci_name), function(x) {st_names$set(sci_name[x], NA); return(invisible(NULL))})
    } else {
      temp <- sapply(1:length(sci_name), function(x) {
        if (nrow(worms_names[[x]]) > 0) {
          value <- worms_names[[x]]$valid_AphiaID[1]
        } else {
          value <- NA
        }
        st_names$set(sci_name[x], value)
        return(invisible(NULL))
        })
    }
  }

  return(sapply(all_names, function(x) st_names$get(x)))
}

# Retrieve species list + count per dataset
for (i in cli::cli_progress_along(1:nrow(ds_list))) {
  dataset_species <- occ_count(facet="scientificName", 
                               # or scientificName, which will return names
                               # in that case, you need to change the name retriever function
                               datasetKey = ds_list[i,1],
                               facetLimit=200000)

  # Get only genus + species for matching with worms
  dataset_species$species <- sapply(dataset_species$scientificName, function(x){
    string <- paste(str_split_1(x, pattern = " ")[1:2], collapse = " ")
    trimws(gsub("NA|,|\\.", "", string))
  })
  dataset_species$species <- unname(dataset_species$species)

  st$set(ds_list[i,1], dataset_species)
}

# Verify if it's marine
for (i in 1:nrow(ds_list)) {
  cli::cli_alert("Retrieving names for dataset {cli::bg_cyan(paste0('#', i, ': ', ds_list[i,1]))}")

  ds_data <- st$get(as.character(ds_list[i,1]))

  batches <- split(ds_data$species, ceiling(seq_along(ds_data$species)/100))

  batches_results <- list()

  for (z in cli::cli_progress_along(1:length(batches))) {
    batches_results[[z]] <- unname(ret_name(batches[[z]]))
  }

  ds_data$aphiaid <- unlist(batches_results)

  st$set(ds_list[i,1], ds_data)
}

# Get percentage of marine
ds_marine_perc <- lapply(1:nrow(ds_list), function(ind){
  ds_count <- st$get(as.character(ds_list[ind,1]))
  ds_count <- ds_count %>%
    mutate(is_marine = ifelse(is.na(ds_count$aphiaid), 0, 1)) %>%
    group_by(is_marine) %>%
    summarise(total_count = sum(count)) %>%
    ungroup() %>%
    mutate(percent_count = (total_count * 100) / sum(total_count))
  ds_count$dataset <- as.character(ds_list[ind,1])
  ds_count
})

# Results
ds_marine_perc <- bind_rows(ds_marine_perc)
ds_marine_perc