traitecoevo / austraits.build

Source for AusTraits
Other
16 stars 2 forks source link

Possible new scheme to build taxon.csv #736

Closed ehwenk closed 2 months ago

ehwenk commented 1 year ago

For the last 7 months, any time one has a study with new APC synonyms (or new APNI names), one has to manually rebuild the taxon_list to include a few additional lines of information from the taxonomic resources files. This has led to increasing conflicts between branches, where the same synonym is added twice (causing data duplication later) or alternatively synonyms aren't being fully updated to their current names.

I propose having the taxon_list include, a priori, all APC taxa (accepted names plus all the various synonyms, orthographic variants, etc.). Additional APNI names (quite few) may still need to be added manually, as will new "names" that are genus-level identifications for a specific dataset. I used the following code to rebuild the taxon list, but at the moment I am just keeping a local copy of this taxon list to avoid a large commit.

# The top section only needs re-running when new APC / APNI lists are uploaded.
# It adds all APC taxa to taxon_list up front, instead of adding new synonyms
# one at a time as they are used.

# Priority order of taxonomic statuses: when a canonical name occurs with more
# than one status, the status listed earliest here is treated as the primary
# one. Values must match taxonomicStatus exactly; a status not listed becomes
# NA when converted to a factor with these levels and therefore sorts last.
target <- c(
  "accepted",
  "taxonomic synonym",
  "nomenclatural synonym",
  "orthographic variant",
  "basionym",
  "pro parte taxonomic synonym",
  "alternative name",
  "misapplied",
  "pro parte misapplied",
  "replaced synonym",
  "doubtful taxonomic synonym",
  "doubtful pro parte taxonomic synonym",
  "doubtful misapplied",
  "doubtful pro parte misapplied",
  "excluded"
)

# Build one row per APC canonical name, keeping the details of its
# highest-priority usage (priority given by the order of `target`) and
# recording any other statuses the same name carries.
all_APC <- taxonomic_resources$APC %>%
  # Sort by status priority so the preferred usage of each name comes first.
  arrange(factor(taxonomicStatus, levels = target)) %>%
  distinct(canonicalName, taxonomicStatus, .keep_all = TRUE) %>%
  group_by(canonicalName) %>%
  mutate(
    cleaned_name_taxonomic_status = first(taxonomicStatus),
    # All statuses recorded for this name, in priority order.
    cleaned_name_alternative_taxonomic_status =
      paste(taxonomicStatus, collapse = " | "),
    # A name with a single status has no alternatives.
    cleaned_name_alternative_taxonomic_status = ifelse(
      cleaned_name_alternative_taxonomic_status == taxonomicStatus,
      NA,
      cleaned_name_alternative_taxonomic_status
    ),
    # Drop "accepted" from the list of alternatives.
    cleaned_name_alternative_taxonomic_status = stringr::str_replace(
      cleaned_name_alternative_taxonomic_status, "accepted \\| ", ""
    ),
    cleaned_name_alternative_taxonomic_status = stringr::str_replace(
      cleaned_name_alternative_taxonomic_status, " \\| accepted", ""
    )
  ) %>%
  ungroup() %>%
  # Keep only the first (highest-priority) row for each name.
  distinct(canonicalName, .keep_all = TRUE) %>%
  dplyr::select(
    cleaned_name = canonicalName,
    cleaned_scientific_name_id = scientificNameID,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxonomic_reference = datasetName,
    scientific_name_id = acceptedNameUsageID,
    taxon_id = taxonID,
    taxonomic_status = taxonomicStatus,
    scientific_name = scientificName,
    scientific_name_authorship = scientificNameAuthorship,
    family,
    taxon_distribution = taxonDistribution,
    taxon_rank = taxonRank,
    taxon_name_tmp = acceptedNameUsage
  ) %>%
  filter(
    taxon_rank %in% c(
      "Familia", "Genus", "Species", "Subspecies", "Varietas", "Forma",
      "Nothovarietas"
    )
  ) %>%
  dplyr::mutate(
    # Look up the canonical (author-free) form of the accepted name.
    taxon_name = taxonomic_resources$APC$canonicalName[
      match(taxon_name_tmp, taxonomic_resources$APC$scientificName)
    ],
    # Strip EVERY "doubtfully " so that each "native and doubtfully
    # naturalised" entry is counted as "native and naturalised" below;
    # str_replace() would only strip the first occurrence.
    distribution_tmp = stringr::str_replace_all(
      .data$taxon_distribution, "doubtfully ", ""
    ),
    count_naturalised = stringr::str_count(
      .data$distribution_tmp, "naturalised"
    ),
    count_n_and_n = stringr::str_count(
      .data$distribution_tmp, "native and naturalised"
    ),
    # Number of comma-separated entries in the distribution string.
    count_states = stringr::str_count(.data$distribution_tmp, ",") + 1,
    # Naturalised somewhere and never "native and naturalised".
    establishment_means = ifelse(
      .data$count_naturalised > 0 & .data$count_n_and_n == 0,
      "naturalised",
      NA
    ),
    # Explicitly both, or naturalised in only some of the listed entries.
    establishment_means = ifelse(
      .data$count_n_and_n > 0 |
        (.data$count_naturalised > 0 &
          .data$count_states > .data$count_naturalised),
      "native and naturalised",
      .data$establishment_means
    ),
    # No mention of "naturalised" at all.
    establishment_means = ifelse(
      .data$count_naturalised == 0 & .data$count_n_and_n == 0,
      "native",
      .data$establishment_means
    )
  ) %>%
  select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )

# From here down, re-run each time a study adds genus-level identifications.
# TODO: also add new APNI names to taxon_list; this has not been done regularly.

# Read every column as character. With type guessing, a column that is empty
# in the rows readr samples can be guessed as logical, which turns later text
# values into NA and makes the column incompatible with the character columns
# it is row-bound to further down.
taxon_list <- read_csv(
  "config/taxon_list.csv",
  col_types = readr::cols(.default = readr::col_character())
)

# Rows of the existing list referenced to APNI whose name is not already
# covered by the APC table; one row per cleaned_name.
APNI <- taxon_list %>%
  filter(
    taxonomic_reference == "APNI",
    !cleaned_name %in% all_APC$cleaned_name
  ) %>%
  distinct(cleaned_name, .keep_all = TRUE) %>%
  select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )

# Rows of the existing list referenced to APC whose cleaned_name contains "["
# (presumably the dataset-specific genus-level identifications - confirm).
extras <- taxon_list %>%
  filter(
    taxonomic_reference == "APC",
    stringr::str_detect(cleaned_name, "\\[")
  ) %>%
  select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )

# Taxa in the compiled austraits object whose taxon_name contains "[".
renamed <- filter(
  austraits$taxa,
  stringr::str_detect(taxon_name, "\\[")
)

# Build taxon_list rows for the bracketed names in `renamed` that are not
# already among the bracketed entries kept from the existing list (`extras`).
to_add <- renamed %>%
  filter(!taxon_name %in% extras$cleaned_name) %>%
  mutate(
    # Typed NAs keep these columns character even when no rows remain; a bare
    # NA gives a logical column, which cannot be row-bound to character
    # columns once it has zero rows.
    cleaned_name_alternative_taxonomic_status = NA_character_,
    # Order matters: copy the full name before taxon_name is overwritten.
    cleaned_name = taxon_name,
    # First word of the name only.
    taxon_name = stringr::word(taxon_name, 1),
    taxon_distribution = NA_character_,
    establishment_means = NA_character_,
    cleaned_scientific_name_id = scientific_name_id,
    cleaned_name_taxonomic_status = taxonomic_status
  ) %>%
  select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )

# Stack the four tables (APC first, so its row wins when a cleaned_name is
# repeated), keep one row per cleaned_name and overwrite the taxon list file.
bind_rows(all_APC, APNI, extras, to_add) %>%
  distinct(cleaned_name, .keep_all = TRUE) %>%
  write_csv("config/taxon_list.csv")