For the last 7 months, any time one has a study with new APC synonyms (or new APNI names), one has to manually rebuild the taxon_list to include a few additional lines of information from the taxonomic resources files. This has led to increasing conflicts between branches, where the same synonym is added twice (causing data duplication later) or alternatively synonyms aren't being fully updated to their current names.
I propose having the taxon_list a priori include all APC taxa (accepted names and all the various synonyms, orthographic variants, etc.). Additional APNI names (quite few) may still need to be added manually, as will new "names" that are genus-level identifications for a specific dataset. I used the following code to rebuild the taxon list, but at the moment I am just keeping a local copy of this taxon list to avoid a large commit.
# Top section: only needs to be re-run when new APC / APNI lists are uploaded.
# It adds all APC taxa to taxon_list, instead of constantly needing to add new
# synonyms as they are used.
#
# `target` is the priority order of APC taxonomic statuses (most preferred
# first). It is used as the factor levels when sorting APC rows, so that the
# first row kept for each canonical name carries its preferred status.
# NOTE: every entry must match the `taxonomicStatus` spelling exactly. A status
# that is missing from this vector becomes NA in factor() and is sorted last by
# arrange(), i.e. it silently gets the lowest priority.
target <- c(
  "accepted",
  "taxonomic synonym",
  "nomenclatural synonym",
  "orthographic variant",
  "basionym",
  "pro parte taxonomic synonym",
  "alternative name",
  "misapplied",
  "pro parte misapplied",
  "replaced synonym",
  "doubtful taxonomic synonym",
  "doubtful pro parte taxonomic synonym",
  "doubtful misapplied",
  "doubtful pro parte misapplied",
  "excluded"
)
# Build `all_APC`: one row per APC canonical name, with the columns of
# taxon_list. Reads `taxonomic_resources$APC` and the status priority vector
# `target`.
all_APC <- taxonomic_resources$APC %>%
  # Order rows by status priority so the preferred status comes first per name.
  arrange(factor(taxonomicStatus, levels = target)) %>%
  distinct(canonicalName, taxonomicStatus, .keep_all = TRUE) %>%
  group_by(canonicalName) %>%
  mutate(
    # Highest-priority status recorded for this name.
    cleaned_name_taxonomic_status = first(taxonomicStatus),
    # All statuses recorded for this name, in priority order.
    cleaned_name_alternative_taxonomic_status =
      paste(taxonomicStatus, collapse = " | "),
    # A name with a single status has no alternatives. The typed NA keeps the
    # column character even when every row in a group is set to missing.
    cleaned_name_alternative_taxonomic_status = ifelse(
      cleaned_name_alternative_taxonomic_status == taxonomicStatus,
      NA_character_,
      cleaned_name_alternative_taxonomic_status
    ),
    # "accepted" is dropped from the list of alternative statuses.
    cleaned_name_alternative_taxonomic_status = stringr::str_replace(
      cleaned_name_alternative_taxonomic_status, "accepted \\| ", ""
    ),
    cleaned_name_alternative_taxonomic_status = stringr::str_replace(
      cleaned_name_alternative_taxonomic_status, " \\| accepted", ""
    )
  ) %>%
  ungroup() %>%
  # Keep only the first (highest-priority) row for each canonical name.
  distinct(canonicalName, .keep_all = TRUE) %>%
  dplyr::select(
    cleaned_name = canonicalName,
    cleaned_scientific_name_id = scientificNameID,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxonomic_reference = datasetName,
    scientific_name_id = acceptedNameUsageID,
    taxon_id = taxonID,
    taxonomic_status = taxonomicStatus,
    scientific_name = scientificName,
    scientific_name_authorship = scientificNameAuthorship,
    family,
    taxon_distribution = taxonDistribution,
    taxon_rank = taxonRank,
    taxon_name_tmp = acceptedNameUsage
  ) %>%
  filter(
    taxon_rank %in% c(
      "Familia", "Genus", "Species", "Subspecies",
      "Varietas", "Forma", "Nothovarietas"
    )
  ) %>%
  dplyr::mutate(
    # Look up the canonical name of the accepted name usage by matching it
    # against the full scientific names in APC.
    taxon_name = taxonomic_resources$APC$canonicalName[
      match(taxon_name_tmp, taxonomic_resources$APC$scientificName)
    ],
    # Strip *every* "doubtfully " qualifier, so that each
    # "native and doubtfully naturalised" entry is counted as
    # "native and naturalised" below (str_replace() would only strip the first).
    distribution_tmp = stringr::str_replace_all(
      .data$taxon_distribution, "doubtfully ", ""
    ),
    count_naturalised = stringr::str_count(.data$distribution_tmp, "naturalised"),
    count_n_and_n = stringr::str_count(
      .data$distribution_tmp, "native and naturalised"
    ),
    # Number of comma-separated entries in the distribution string
    # (presumably one per state/territory; confirm against the APC export).
    count_states = stringr::str_count(.data$distribution_tmp, ",") + 1,
    # Naturalised somewhere, and nowhere flagged "native and naturalised".
    establishment_means = ifelse(
      .data$count_naturalised > 0 & .data$count_n_and_n == 0,
      "naturalised",
      NA
    ),
    # Flagged "native and naturalised" somewhere, or naturalised in only some
    # of the listed entries (the remaining entries carry no "naturalised").
    establishment_means = ifelse(
      .data$count_n_and_n > 0 |
        (.data$count_naturalised > 0 &
          .data$count_states > .data$count_naturalised),
      "native and naturalised",
      .data$establishment_means
    ),
    # No "naturalised" anywhere in the distribution string.
    establishment_means = ifelse(
      .data$count_naturalised == 0 & .data$count_n_and_n == 0,
      "native",
      .data$establishment_means
    )
  ) %>%
  # Final column set and order of taxon_list; drops the temporary columns.
  dplyr::select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )
# Everything from here on must be re-run each time a study with genus-level
# identifications is added.
# XX TODO: also add code that appends new APNI names to taxon_list; this has
# not been done regularly so far.
taxon_list <- read_csv("config/taxon_list.csv")

# APNI names already in taxon_list that are not covered by the APC table,
# one row per cleaned name.
APNI <- taxon_list %>%
  filter(
    taxonomic_reference == "APNI",
    !cleaned_name %in% all_APC$cleaned_name
  ) %>%
  distinct(cleaned_name, .keep_all = TRUE) %>%
  select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )

# APC-referenced entries already in taxon_list whose cleaned name contains a
# "[" (the bracketed, dataset-specific names).
extras <- taxon_list %>%
  filter(
    taxonomic_reference == "APC",
    stringr::str_detect(cleaned_name, "\\[")
  ) %>%
  select(
    cleaned_name,
    taxonomic_reference,
    cleaned_scientific_name_id,
    cleaned_name_taxonomic_status,
    cleaned_name_alternative_taxonomic_status,
    taxon_name,
    taxon_id,
    scientific_name_authorship,
    taxon_rank,
    taxonomic_status,
    family,
    taxon_distribution,
    establishment_means,
    scientific_name,
    scientific_name_id
  )
# Taxa in the compiled AusTraits taxa table whose name contains a "[".
renamed <- austraits$taxa %>%
  filter(stringr::str_detect(taxon_name, "\\["))

# Bracketed names not yet present in `extras`, reshaped to the taxon_list
# columns.
to_add <- renamed %>%
  filter(!taxon_name %in% extras$cleaned_name) %>%
  mutate(
    # Keep the full bracketed name as the cleaned name *before* taxon_name is
    # overwritten with its first word on the next line.
    cleaned_name = taxon_name,
    taxon_name = word(taxon_name, 1),
    cleaned_scientific_name_id = scientific_name_id,
    cleaned_name_taxonomic_status = taxonomic_status,
    # These fields are left empty for these entries.
    cleaned_name_alternative_taxonomic_status = NA,
    taxon_distribution = NA,
    establishment_means = NA
  ) %>%
  select(
    cleaned_name, taxonomic_reference, cleaned_scientific_name_id,
    cleaned_name_taxonomic_status, cleaned_name_alternative_taxonomic_status,
    taxon_name, taxon_id, scientific_name_authorship,
    taxon_rank, taxonomic_status, family,
    taxon_distribution, establishment_means,
    scientific_name, scientific_name_id
  )
# Stack the four sources (APC first, so its rows win on duplicate names),
# keep the first row per cleaned name, and overwrite the taxon list on disk.
bind_rows(all_APC, APNI, extras, to_add) %>%
  distinct(cleaned_name, .keep_all = TRUE) %>%
  write_csv("config/taxon_list.csv")
For the last 7 months, any time one has a study with new APC synonyms (or new APNI names), one has to manually rebuild the taxon_list to include a few additional lines of information from the taxonomic resources files. This has led to increasing conflicts between branches, where the same synonym is added twice (causing data duplication later) or alternatively synonyms aren't being fully updated to their current names.
I propose having the taxon_list a priori include all APC taxa (accepted names and all the various synonyms, orthographic variants, etc.). Additional APNI names (quite few) may still need to be added manually, as will new "names" that are genus-level identifications for a specific dataset. I used the following code to rebuild the taxon list, but at the moment I am just keeping a local copy of this taxon list to avoid a large commit.