The following is some experimental, unoptimised code for matching names in a phylogenetic tree in Newick format:
def substitute_name_in_tree(tree_string: str, old_name: str, new_name: str) -> str:
    """Replace every occurrence of a label in a Newick tree string.

    A label is matched only when it starts at a word boundary and is
    immediately followed by ``:`` or ``;``, so a label that is merely a
    prefix of a longer label is left untouched.

    Args:
        tree_string: The Newick tree text to edit.
        old_name: The label as it currently appears in the tree. It is
            matched literally, not as a regular expression.
        new_name: The label to insert. Spaces are converted to underscores.

    Returns:
        The tree string with all matching labels replaced.
    """
    # Note ape will not read spaces, so add underscores back to names
    replacement = new_name.replace(' ', '_')

    # Labels frequently contain regex metacharacters (e.g. the '.' in 'sp.'),
    # so the old name must be escaped to be matched literally.
    pattern = r'\b{}(?=;|:)'.format(re.escape(old_name))

    # A callable replacement keeps any backslashes in the new name literal
    # instead of having them parsed as group references / escapes.
    return re.sub(pattern, lambda _match: replacement, tree_string)
def relabel_tree():
    """Rewrite the labels of the tree in ``tree_file`` and save the result.

    Steps:
      1. Read the first line of ``tree_file`` as the tree string.
      2. Extract the labels with a regular expression and convert each one
         with ``get_binomial_from_label``.
      3. Pass the resulting table to
         ``get_accepted_info_from_names_in_column`` (``match_level='fuzzy'``)
         and round-trip the result through ``inputs/acc_name_tree.csv``.
      4. For every row, substitute the tree label with the value in the
         ``wcvp_accepted_columns['name']`` column when that value is a
         string, otherwise with the row's ``binomial_name``.
      5. Write the relabelled tree string to ``standard_tree_file``.

    Side effects: writes ``inputs/acc_name_tree.csv`` and
    ``standard_tree_file``, and prints one progress line per row.
    """
    # NOTE(review): only the first line is read, so this assumes the whole
    # tree is stored on a single line -- confirm for the input files used.
    with open(tree_file, "r") as tree_handle:
        tree_string = tree_handle.readline()

    # From https://stackoverflow.com/questions/45668107/python-regex-parsing-newick-format
    rx = r'[(),]+([^;:]+)\b'
    name_list = re.findall(rx, tree_string)
    binomial_names = [get_binomial_from_label(x) for x in name_list]
    zipped = list(zip(name_list, binomial_names))
    df = pd.DataFrame(zipped, columns=['tree_name', 'binomial_name'])

    acc_name_df = get_accepted_info_from_names_in_column(df, 'binomial_name', match_level='fuzzy')

    # The table is saved and then re-read from disk, so the loop below works
    # on the CSV representation of the matching results.
    acc_name_csv = os.path.join('inputs', 'acc_name_tree.csv')
    acc_name_df.to_csv(acc_name_csv)
    acc_name_df = pd.read_csv(acc_name_csv, index_col=0)

    accepted_name_col = wcvp_accepted_columns['name']
    n_rows = len(acc_name_df)

    # Catch words in tree string by left hand word boundaries (generic) and right hand ; or : characters
    for index, row in acc_name_df.iterrows():
        print(f'{index} out of {n_rows}')
        accepted_name = row[accepted_name_col]
        if isinstance(accepted_name, str):
            tree_string = substitute_name_in_tree(tree_string, row['tree_name'], accepted_name)
        else:
            # If not matched, use old name. This could be changed to a generic string to be dropped later.
            tree_string = substitute_name_in_tree(tree_string, row['tree_name'], row['binomial_name'])

    # Context manager guarantees the output is flushed and the handle closed.
    with open(standard_tree_file, "w") as out_handle:
        out_handle.writelines([tree_string])
However, in this case, resolution requirements may be different depending on planned phylogenetic analyses. For example, resolving species with misspelled/unknown species epithets to genera (as in 'full' matching) could cause issues if trying to induce genus-level subtrees.
The following is some experimental, unoptimised code for matching names in a phylogenetic tree in Newick format:
However, in this case, resolution requirements may be different depending on planned phylogenetic analyses. For example, resolving species with misspelled/unknown species epithets to genera (as in 'full' matching) could cause issues if trying to induce genus-level subtrees.