biocore / gneiss

compositional data analysis toolbox
https://biocore.github.io/gneiss/
BSD 3-Clause "New" or "Revised" License
55 stars 28 forks source link

Getting None/NaN values in columns from `gneiss.composition.ilr_transform` #269

Closed jolespin closed 5 years ago

jolespin commented 5 years ago

I'm trying to figure out what is going wrong with this. It looks like everything is adding up but I'm getting a missing internal node it appears?

I'm trying to use the ilr_transform function with my ete3 tree that I've converted into a skbio tree.

import gneiss
from gneiss.composition import ilr_transform
import skbio
import ete3

ete3.__version__, skbio.__version__, gneiss.__version__
# ('3.1.1', '0.5.1', '0.4.4')

def name_ete_nodes(tree):
    """Label every non-leaf node of ``tree`` as ``y1``, ``y2``, ...

    Nodes are visited in whatever order ``tree.traverse()`` yields them, and
    the counter only advances on nodes for which ``is_leaf()`` is false, so
    leaf names are left untouched. Any existing internal-node name is
    overwritten.

    The tree is modified in place; the same object is returned for
    convenience.
    """
    # Lazy filter: each internal node is renamed as the traversal reaches it.
    internal_nodes = (
        candidate for candidate in tree.traverse() if not candidate.is_leaf()
    )
    for position, internal in enumerate(internal_nodes, start=1):
        internal.name = f"y{position}"
    return tree

def ete_to_skbio(tree):
    """Convert an ete3 tree into a ``skbio.TreeNode``.

    The internal nodes of ``tree`` are first named by ``name_ete_nodes``
    (which renames them in place), then the tree is serialised to a Newick
    string with ``tree.write(format=1, format_root_node=True)`` and that
    string is parsed by ``skbio.TreeNode.read``.

    Polytomies are carried over unchanged: nothing here resolves them, so a
    caller that needs a strictly bifurcating tree must handle that
    separately.

    Returns the object produced by ``skbio.TreeNode.read``.
    """
    # Imported locally because the surrounding script never brings StringIO
    # into scope; without this the call below raises NameError.
    from io import StringIO

    tree = name_ete_nodes(tree)
    # NOTE(review): format=1 with format_root_node=True is presumably what
    # keeps the internal and root node names in the output; confirm against
    # the ete3 documentation.
    newick_text = tree.write(format=1, format_root_node=True)
    return skbio.TreeNode.read(StringIO(newick_text))

# Data
X = pd.DataFrame({'Otu000514': {0: 87, 1: 23, 2: 135, 3: 140, 4: 137}, 'Otu000001': {0: 3802, 1: 463, 2: 2023, 3: 2012, 4: 798}, 'Otu000038': {0: 685, 1: 0, 2: 539, 3: 30, 4: 66}, 'Otu000003': {0: 2175, 1: 4420, 2: 540, 3: 4759, 4: 4769}, 'Otu000326': {0: 0, 1: 5, 2: 0, 3: 29, 4: 24}, 'Otu000002': {0: 684, 1: 2011, 2: 395, 3: 6986, 4: 2097}, 'Otu000387': {0: 463, 1: 134, 2: 399, 3: 762, 4: 125}, 'Otu000043': {0: 88, 1: 34, 2: 218, 3: 13, 4: 81}, 'Otu000051': {0: 1, 1: 12, 2: 24, 3: 0, 4: 0}, 'Otu000011': {0: 411, 1: 705, 2: 839, 3: 215, 4: 315}, 'Otu000018': {0: 203, 1: 41, 2: 399, 3: 359, 4: 77}, 'Otu000028': {0: 460, 1: 56, 2: 200, 3: 9, 4: 22}, 'Otu000029': {0: 279, 1: 13, 2: 2155, 3: 208, 4: 40}, 'Otu000008': {0: 181, 1: 790, 2: 253, 3: 1489, 4: 348}, 'Otu000558': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}, 'Otu000037': {0: 51, 1: 58, 2: 346, 3: 137, 4: 582}, 'Otu000004': {0: 1799, 1: 578, 2: 4988, 3: 343, 4: 10300}, 'Otu000502': {0: 0, 1: 0, 2: 0, 3: 3, 4: 0}, 'Otu000007': {0: 980, 1: 9, 2: 180, 3: 111, 4: 42}, 'Otu000033': {0: 114, 1: 2, 2: 219, 3: 77, 4: 42}})

# Get ete3 Tree
newick = '(((Otu000514:0.0120754,(((Otu000028:0.00630833,Otu000029:5e-09)0.74:0.0143396,(((Otu000033:0.00365568,((Otu000003:5e-09,(Otu000008:0.00412409,(Otu000326:0.0166569,Otu000002:5e-09)0.854:5e-09)0.911:0.0166884)0.79:0.0340517,(((Otu000018:5e-09,Otu000011:0.00867225)0.955:0.0391636,Otu000037:6e-09)1:0.187251,Otu000043:0.00470855)0.306:0.0154373)0.885:0.0419892)0.755:0.00952708,Otu000004:5e-09)0.836:0.0198734,(Otu000051:0.0249113,Otu000038:5e-09)0.97:0.0590295)0.964:0.0635214)0.862:0.0414846,(Otu000558:0.0419508,Otu000007:0.0207856)0.835:0.00881836)0.998:0.0755549)0.76:0.00394873,Otu000502:0.033458)0.781:0.0040675,Otu000387:0.0290984,Otu000001:0.00228968);'
tree_ete3 = ete3.Tree(newick=newick)

# Overlap: collect the tree's leaf names and the table's column names as sets
# so that they can be compared
leaves = set(tree_ete3.get_leaf_names())
otus_in_dataframe = set(X.columns)
# print(len(leaves), len(otus_in_dataframe), len(set(leaves) & set(otus_in_dataframe)))
# 20 20 20  (leaf count, column count, and size of their intersection)

# Name internal nodes: name_ete_nodes labels each non-leaf node y1, y2, ...
# in place and returns the same tree
tree_ete3 = name_ete_nodes(tree_ete3)
# print(tree_ete3.get_ascii())
#          /-Otu000514
#         |
#         |         /-Otu000028
#         |      /y7
#         |     |   \-Otu000029
#         |     |
#         |     |         /-Otu000033
#         |     |        |
#         |     |        |      /-Otu000003
#         |     |        |   /y13
#       /y3     |      /y11 |  |   /-Otu000008
#      |  |     |     |  |  |   \y15
#      |  |   /y5     |  |  |     |   /-Otu000326
#      |  |  |  |     |  |  |      \y17
#      |  |  |  |     |   \y12        \-Otu000002
#      |  |  |  |     |     |
#      |  |  |  |     |     |         /-Otu000018
#      |  |  |  |   /y9     |      /y18
#      |  |  |  |  |  |     |   /y16  \-Otu000011
#      |  |  |  |  |  |     |  |  |
#      |  |  |  |  |  |      \y14  \-Otu000037
#      |   \y4  |  |  |        |
#    /y2     |   \y8  |         \-Otu000043
#   |  |     |     |  |
#   |  |     |     |   \-Otu000004
#   |  |     |     |
#   |  |     |     |   /-Otu000051
#   |  |     |      \y10
#   |  |     |         \-Otu000038
#   |  |     |
# -y1  |     |   /-Otu000558
#   |  |      \y6
#   |  |         \-Otu000007
#   |  |
#   |   \-Otu000502
#   |
#   |--Otu000387
#   |
#    \-Otu000001

# Convert to skbio (the named ete3 tree is passed through ete_to_skbio)
tree_skbio = ete_to_skbio(tree_ete3)
# <TreeNode, name: y1, internal node count: 17, tips count: 20>

# ILR transform; 1 is added to every count in X before the call
# NOTE(review): presumably a pseudocount so that zero counts do not break the
# log-ratios; confirm
ilr_transform(X+1, tree_skbio)

image

mortonjt commented 5 years ago

It looks like this is a multifurcating tree. If you make it bifurcating, that should solve the problem, namely

tree_skbio.bifurcate()
from gneiss.util import rename_internal_nodes
tree_skbio = rename_internal_nodes(tree_skbio)

On Mon, Nov 26, 2018, 3:22 PM Josh L. Espinoza <notifications@github.com> wrote:

I'm trying to figure out what is going wrong with this. It looks like everything is adding up but I'm getting a missing internal node it appears?

I'm trying to use the ilr_transform function with my ete3 tree that I've converted into a skbio tree.

import gneissfrom gneiss.composition import ilr_transformimport skbioimport ete3

ete3.version, skbio.version, gneiss.version# ('3.1.1', '0.5.1', '0.4.4') def name_ete_nodes(tree): intermediate_node_index = 1 for node in tree.traverse(): if not node.is_leaf(): node.name = f"y{intermediate_node_index}" intermediate_node_index += 1 return tree def ete_to_skbio(tree): tree = name_ete_nodes(tree) return skbio.TreeNode.read(StringIO(tree.write(format=1, format_root_node=True)))

Data

X = pd.DataFrame({'Otu000514': {0: 87, 1: 23, 2: 135, 3: 140, 4: 137}, 'Otu000001': {0: 3802, 1: 463, 2: 2023, 3: 2012, 4: 798}, 'Otu000038': {0: 685, 1: 0, 2: 539, 3: 30, 4: 66}, 'Otu000003': {0: 2175, 1: 4420, 2: 540, 3: 4759, 4: 4769}, 'Otu000326': {0: 0, 1: 5, 2: 0, 3: 29, 4: 24}, 'Otu000002': {0: 684, 1: 2011, 2: 395, 3: 6986, 4: 2097}, 'Otu000387': {0: 463, 1: 134, 2: 399, 3: 762, 4: 125}, 'Otu000043': {0: 88, 1: 34, 2: 218, 3: 13, 4: 81}, 'Otu000051': {0: 1, 1: 12, 2: 24, 3: 0, 4: 0}, 'Otu000011': {0: 411, 1: 705, 2: 839, 3: 215, 4: 315}, 'Otu000018': {0: 203, 1: 41, 2: 399, 3: 359, 4: 77}, 'Otu000028': {0: 460, 1: 56, 2: 200, 3: 9, 4: 22}, 'Otu000029': {0: 279, 1: 13, 2: 2155, 3: 208, 4: 40}, 'Otu000008': {0: 181, 1: 790, 2: 253, 3: 1489, 4: 348}, 'Otu000558': {0: 0, 1: 0, 2: 0, 3: 0, 4: 0}, 'Otu000037': {0: 51, 1: 58, 2: 346, 3: 137, 4: 582}, 'Otu000004': {0: 1799, 1: 578, 2: 4988, 3: 343, 4: 10300}, 'Otu000502': {0: 0, 1: 0, 2: 0, 3: 3, 4: 0}, 'Otu000007': {0: 980, 1: 9, 2: 180, 3: 111, 4: 42}, 'Otu000033': {0: 114, 1: 2, 2: 219, 3: 77, 4: 42}})

Get ete3 Tree

newick = '(((Otu000514:0.0120754,(((Otu000028:0.00630833,Otu000029:5e-09)0.74:0.0143396,(((Otu000033:0.00365568,((Otu000003:5e-09,(Otu000008:0.00412409,(Otu000326:0.0166569,Otu000002:5e-09)0.854:5e-09)0.911:0.0166884)0.79:0.0340517,(((Otu000018:5e-09,Otu000011:0.00867225)0.955:0.0391636,Otu000037:6e-09)1:0.187251,Otu000043:0.00470855)0.306:0.0154373)0.885:0.0419892)0.755:0.00952708,Otu000004:5e-09)0.836:0.0198734,(Otu000051:0.0249113,Otu000038:5e-09)0.97:0.0590295)0.964:0.0635214)0.862:0.0414846,(Otu000558:0.0419508,Otu000007:0.0207856)0.835:0.00881836)0.998:0.0755549)0.76:0.00394873,Otu000502:0.033458)0.781:0.0040675,Otu000387:0.0290984,Otu000001:0.00228968);' tree_ete3 = ete3.Tree(newick=newick)

Overlap

leaves = set(tree_ete3.get_leaf_names()) otus_in_dataframe = set(X.columns)# print(len(leaves), len(otus_in_dataframe), len(set(leaves) & set(otus_in_dataframe)))# 20 20 20

Name internal nodes

tree_ete3 = name_ete_nodes(tree_ete3)# print(tree_ete3.get_ascii())# /-Otu000514# |# | /-Otu000028# | /y7# | | -Otu000029# | |# | | /-Otu000033# | | |# | | | /-Otu000003# | | | /y13# /y3 | /y11 | | /-Otu000008# | | | | | | \y15# | | /y5 | | | | /-Otu000326# | | | | | | | \y17# | | | | | \y12 -Otu000002# | | | | | |# | | | | | | /-Otu000018# | | | | /y9 | /y18# | | | | | | | /y16 -Otu000011# | | | | | | | | |# | | | | | | \y14 -Otu000037# | \y4 | | | |# /y2 | \y8 | -Otu000043# | | | | |# | | | | -Otu000004# | | | |# | | | | /-Otu000051# | | | \y10# | | | -Otu000038# | | |# -y1 | | /-Otu000558# | | \y6# | | -Otu000007# | |# | -Otu000502# |# |--Otu000387# |# -Otu000001

Convert to skbio

tree_skbio = ete_to_skbio(tree_ete3)# <TreeNode, name: y1, internal node count: 17, tips count: 20>

ILR transform

ilr_transform(X+1, tree_skbio)

[image: image] https://user-images.githubusercontent.com/9061708/49048195-c22a4400-f18e-11e8-83fb-9d706c5744ea.png

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/biocore/gneiss/issues/269, or mute the thread https://github.com/notifications/unsubscribe-auth/AD_a3Qk8T7MPM9Wh2rffIXMhqRfJjjKzks5uzHeogaJpZM4Y0BN_ .

jolespin commented 5 years ago

That's exactly what I needed. Thanks Jamie!

tree_ete3.resolve_polytomy()