joey711 / phyloseq

phyloseq is a set of classes, wrappers, and tools (in R) to make it easier to import, store, and analyze phylogenetic sequencing data; and to reproducibly share that data and analysis with others. See the phyloseq front page:
http://joey711.github.io/phyloseq/
578 stars 187 forks source link

calculate top 5 most abundant genera for each site #1487

Open Avtober opened 3 years ago

Avtober commented 3 years ago

Hi I have 5 sites and would like to calculate the top five most abundant genera for each site. I have found code that does this for the who data set but not for each site/group?

This is the code if found

trimlist = names(sort(taxa_sums(pm), TRUE)[1:5])

What would I need to add to this to find the top 5 for each site? Thanks Anya

ycl6 commented 3 years ago

Hi @Avtober, below is an example and you can improve on it, e.g. run it on all your sites (types) with a loop.

library(phyloseq)
data(GlobalPatterns)

# Merges ASVs that have the same taxonomy rank (Genus)
gp = tax_glom(GlobalPatterns, taxrank = "Genus")

# Define group/condition type
type = levels(sample_data(gp)$SampleType)[1] # Feces

# Retrieve sample name of the defined type
sn = sample_names(sample_data(gp)[sample_data(gp)$SampleType == type,])

# Calculate taxa sum of the selected samples
top5 = head(sort(rowSums(otu_table(gp)[,sn]), decreasing = TRUE), 5)

# Combine count and taxonomyTable
top5 = cbind(as.data.frame(tax_table(gp)[names(top5),]), Count = top5)
> top5
        Kingdom         Phylum          Class             Order             Family            Genus Species   Count
331820 Bacteria  Bacteroidetes    Bacteroidia     Bacteroidales     Bacteroidaceae      Bacteroides    <NA> 1957463
171551 Bacteria     Firmicutes     Clostridia     Clostridiales    Ruminococcaceae Faecalibacterium    <NA>  601689
259569 Bacteria  Bacteroidetes    Bacteroidia     Bacteroidales      Rikenellaceae        Alistipes    <NA>  171096
326977 Bacteria Actinobacteria Actinobacteria Bifidobacteriales Bifidobacteriaceae  Bifidobacterium    <NA>  156041
357795 Bacteria     Firmicutes     Clostridia     Clostridiales    Lachnospiraceae        Roseburia    <NA>  138515
type = levels(sample_data(gp)$SampleType)[2] # Freshwater
sn = sample_names(sample_data(gp)[sample_data(gp)$SampleType == type,])
top5 = head(sort(rowSums(otu_table(gp)[,sn]), decreasing = TRUE), 5)
top5 = cbind(as.data.frame(tax_table(gp)[names(top5),]), Count = top5)
> top5
        Kingdom         Phylum                 Class            Order            Family            Genus Species  Count
279599 Bacteria  Cyanobacteria      Nostocophycideae       Nostocales       Nostocaceae   Dolichospermum    <NA> 915358
39294  Bacteria  Bacteroidetes         Flavobacteria Flavobacteriales Flavobacteriaceae   Flavobacterium    <NA> 133499
324539 Bacteria Proteobacteria    Betaproteobacteria  Burkholderiales  Burkholderiaceae Polynucleobacter    <NA>  50995
557211 Bacteria  Cyanobacteria Synechococcophycideae  Synechococcales  Synechococcaceae  Prochlorococcus    <NA>  48384
582407 Bacteria Proteobacteria    Betaproteobacteria  Burkholderiales    Comamonadaceae    Limnohabitans    <NA>  44654
Avtober commented 3 years ago

Great thank you @ycl6 !!

kusandeep commented 3 years ago

Hi @ycl6, Could you please take this one step further and help me to plot stacked bar graph of top 5 genus (relative abundance)..? A very beginner's request. Thanks and keep up the good work. Sandeep

ycl6 commented 3 years ago

Hi @kusandeep Instead of calculating total relatvie abundance, I am calculating average relatvie abundance (to take the number of samples in a SampleType into account). I suppose average count in the original answer will also make more sense if you want to compare between different SampleType.

The plot itself is generated simply by using ggplot2.

library(phyloseq)
library(ggplot2)
data(GlobalPatterns)

# Merges ASVs that have the same taxonomy rank (Genus)
gp = tax_glom(GlobalPatterns, taxrank = "Genus")

# Calculate relative abundance
rb = transform_sample_counts(gp, function(x) {x/sum(x)} )

# Define group/condition type
type = levels(sample_data(rb)$SampleType)[1] # Feces

# Retrieve sample name of the defined type
sn = sample_names(sample_data(rb)[sample_data(rb)$SampleType == type,])

# Calculate average taxa relative abundance of the selected samples
top5 = head(sort(rowSums(otu_table(rb)[,sn])/length(sn), decreasing = TRUE), 5)

# Combine count and taxonomyTable
top5 = cbind(as.data.frame(tax_table(rb)[names(top5),]), Frac = top5)

top5$Type = type
> top5
        Kingdom         Phylum          Class             Order             Family            Genus Species       Frac  Type
331820 Bacteria  Bacteroidetes    Bacteroidia     Bacteroidales     Bacteroidaceae      Bacteroides    <NA> 0.38863368 Feces
171551 Bacteria     Firmicutes     Clostridia     Clostridiales    Ruminococcaceae Faecalibacterium    <NA> 0.20210600 Feces
326977 Bacteria Actinobacteria Actinobacteria Bifidobacteriales Bifidobacteriaceae  Bifidobacterium    <NA> 0.07313059 Feces
223059 Bacteria     Firmicutes     Clostridia     Clostridiales    Ruminococcaceae     Ruminococcus    <NA> 0.04230443 Feces
357795 Bacteria     Firmicutes     Clostridia     Clostridiales    Lachnospiraceae        Roseburia    <NA> 0.04049559 Feces
# Plot
ggplot(top5, aes(Type,  Frac, fill = Genus)) + geom_col(color = "black", size = 1) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) + ylab("Average relative abundance")

plot_bar_average_frac

kusandeep commented 3 years ago

Many Thanks @ycl6 , However I'm trying to reproduce this and getting the error below.. at sample name step

sn = sample_names(sample_data(rb)[sample_data(rb)$SampleType == type,]) Error in h(simpleError(msg, call)) : error in evaluating the argument 'physeq' in selecting a method for function 'sample_names': error in evaluating the argument 'i' in selecting a method for function '[': object 'type' not found

Also I would like to show sample to sample variation within one sample type/group (I don't have any samples so the graph will still be readable) so wondering what's most efficient way to do this. Much appreciated for you help.

ycl6 commented 3 years ago

Hi @kusandeep , missed a line of code, I've updated my reply.

If I understood your question correctly, instead of calculate the average abundance, you can use a boxplot to show the relative abundance of all your samples of a particular sample type and your X-axis can be the 5 genera. Or if you have an example plot to demonstrate

kusandeep commented 3 years ago

Thanks for quick reply @ycl6 . See the pic below for example. image

So 5 genera would be in the legend and all other genera would ideally get grouped in "Other" (like in the picture) if it make sense..

ycl6 commented 3 years ago

Hi @kusandeep Probably something like this:

library(phyloseq)
library(ggplot2)
library(tidyverse)
data(GlobalPatterns)

# Merges ASVs that have the same taxonomy rank (Genus)
gp = tax_glom(GlobalPatterns, taxrank = "Genus")

# Calculate relative abundance
rb = transform_sample_counts(gp, function(x) {x/sum(x)} )

# Define group/condition type
type = levels(sample_data(rb)$SampleType)[1] # Feces

# Retrieve sample name of the defined type
sn = sample_names(sample_data(rb)[sample_data(rb)$SampleType == type,])

# Retrieve top5 genera names
top5 = names(head(sort(rowSums(otu_table(rb)[,sn]), decreasing = TRUE), 5)) 

# Remove unselected samples and taxa
rb = prune_samples(sample_names(rb) %in% sn, rb)
rb = prune_taxa(top5, rb)

# Build data.frame
df = cbind(data.frame(ID = c(taxa_names(rb), "Other"),
                      Genus = c(tax_table(rb)[,"Genus"], "Other")),
           rbind(otu_table(rb), 1 - colSums(otu_table(rb))))
> str(df)
'data.frame':   6 obs. of  6 variables:
 $ ID     : chr  "326977" "331820" "357795" "223059" ...
 $ Genus  : chr  "Bifidobacterium" "Bacteroides" "Roseburia" "Ruminococcus" ...
 $ M31Fcsw: num  0.000638 0.545018 0.02614 0.016306 0.126892 ...
 $ M11Fcsw: num  0.00026 0.72126 0.02197 0.01372 0.04353 ...
 $ TS28   : num  0.0408 0.1808 0.095 0.1118 0.3286 ...
 $ TS29   : num  0.2508 0.1074 0.0189 0.0274 0.3094 ...
# Order data.frame by genera abundance, "Other" placed at the last
df = df[match(c(top5, "Other"), df$ID),]
df = df %>% mutate_at(vars(Genus), factor) %>% mutate(Genus = factor(Genus, levels = df$Genus))
> str(df)
'data.frame':   6 obs. of  6 variables:
 $ ID     : chr  "331820" "171551" "326977" "223059" ...
 $ Genus  : Factor w/ 6 levels "Bacteroides",..: 1 2 3 4 5 6
 $ M31Fcsw: num  0.545018 0.126892 0.000638 0.016306 0.02614 ...
 $ M11Fcsw: num  0.72126 0.04353 0.00026 0.01372 0.02197 ...
 $ TS28   : num  0.1808 0.3286 0.0408 0.1118 0.095 ...
 $ TS29   : num  0.1074 0.3094 0.2508 0.0274 0.0189 ...

> levels(df$Genus)
[1] "Bacteroides"      "Faecalibacterium" "Bifidobacterium"  "Ruminococcus"    
[5] "Roseburia"        "Other"           
# Plot
pivot_longer(df, cols = -c(ID, Genus), names_to = "Sample", values_to = "Abundance") %>%
        ggplot(aes(Sample,  Abundance, fill = Genus)) + geom_col(color = "black") +
        scale_fill_brewer(palette = "Dark2") + scale_y_continuous(breaks = seq(0, 1, 0.1)) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
        labs(title = type, y = "Average relative abundance")

plot_bar_relabd

kusandeep commented 3 years ago

Hi @ycl6, Yes looking great. just one last things. if we want to show two or more groups/sample types like Feces and Soil in one graph (like the one in the picture, I sent in my last message), what are the changes requires in that case..? Much appreciated for the help.

kusandeep commented 3 years ago

Also When I am trying type = levels(sample_data(rb)$SampleType)[1] then the output is NULL which I think is because of, in GlobalPatterns Sample_data are stored as factor but in my own dateset it is stored as character. See the pics below. and appreciates any solution. image

ycl6 commented 3 years ago

@kusandeep It depends how you want the final output to be like. The easiest way is to generate and store the ggplot objects in variables. Then combine them into one image, like the tutorial described here.

If you want to combine the top 5 genera from each of the groups/types and create a single plot, then you will need to (1) build the individual data.frame, (2) rbind the melted/lengthened data.frames (pivot_longer) into a single data.frame, (3; optional) order the genera according to your desired order/levels, (4) then plot using ggplot.

the output is NULL

There are many ways to produce the plot that I showed above, and possibly with few lines of code. Mine was meant to show , in a rather systematic way, what was achieved at each step. You'll need to adapt the script to suit your data structure.

If you want, you can assign it with the group/type you want to show, e.g. type = "Britain", or change your sample_data so that the categorical data are stored as factor variable. Or if you want to keep it as character variable, you can use unique to get a list of unique strings. It's up to you.

kusandeep commented 3 years ago

Thanks @ycl6 for detailed instructions now I feel I have all the information needed for visualisation.

pauGuas commented 2 years ago

@ycl6 I am having trouble applying this to my data. I just need to know the top 8 genera of each sample.

abroryp2 commented 2 years ago

Hey, I am new to microbiome data analysis. In my case, I would like to make a boxplot based on specific taxa that I would like to know, for example, I would like to make a barplot but only for Roseburia. Could you help me with how to make it?

This is one of example that I have tried. bacteroides<-subset_taxa(phy.best.hit,Genus=="Roseburia"|Genus=="Bacteroides") phy.select.taxa1<-plot_listed_taxa(bacteroides, group = "group", group.colors=mycols, add.violin=FALSE, violin.opacity = 0.3, dot.opacity = 0.25, box.opacity = 0.25, panel.arrange = "wrap")+ scale_y_continuous(labels=scales::percent)

But then, I got an error massage. Error in h(simpleError(msg, call)) : error in evaluating the argument 'taxa' in selecting a method for function 'prune_taxa': argument "select.taxa" is missing, with no default.

Can you help me to figure it out?

Thanks.

Mehrdad-M3614 commented 11 months ago

Hello @ycl6 Thanks for making things here clear for calculating calculate top 5 most abundant genera. I was trying to follow your direction for my microbiome data, however, when I was trying to alculate taxa sum of the selected samples using the code: top5 <- head(sort(rowSums(otu_table(gp)[,sn]), decreasing = TRUE), 5) It gave me this Error: Error in h(simpleError(msg, call)) : error in evaluating the argument 'x' in selecting a method for function 'head': error in evaluating the argument 'x' in selecting a method for function 'sort': error in evaluating the argument 'x' in selecting a method for function 'rowSums': subscript out of bounds

Here is how I modify the code based on my data:

1 Merges ASVs that have the same taxonomy rank (Genus)

fungi.ps.mergedASVs <- tax_glom(fungi.ps.pruned, taxrank = "Phylum") fungi.ps.mergedASVs

2 check the level of variables

levels(sample_data(fungi.ps.mergedASVs)$Soil_Fraction) # [1] "Large","Particulate_Organic_Matter" "Small" class(sample_data(fungi.ps.mergedASVs)$Soil_Fraction) # "factor"

3 Check the levels of Soil_Fraction

levels(sample_data(fungi.ps.mergedASVs)$Soil_Fraction) # "Large"."Particulate_Organic_Matter","Small"

4 Define group/condition type

fungi.ps.type.Fraction <- levels(sample_data(fungi.ps.mergedASVs)$Soil_Fraction)[1] # Large

5 Retrieve sample name of the defined type

fungi.ps.Soil_Fraction.sn <- sample_names(sample_data(fungi.ps.mergedASVs)[sample_data(fungi.ps.mergedASVs)$Soil_Fraction == fungi.ps.type.Fraction,])

6 Calculate taxa sum of the selected samples/identify the top 5 taxa with the highest sum in the selected samples.

top5.Soil_Fraction <- head(sort(rowSums(otu_table(fungi.ps.mergedASVs)[,fungi.ps.Soil_Fraction.sn]), decreasing = TRUE), 5)

Here I got this Error: Error in h(simpleError(msg, call)) : error in evaluating the argument 'x' in selecting a method for function 'head': error in evaluating the argument 'x' in selecting a method for function 'sort': error in evaluating the argument 'x' in selecting a method for function 'rowSums': subscript out of bounds

I truly appreciate your help, and thanks in advance.

ycl6 commented 11 months ago

Hi @Mehrdad-M3614

You can back track and print out the results you obtain in each step to see if they make sense.

I think there might be a problem with your 5th step.

butterflyka commented 9 months ago

Hello @ycl6 Ive been trying to replicate your code as I trying to get very similar graphs, however Im running into the issue of having same Family multiple times in the top 5. Im using the code as you wrote it and I am guessing the issue comes up at merge_family(), however I dont really understand why. I can assume that the f_unclassified comes up twice as it is from different orders, but I dont understand why is it happening for f_Tylosporaceae. Could you maybe help me out? Many thanks!

image

image

ycl6 commented 9 months ago

Hi @butterflyka I can't tell without seeing the data. If you don't mind sharing a trimmed ps object containing ASVs from f_Tylosporaceae (and probably also those from f__Suillaceae) with me, I can have a look. Oh, and also your exact code used to merge taxa, perform top5 selection and print the barplot.

pdhrati02 commented 5 months ago

Hi @ycl6 , Thank you for the detailed explanation at each step in this post, I have a slightly related question. Your code about top5 for each group and rest as others works for me too. However, I have a longitudinal data set. Basically I have 10 subjects followed up over time. Now instead of top5 per time point for each subject, I wish to calculate top5 for each subject overall and mark rest as others in each case as I want to plot a sankey plot to see the trend. I tried to merge the phyloseq object by subject, but then realised that the time information will be lost and I wont be able to plot the data to show increase/decrease over time. I also tried to select names based on subject instead of sample names in phyloseq object but gives errors. I am wondering if this is possible to be done?

Can you please help me with this? Any help would be appreciated.

ycl6 commented 5 months ago

Hi @pdhrati02 Do you have a mock data or a subset of your data that you can share?

pdhrati02 commented 5 months ago

Hi @ycl6 , absolutely. I have added a small subset of the metadata and the taxa table in the excel file. example.xlsx

This is how I make the phyloseq object:

df1 <- mytaxafile metada <- mymetadatafile

metaphlanToPhyloseq <- function( tax, metadat=metada, simplenames=TRUE, roundtointeger=FALSE, split="|"){ xnames = rownames(tax) shortnames = gsub(paste0(".+\", split), "", xnames) if(simplenames){ rownames(tax) = shortnames } if(roundtointeger){ tax = round(tax * 1e4) } x2 = strsplit(xnames, split=split, fixed=TRUE) taxmat = matrix(NA, ncol=max(sapply(x2, length)), nrow=length(x2)) colnames(taxmat) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain")[1:ncol(taxmat)] rownames(taxmat) = rownames(tax) for (i in 1:nrow(taxmat)){ taxmat[i, 1:length(x2[[i]])] <- x2[[i]] } taxmat = gsub("[a-z]__", "", taxmat) taxmat = phyloseq::tax_table(taxmat) otutab = phyloseq::otu_table(tax, taxa_are_rows=TRUE) if(is.null(metadat)){ res = phyloseq::phyloseq(taxmat, otutab) }else{ res = phyloseq::phyloseq(taxmat, otutab, phyloseq::sample_data(metadat)) } return(res) }

ps1=metaphlanToPhyloseq(df1) pps1=ps1 pps1

I transform this by using transform_sample_counts and then proceed. Please do let me know if you need more details. I will be happy to share. Thank you very much for your help.

ycl6 commented 5 months ago

Hi @pdhrati02

Unrelated to your question, you may want to follow this guide here to create a code block with syntax highlighting. :wink: For example:

``` r x <- sum(c(1, 2, 3, 4)) x ```

So, I think this probably will achieve what you wanted, i.e. to obtain the top N (in the example top 5) taxa from each of your subjects with the taxa counts summed up from all the available time points from the same subject.

With the top N taxa identified, you can build the final data.frame the way you wanted. In the example I use psmelt and added a column to flag if a taxon is one of the top 5 taxa

p.s. There is a one sample that's different between your two Excel sheets, but I think that's just because this is a subset of the full data. But in general, it is still good to check if the samples and taxa from different input sources adds up.

library(readxl)
library(tidyverse)
library(phyloseq)

# Load data
path = "pdhrati02_example.xlsx"
x = excel_sheets(path) %>% set_names(c("otu","metadata")) %>% map(read_excel, path = path)

sam = as.data.frame(x$metadata)
rownames(sam) = sam$SampleID
dim(sam)
#> [1] 19  6

tax = as.data.frame(do.call(rbind, (strsplit(x$otu$clade_name, "\\|")))) %>%
        mutate(across(everything(),~ gsub("[a-z]__","", .)))
colnames(tax) = c("Kingdom","Phylum","Class","Order","Family","Genus","Species","Taxon")
rownames(tax) = tax$Taxon
dim(tax)
#> [1] 82  8

otu = as.data.frame(x$otu)
colnames(otu) = gsub("-", "_", colnames(otu))
rownames(otu) = tax$Taxon
otu = otu %>% select(-clade_name)
dim(otu)
#> [1] 82 20

# Restrict to common samples
samples = gtools::mixedsort(intersect(rownames(sam), colnames(otu)))

sam = sam[samples,]
dim(sam)
#> [1] 19  6

otu = otu[, samples]
dim(otu)
#> [1] 82 19

# Build ps
ps = phyloseq(otu_table(otu, taxa_are_rows = TRUE),
              tax_table(as.matrix(tax)), # requires matrix
              sample_data(sam))
ps # taxa are rows
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 82 taxa and 19 samples ]
#> sample_data() Sample Data:       [ 19 samples by 6 sample variables ]
#> tax_table()   Taxonomy Table:    [ 82 taxa by 8 taxonomic ranks ]

# Merge by ParticipantID
merged = merge_samples(ps, "ParticipantID")
#> Warning in asMethod(object): NAs introduced by coercion

#> Warning in asMethod(object): NAs introduced by coercion

#> Warning in asMethod(object): NAs introduced by coercion
merged # taxa are columns
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 82 taxa and 5 samples ]
#> sample_data() Sample Data:       [ 5 samples by 6 sample variables ]
#> tax_table()   Taxonomy Table:    [ 82 taxa by 8 taxonomic ranks ]

# Calculate relative abundance
ps.rb = transform_sample_counts(ps, function(x) {x/sum(x)} )
merged.rb = transform_sample_counts(merged, function(x) {x/sum(x)} )

# Retrieve top5 taxa names per ParticipantID
mat = otu_table(merged.rb)
top5 = list()
for(i in 1:nrow(mat)) {
        # Sort by relative abundance and select top 5
        top5[[rownames(mat)[i]]] = as.data.frame(t(mat[i,])) %>%
                arrange(desc(.[1])) %>% rownames %>% head(5)
}

# Print top5 list
top5
#> $`010a`
#> [1] "SGB17256"     "SGB10120"     "SGB10130"     "UNCLASSIFIED" "SGB7044"     
#> 
#> $`010b`
#> [1] "SGB7962"      "SGB82729"     "SGB6939"      "SGB10141"     "UNCLASSIFIED"
#> 
#> $`14`
#> [1] "SGB7865"      "SGB17256"     "SGB7044"      "UNCLASSIFIED" "SGB7144"     
#> 
#> $`15`
#> [1] "SGB17247"      "SGB17256"      "UNCLASSIFIED"  "SGB10165"     
#> [5] "SGB8007_group"
#> 
#> $`6`
#> [1] "SGB17256"     "SGB82729"     "SGB10183"     "SGB13896"     "UNCLASSIFIED"

# Build ps.rb data.frame
df = psmelt(ps.rb)
# Label if taxa is top5 or not
df$Top5 = FALSE
df$Top5[df$Taxon %in% unique(unlist(top5))] = TRUE

arrange(df, Sample) %>% select(c(1:9, ncol(df))) %>% head(10)
#>             OTU          Sample    Abundance        SampleID DOL ParticipantID
#> 1      SGB82729 127_006_ST_I_V1 6.093723e-01 127_006_ST_I_V1   6             6
#> 2      SGB13896 127_006_ST_I_V1 2.604270e-01 127_006_ST_I_V1   6             6
#> 3  UNCLASSIFIED 127_006_ST_I_V1 1.061460e-01 127_006_ST_I_V1   6             6
#> 4      SGB17256 127_006_ST_I_V1 2.191980e-02 127_006_ST_I_V1   6             6
#> 5       SGB7044 127_006_ST_I_V1 1.113653e-03 127_006_ST_I_V1   6             6
#> 6       SGB7865 127_006_ST_I_V1 7.744350e-04 127_006_ST_I_V1   6             6
#> 7       SGB7863 127_006_ST_I_V1 2.324020e-04 127_006_ST_I_V1   6             6
#> 8      SGB17248 127_006_ST_I_V1 1.144133e-05 127_006_ST_I_V1   6             6
#> 9      SGB16955 127_006_ST_I_V1 2.949720e-06 127_006_ST_I_V1   6             6
#> 10      EUK5480 127_006_ST_I_V1 0.000000e+00 127_006_ST_I_V1   6             6
#>    Participant Sampledetails week_eg  Top5
#> 1            6           6_6       1  TRUE
#> 2            6           6_6       1  TRUE
#> 3            6           6_6       1  TRUE
#> 4            6           6_6       1  TRUE
#> 5            6           6_6       1  TRUE
#> 6            6           6_6       1  TRUE
#> 7            6           6_6       1 FALSE
#> 8            6           6_6       1 FALSE
#> 9            6           6_6       1 FALSE
#> 10           6           6_6       1 FALSE

Created on 2024-03-21 with reprex v2.1.0

pdhrati02 commented 5 months ago

Hi @ycl6 , thank you for the guide and I will make sure to learn and follow it. And apologies for the wrong sample name, I must have made a silly mistake while subsetting. Also thank you for the detailed code, however I do have some follow up questions. When I do try this on my dataset it is not selecting just top5 but there tend to be many weird responses such as TRUE and FALSE at the same time for the same genera for one subject. The only change I made was replacing FALSE with others in the genus column if any. This is what I did:

Merge by ParticipantID

merged = merge_samples(PSf, "ParticipantID")

merged # taxa are columns

Calculate relative abundance

ps.rb = transform_sample_counts(PSf, function(x) {x/sum(x)} ) merged.rb = transform_sample_counts(merged, function(x) {x/sum(x)} )

Retrieve top5 taxa names per ParticipantID

mat = otu_table(merged.rb) top5 = list() for(i in 1:nrow(mat)) {

Sort by relative abundance and select top 5

top5[[rownames(mat)[i]]] = as.data.frame(t(mat[i,])) %>% arrange(desc(.[1])) %>% rownames %>% head(5) }

Print top5 list

top5

Build ps.rb data.frame

df = psmelt(ps.rb)

Label if taxa is top5 or not

df$Top5 = FALSE df$Top5[df$OTU %in% unique(unlist(top5))] = TRUE

x <- arrange(df, Sample) %>% select(c(c(1:11,61,63), ncol(df))) %>% head(10)

if false then replace genus as others

x$Genus[x$Top5 == "FALSE"] <- "Others"

try to subset per subject and observe

x1 <- subset(x, x$Subject=="10a") unique(x1$Genus) ##this number is 16 and remains same even if I change subject number in the above line

I think this is happening as df$Top5 is distributing to all except just grouped by per subject. This results in the following type of plot instead of just the 5 for that subject which was select in top5. image

My sincere apologies, but would you be able to help me solve this? Or point to me if I have done something wrong here? I am unable to figure it out. I will be happy to share the complete dataset with you via email if you would prefer.

Thank you very much for all your help and time.

ycl6 commented 5 months ago

Hi @pdhrati02 You may send me the full dataset if you are comfortable with it. My email is available if you follow the link on my GitHub profile.

Regarding the plot output, the top5 object in my code is a list and I use unique(unlist(top5)) to combined the top taxa from all subjects together, where some taxa are top5 in one subject, some may be top5 in multiple subjects. If you wish to select the top5 taxa from one specific subject, you can do top5[["010a"]] to limit the taxa to that from 010a for example.

pdhrati02 commented 5 months ago

Hi @ycl6 , yes you are right. The issue is that some taxa are top5 in one subject, some may be top5 in multiple subjects which is resulting in about 16 taxa now being part of the top5. Due to this the trend part is not easy to visualise in the figure. I have now sent you a detailed email. Thank you once again for your help.

ycl6 commented 5 months ago

Hi @pdhrati02

Here are some plots I came up with your data using the ggalluvial package.

As we found from my previous example, we can identify the top5 taxa from each of your sample. But how do you then use this data to create the plots? I suppose you need to define what you want to see in the final plot more clearly.

In the first example, we still combine all the top5 genera from all the subjects (I limit to 5 in my example), then use facet_wrap to create subplots for each subject but still tracks the combined genera.

In the second example, I am showing the top5 from one subject (#6), you can either (1) track their abundance in all subjects, or (2) show the top5 of that subject alone. You can then use the same code to loop through all your subjects to combine and build the full figure.

library(readxl)
library(tidyverse)
library(phyloseq)
library(ggalluvial)

# Load data
sam = "pdhrati02_meta.xlsx"
sam = as.data.frame(read_xlsx(sam))
rownames(sam) = sam$SampleID
sam$SampleID = factor(sam$SampleID, levels = gtools::mixedsort(sam$SampleID))
sam$SubjectID = factor(sam$SubjectID, levels = gtools::mixedsort(unique(sam$SubjectID)))
sam$Sampledetails = factor(sam$Sampledetails, levels = gtools::mixedsort(unique(sam$Sampledetails)))
dim(sam)
#> [1] 191   4

otu = "pdhrati02_otu.csv"
otu = read.csv(otu, header = TRUE, row.names = 1)
otu = otu[rownames(sam),] # fix different sample order
dim(otu)
#> [1]  191 1560

tax = "pdhrati02_tax.csv"
tax = read.csv(tax, header = TRUE, row.names = 1)
dim(tax)
#> [1] 1560    7

# Check names
identical(rownames(tax), colnames(otu))
#> [1] TRUE
identical(rownames(sam), rownames(otu))
#> [1] TRUE

# Build ps
ps = phyloseq(otu_table(otu, taxa_are_rows = FALSE),
              tax_table(as.matrix(tax)), # requires matrix
              sample_data(sam))
ps # taxa are columns
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 1560 taxa and 191 samples ]
#> sample_data() Sample Data:       [ 191 samples by 4 sample variables ]
#> tax_table()   Taxonomy Table:    [ 1560 taxa by 7 taxonomic ranks ]

# Rename taxa names for ease of reading
taxa_names(ps) = sprintf('ASV%0.4d', 1:length(taxa_names(ps)))

# Limit data to 5 samples for this example
ps = subset_samples(ps, SubjectID %in% c("6","10a","10b","13","14","15"))

# To plot 'alluvium = Genus' later
# Merges ASVs that have the same taxonomy rank (Genus)
genus = tax_glom(ps, taxrank = "Genus")

# Merge by SubjectID
merged = merge_samples(genus, "SubjectID")
#> Warning in asMethod(object): NAs introduced by coercion
merged # taxa are columns
#> phyloseq-class experiment-level object
#> otu_table()   OTU Table:         [ 201 taxa and 6 samples ]
#> sample_data() Sample Data:       [ 6 samples by 4 sample variables ]
#> tax_table()   Taxonomy Table:    [ 201 taxa by 7 taxonomic ranks ]

# Calculate relative abundance
genus.rb = transform_sample_counts(genus, function(x) {x/sum(x)} )
merged.rb = transform_sample_counts(merged, function(x) {x/sum(x)} )

# Retrieve top5 taxa names per SubjectID
mat = otu_table(merged.rb)
top5 = list()
for(i in 1:nrow(mat)) {
        # Sort by relative abundance and select top 5
        top5[[rownames(mat)[i]]] = as.data.frame(t(mat[i,])) %>%
                arrange(desc(.[1])) %>% rownames %>% head(5)
}

length(top5)
#> [1] 6

head(top5)
#> $`6`
#> [1] "ASV0004" "ASV0001" "ASV0029" "ASV0008" "ASV0026"
#> 
#> $`10a`
#> [1] "ASV0001" "ASV0007" "ASV0004" "ASV0013" "ASV0046"
#> 
#> $`10b`
#> [1] "ASV0003" "ASV0001" "ASV0007" "ASV0013" "ASV0032"
#> 
#> $`13`
#> [1] "ASV0001" "ASV0003" "ASV0004" "ASV0013" "ASV0032"
#> 
#> $`14`
#> [1] "ASV0004" "ASV0001" "ASV0008" "ASV0023" "ASV0003"
#> 
#> $`15`
#> [1] "ASV0001" "ASV0060" "ASV0004" "ASV0009" "ASV0032"

# Build genus.rb data.frame
df = psmelt(genus.rb)

# Example 1: Combine top5 from all subjects
df$Taxa1 = df$Genus
df$Taxa1[!df$OTU %in% unique(unlist(top5))] = "Others"

alluvia_data1 = df %>% group_by(SampleID, SubjectID, Sampledetails, Taxa1) %>%
        summarise(Abundance = sum(Abundance)) %>% as.data.frame
#> `summarise()` has grouped output by 'SampleID', 'SubjectID', 'Sampledetails'.
#> You can override using the `.groups` argument.

ggplot(data = alluvia_data1,
       aes(x = SampleID, y = Abundance, alluvium = Taxa1)) +
        geom_alluvium(aes(fill = Taxa1, color = Taxa1), alpha = 0.5) +
        facet_wrap(~ SubjectID, scales = "free_x") +
        scale_fill_manual("Genus", values = pals::cols25()) +
        scale_color_manual("Genus", values = pals::cols25()) +
        ggtitle("Top5 from all subjects")


# Example 2: Show top5 from subject '6'
df$Taxa2 = df$Genus
df$Taxa2[!df$OTU %in% top5[['6']]] = "Others"

alluvia_data2 = df %>% group_by(SampleID, SubjectID, Sampledetails, Taxa2) %>%
        summarise(Abundance = sum(Abundance)) %>% as.data.frame
#> `summarise()` has grouped output by 'SampleID', 'SubjectID', 'Sampledetails'.
#> You can override using the `.groups` argument.

ggplot(data = alluvia_data2,
       aes(x = SampleID, y = Abundance, alluvium = Taxa2)) +
        geom_alluvium(aes(fill = Taxa2, color = Taxa2), alpha = 0.5) +
        facet_wrap(~ SubjectID, scales = "free_x") +
        scale_fill_manual("Genus", values = pals::cols25()) +
        scale_color_manual("Genus", values = pals::cols25()) +
        ggtitle("Top5 from Subject '6'")


ggplot(data = alluvia_data2[alluvia_data2$SubjectID == "6",],
       aes(x = SampleID, y = Abundance, alluvium = Taxa2)) +
        geom_alluvium(aes(fill = Taxa2, color = Taxa2), alpha = 0.5) +
        facet_wrap(~ SubjectID, scales = "free_x") +
        scale_fill_manual("Genus", values = pals::cols25()) +
        scale_color_manual("Genus", values = pals::cols25()) +
        ggtitle("Top5 from Subject '6'")

Created on 2024-03-24 with reprex v2.1.0

pdhrati02 commented 5 months ago

Hi @ycl6, Thank you very much really for helping me with this I do understand better the steps you have used and the logic behind it. I am looking for something like the last figure in your example, and yes I will incorporate it in a loop and make it for all subjects. I just have one question: When we plot the per subject top5 and name rest as others, is it right to say that for example for the above plot of subject 6, top 5 make up for about 80% of the relative abundance but the "others" is 10-20% over time. This is others for that subject right? Thank you

ycl6 commented 5 months ago

Hi @pdhrati02 The "Top N" approache only give you the first N number of taxa that are most abundant (after the list of taxa is ordered by abundance). They can account for more than 90% of your data is dominated by the chosen taxa, or it can be way less than 80% if your data has a very diverse bacterial composition and is made up of lots of different taxa. You will need a different approach if you opted to identify most abundant taxa by a percenatge threshold.

pdhrati02 commented 5 months ago

Hi @ycl6 , thank you very much for your time and for explaining this and all the other questions too.

syaliny commented 4 months ago

Hi @ycl6 , i am doing something similar but i'd like to have this plot on the whole dataset rather than isolating the top 5 most abundant groups by a certain rank and as a barplot. I am having issues trying to scale my y axis in percent format within ggalluvial. I have calculated the relative abundance of phyla based on forest compartments. Would you be able to help me please? Code below which i have adapted from https://rpubs.com/lgschaerer/1006964 for organising the physeq object and calculating the relative abundance

phy.all <- ps_aftercontam_removal_noNTC_root_rar_p %>% tax_glom(taxrank = "Phylum") %>%
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance psmelt() %>% # Melt to long format filter(Abundance != 0) %>% mutate( Kingdom = as.character(Kingdom), Phylum = as.character(Phylum)) %>% select(Forest.comp.sort,Phylum, Abundance) %>% #choose variables to work with group_by(Forest.comp.sort) %>% #group by variables used to plot NOT taxonomic level mutate(totalSum = sum(Abundance)) %>% #calculate total abundance of each Phylum ungroup() %>% #remove grouping variables group_by(Forest.comp.sort, Phylum) %>% #group by same variables PLUS taxonomic level summarise(
Abundance = sum(Abundance), #sum abundance in each phylum for each group totalSum, RelAb = Abundance/totalSum) %>% #calculate relative abundance unique()

head(phy.all) max(phy.all$RelAb) #0.8221446

Plot

ggplot(phy.all, aes(x = Forest.comp.sort, stratum = Phylum, alluvium = Phylum, fill = Phylum, label = Phylum))+ scale_fill_manual(values = c("red", "yellow", "green3","#A50026","#E34D34","#F99858","#FDD081","#EAECCC","#B7DDED","#83B8D7","#5385BC","#364B9A","#762A83","#9970AB","#C2A5CF","#ACD39E","#5AAE61","#1B7837","aquamarine","cyan4","cyan2","darkslategrey"))+ geom_flow(stat = "alluvium", lode.guidance = "frontback", color = "darkgrey")+ geom_stratum()

alluvial_plot_png

If i add the scale_y_continuos and set the limits like this , i get the y axis as percent format but i loose the phylum colors

Plot

ggplot(phy.all, aes(x = Forest.comp.sort, stratum = Phylum, alluvium = Phylum, fill = Phylum, label = Phylum))+ scale_fill_manual(values = c("red", "yellow", "green3","#A50026","#E34D34","#F99858","#FDD081","#EAECCC","#B7DDED","#83B8D7","#5385BC","#364B9A","#762A83","#9970AB","#C2A5CF","#ACD39E","#5AAE61","#1B7837","aquamarine","cyan4","cyan2","darkslategrey"))+ geom_flow(stat = "alluvium", lode.guidance = "frontback", color = "darkgrey")+ geom_stratum()+ scale_y_continuous(limits=c(0,1), labels = scales::percent)

alluvial_plot_2_png