Importing atypical data

Hi @lauramason326

Below I am using the demo dataset GlobalPatterns to first construct a data.frame similar to your data structure. You can adapt the rest of the code to your real data.

library(phyloseq)
library(tidyverse)
library(stringr) # str_pad

data(GlobalPatterns)
df <- GlobalPatterns %>% psmelt()
# Think of OTU as the sequence info in your data
df <- df[,c("Sample","OTU","Abundance","Kingdom","Phylum","Class","Order","Family","Genus","Species")]

> head(df, 10)
         Sample    OTU Abundance  Kingdom         Phylum               Class           Order          Family          Genus                   Species
406582   AQC4cm 549656   1177685 Bacteria  Cyanobacteria         Chloroplast   Stramenopiles            <NA>           <NA>                      <NA>
241435 LMEpi24M 279599    914209 Bacteria  Cyanobacteria    Nostocophycideae      Nostocales     Nostocaceae Dolichospermum                      <NA>
406580   AQC7cm 549656    711043 Bacteria  Cyanobacteria         Chloroplast   Stramenopiles            <NA>           <NA>                      <NA>
406574   AQC1cm 549656    554198 Bacteria  Cyanobacteria         Chloroplast   Stramenopiles            <NA>           <NA>                      <NA>
329873  M31Tong 360229    540850 Bacteria Proteobacteria  Betaproteobacteria    Neisseriales   Neisseriaceae      Neisseria                      <NA>
300794  M11Fcsw 331820    452219 Bacteria  Bacteroidetes         Bacteroidia   Bacteroidales  Bacteroidaceae    Bacteroides                      <NA>
494797  M31Tong  94166    396201 Bacteria Proteobacteria Gammaproteobacteria  Pasteurellales Pasteurellaceae    Haemophilus Haemophilusparainfluenzae
300772  M31Fcsw 331820    354695 Bacteria  Bacteroidetes         Bacteroidia   Bacteroidales  Bacteroidaceae    Bacteroides                      <NA>
298689 SLEpi20M 329744    323914 Bacteria Actinobacteria      Actinobacteria Actinomycetales          ACK-M1           <NA>                      <NA>
114279     TS29 189047    251215 Bacteria     Firmicutes          Clostridia   Clostridiales Ruminococcaceae           <NA>                      <NA>

Below I convert Sample and OTU to factor, rename OTU ID to ASV_xxxxx and sort the data.frame.

df$Sample <- as.factor(df$Sample)
df$OTU <- as.factor(df$OTU)
levels(df$OTU) <- paste0("ASV_", str_pad(1:nlevels(df$OTU), nchar(nlevels(df$OTU)), pad = "0"))
df <- df[order(df$Sample, df$OTU),]

> head(df, 10)
    Sample       OTU Abundance  Kingdom         Phylum               Class             Order            Family             Genus Species
11  AQC1cm ASV_00001         0 Bacteria Actinobacteria      Actinobacteria   Actinomycetales Microbacteriaceae          Cryocola    <NA>
36  AQC1cm ASV_00002         0 Bacteria Proteobacteria Alphaproteobacteria  Sphingomonadales Sphingomonadaceae              <NA>    <NA>
65  AQC1cm ASV_00003         0 Bacteria Proteobacteria Deltaproteobacteria Desulfobacterales  Desulfobulbaceae              <NA>    <NA>
88  AQC1cm ASV_00004         1 Bacteria Proteobacteria Gammaproteobacteria     Thiotrichales              <NA>              <NA>    <NA>
128 AQC1cm ASV_00005         0 Bacteria Proteobacteria Gammaproteobacteria     Thiotrichales    Thiotrichaceae         Thiothrix    <NA>
144 AQC1cm ASV_00006         0 Bacteria     Firmicutes          Clostridia     Clostridiales   Lachnospiraceae      Ruminococcus    <NA>
165 AQC1cm ASV_00007         1 Bacteria Proteobacteria  Betaproteobacteria     Rhodocyclales    Rhodocyclaceae Methyloversatilis    <NA>
198 AQC1cm ASV_00008         1 Bacteria  Bacteroidetes       Flavobacteria              <NA>              <NA>              <NA>    <NA>
230 AQC1cm ASV_00009         1 Bacteria Proteobacteria  Betaproteobacteria   Burkholderiales    Comamonadaceae              <NA>    <NA>
246 AQC1cm ASV_00010        11 Bacteria Proteobacteria  Betaproteobacteria   Burkholderiales  Burkholderiaceae       Cupriavidus    <NA>

They we build the required data for phyloseq. In sam, you can add other sample variables you have to the data.frame at this point.

# sample_data
sam <- data.frame(SampleID = levels(df$Sample), row.names = levels(df$Sample))

# tax_table
tax <- unique(df[,c("OTU","Kingdom","Phylum","Class","Order","Family","Genus","Species")])
rownames(tax) <- tax$OTU
tax <- as.matrix(tax[,-1]) # remove OTU column

# otu_table
otu <- df %>% select(OTU, Sample, Abundance) %>% spread(Sample, Abundance)
rownames(otu) <- otu$OTU
otu <- otu[,-1] # remove OTU column

# Check rownames and colnames matched
all(rownames(otu) == rownames(tax_table(tax))) # TRUE
all(colnames(otu) == rownames(sample_data(sam))) # TRUE

ps <- phyloseq(otu_table(otu, taxa_are_rows = TRUE), tax_table(tax), sample_data(sam))

> head(sam)
       SampleID
AQC1cm   AQC1cm
AQC4cm   AQC4cm
AQC7cm   AQC7cm
CC1         CC1
CL3         CL3
Even1     Even1

> head(tax)
          Kingdom    Phylum           Class                 Order               Family              Genus          Species
ASV_00001 "Bacteria" "Actinobacteria" "Actinobacteria"      "Actinomycetales"   "Microbacteriaceae" "Cryocola"     NA     
ASV_00002 "Bacteria" "Proteobacteria" "Alphaproteobacteria" "Sphingomonadales"  "Sphingomonadaceae" NA             NA     
ASV_00003 "Bacteria" "Proteobacteria" "Deltaproteobacteria" "Desulfobacterales" "Desulfobulbaceae"  NA             NA     
ASV_00004 "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Thiotrichales"     NA                  NA             NA     
ASV_00005 "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Thiotrichales"     "Thiotrichaceae"    "Thiothrix"    NA     
ASV_00006 "Bacteria" "Firmicutes"     "Clostridia"          "Clostridiales"     "Lachnospiraceae"   "Ruminococcus" NA     

> head(otu)
          AQC1cm AQC4cm AQC7cm CC1 CL3 Even1 Even2 Even3 F21Plmr LMEpi24M M11Fcsw M11Plmr M11Tong M31Fcsw M31Plmr M31Tong NP2 NP3
ASV_00001      0      0      0   0   0     0     0     0       0        0       0       0       0       0       0       0   0   0
ASV_00002      0      0      0   0   0     0     0     0       0        0       0       0       0       0       0       0   0   0
ASV_00003      0      3      1   0   0     0     0     1       0        0       0       0       0       0       0       0   0   4
ASV_00004      1      7      1 136   5     0     1     0       0        0       0       0       0       0       0       0   0   0
ASV_00005      0      0      0   0   0     0     1     0       0        0       0       2       0       0       0       0   0   0
ASV_00006      0      0      0   0   0     0     2     1       0        0       0       0       0      58       0       0   0   0
          NP5 SLEpi20M SV1 TRRsed1 TRRsed2 TRRsed3 TS28 TS29
ASV_00001   0        0   7       0       0       0    0    0
ASV_00002   0        0   7       0       0       0    0    0
ASV_00003   0        0   0       6      71      44    0    0
ASV_00004   0        0   0       2       2       0    0    0
ASV_00005   0        0   0       0       0       0    0    0
ASV_00006   0        0   0       0       0       0    1    0

> ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 19216 taxa and 26 samples ]
sample_data() Sample Data:       [ 26 samples by 1 sample variables ]
tax_table()   Taxonomy Table:    [ 19216 taxa by 7 taxonomic ranks ]

joey711 / phyloseq

Importing atypical data #1592