error in reassigning levels after merge_samples

I believe that my issue is related to threads elsewhere, but I am unable to resolve my problem independently, so I am seeking additional guidance. Please let me know if I have provided enough information about my issue.

My goal is to merge samples of the same level in one factor while retaining the integrity of levels on other factors so that I can do differential analyses on those factors.

In my attempts to do this using merge_samples(), I think I have created some messes. In short, I appear to be having a problem with levels of factors in my “sample_data”, similar to (https://github.com/joey711/phyloseq/issues/243).

To begin: (1) Null Values in sample_data?


> #Add sample data to the dataset using merge
>   bmsd <- import_qiime_sample_data(map_file)
>   class(bmsd)
[1] "sample_data"
attr(,"package")
[1] "phyloseq"
>   #Check the dimensions of your mapping file
>   dim(bmsd)
[1] 104  22

When I view the first few lines of the import, things look OK:

>head(sample_data(Piglet_Data), 3)
Sample Data:        [3 samples by 22 sample variables]:
X.SampleID BarcodeSequence LinkerPrimerSequence ReverseBarcode
S13.0695   S13.0695            AGTA AGAGTTTGATCCTGGCTCAG          GCAGT
S13.1013   S13.1013            GTCT AGAGTTTGATCCTGGCTCAG          AGTGA
S13.1051   S13.1051            GTCT AGAGTTTGATCCTGGCTCAG        AGCATGT
             ReversePrimer PigletID  Date LocationCode        Location
S13.0695 ATTACCGCGGCTGCTGG     P407 41517    AMicroDM1 small.intestine
S13.1013 ATTACCGCGGCTGCTGG     P409 41525    AMicroIM1 small.intestine
S13.1051 ATTACCGCGGCTGCTGG     P409 41525    AMicroCL4 large.intestine
                 Location.Depth Site      Sub       Sub.Depth  Depth Age AgeRange PCR
S13.0695 small.intestine.mucosa    1 duodenum duodenum.mucosa mucosa   2      1.2   1
S13.1013 small.intestine.mucosa    7    ileum    ileum.mucosa mucosa  10      9.1   0
S13.1051  large.intestine.lumen   13    colon     colon.lumen  lumen  10      9.1   4
         RepeatPCR RepeatPCRValue CONCngul X10ngCONC DILUTED
S13.0695       yes              1     1.33      7.52      no
S13.1013       yes           null     1.28      7.81      no
S13.1051        no           null     8.28      1.21     yes

However, when I specifically look for the levels, some columns do not appear to have levels assigned $Date, $Site, $Age, $AgeRange:

#Are levels on sample variables correct?
>   levels(sample_data(Piglet_Data)$X.SampleID)
  [1] "S13.0695" "S13.0697" "S13.0699" "S13.0701" "S13.0703" "S13.0705" "S13.0707"
  [8] "S13.0709" "S13.0711" "S13.0713" "S13.0715" "S13.0717" "S13.0719" "S13.0721"
 [15] "S13.0723" "S13.0725" "S13.0727" "S13.0729" "S13.0731" "S13.0733" "S13.0735"
 [22] "S13.0737" "S13.0739" "S13.0741" "S13.0743" "S13.0745" "S13.1001" "S13.1003"
 [29] "S13.1005" "S13.1007" "S13.1009" "S13.1011" "S13.1013" "S13.1015" "S13.1017"
 [36] "S13.1019" "S13.1021" "S13.1023" "S13.1025" "S13.1027" "S13.1029" "S13.1031"
 [43] "S13.1033" "S13.1035" "S13.1037" "S13.1039" "S13.1041" "S13.1043" "S13.1045"
 [50] "S13.1047" "S13.1049" "S13.1051" "S13.1670" "S13.1672" "S13.1674" "S13.1676"
 [57] "S13.1678" "S13.1680" "S13.1682" "S13.1684" "S13.1686" "S13.1688" "S13.1690"
 [64] "S13.1692" "S13.1694" "S13.1696" "S13.1698" "S13.1700" "S13.1702" "S13.1704"
 [71] "S13.1706" "S13.1708" "S13.1710" "S13.1712" "S13.1714" "S13.1716" "S13.1718"
 [78] "S13.1720" "S13.2004" "S13.2006" "S13.2008" "S13.2010" "S13.2012" "S13.2014"
 [85] "S13.2016" "S13.2018" "S13.2020" "S13.2022" "S13.2024" "S13.2026" "S13.2028"
 [92] "S13.2030" "S13.2032" "S13.2034" "S13.2036" "S13.2038" "S13.2040" "S13.2042"
 [99] "S13.2044" "S13.2046" "S13.2048" "S13.2050" "S13.2052" "S13.2054"
>   levels(sample_data(Piglet_Data)$BarcodeSequence)
[1] "AGTA"  "CTGAT" "GTACG" "GTCT"  "TCAAG" "TCAT" 
>   levels(sample_data(Piglet_Data)$LinkerPrimerSequence)
[1] "AGAGTTTGATCCTGGCTCAG"
>   levels(sample_data(Piglet_Data)$ReverseBarcode)
 [1] "AAGC"     "ACGAGTGC" "AGCATGT"  "AGTGA"    "CACTGT"   "CAGAGCT"  "CATGCG"  
 [8] "CTAC"     "GACCACTT" "GACTGT"   "GCAGT"    "GTACATC"  "GTAGTG"   "GTCAGCAT"
[15] "TACC"     "TAGCT"    "TAGTCACG" "TATCGTG" 
>   levels(sample_data(Piglet_Data)$ReversePrimer)
[1] "ATTACCGCGGCTGCTGG"
>   levels(sample_data(Piglet_Data)$PigletID)
[1] "P407" "P409" "P413" "P415"
>   levels(sample_data(Piglet_Data)$Date)
NULL
>   levels(sample_data(Piglet_Data)$LocationCode)
 [1] "AMicroCL1" "AMicroCL2" "AMicroCL3" "AMicroCL4" "AMicroCM1" "AMicroCM2"
 [7] "AMicroCM3" "AMicroCM4" "AMicroDL1" "AMicroDL2" "AMicroDL3" "AMicroDM1"
[13] "AMicroDM2" "AMicroDM3" "AMicroIL1" "AMicroIL2" "AMicroIL3" "AMicroIM1"
[19] "AMicroIM2" "AMicroIM3" "AMicroJL1" "AMicroJL2" "AMicroJL3" "AMicroJM1"
[25] "AMicroJM2" "AMicroJM3"
>   levels(sample_data(Piglet_Data)$Location)
[1] "large.intestine" "small.intestine"
>   levels(sample_data(Piglet_Data)$Location.Depth)
[1] "large.intestine.lumen"  "large.intestine.mucosa" "small.intestine.lumen" 
[4] "small.intestine.mucosa"
>   levels(sample_data(Piglet_Data)$Site)
NULL
>   levels(sample_data(Piglet_Data)$Sub)
[1] "colon"    "duodenum" "ileum"    "jejunum" 
>   levels(sample_data(Piglet_Data)$Sub.Depth)
[1] "colon.lumen"     "colon.mucosa"    "duodenum.lumen"  "duodenum.mucosa"
[5] "ileum.lumen"     "ileum.mucosa"    "jejunum.lumen"   "jejunum.mucosa" 
>   levels(sample_data(Piglet_Data)$Depth)
[1] "lumen"  "mucosa"
>   levels(sample_data(Piglet_Data)$Age)
NULL
>   levels(sample_data(Piglet_Data)$AgeRange)
NULL''''

However, when I use factor to look at the values in $Age, levels are listed.

> head(factor(sample_data(Piglet_Data)$Age))
[1] 2  10 10 1  1  1 
Levels: 1 2 9 10

Why would they not also be in output for levels(sample_data(Piglet_Data)$Age)?

(2) I assume that the NULL values in the $Site factor are why this command is failing when I try to merge by $Site (since there are no levels being detected?):

> merged = merge_samples(Piglet_Data_RF, "Site")
{Error in validObject(.Object) : invalid class “phyloseq” object: 
 Component sample names do not match.
 Try sample_names()

(3) An issue similar to (https://github.com/joey711/phyloseq/issues/243), but my lack of experience is such that I am unable to take the solution from (https://github.com/joey711/phyloseq/issues/243) and apply it to my own work at the moment. I followed http://joey711.github.io/phyloseq-demo/Restroom-Biogeography to merge these data by a factor that appear to have levels in sample_data detected ($LocationCode). merged = merge_samples(Piglet_Data_RF, "LocationCode") But when I look at the first few lines of the sample_data, I noticed there are several changes to the variables:

head(sample_data(merged), 3)
Sample Data:        [3 samples by 22 sample variables]:
          X.SampleID BarcodeSequence LinkerPrimerSequence ReverseBarcode
AMicroCL1   60.50000        4.500000                    1        6.50000
AMicroCL2   65.33333        4.666667                    1       10.33333
AMicroCL3   62.25000        3.750000                    1        6.25000
          ReversePrimer PigletID     Date LocationCode Location Location.Depth Site
AMicroCL1             1 2.500000 41563.00            1        1              1   10
AMicroCL2             1 2.666667 41575.67            2        1              1   11
AMicroCL3             1 2.500000 41563.00            3        1              1   12
          Sub Sub.Depth Depth Age AgeRange      PCR RepeatPCR RepeatPCRValue CONCngul
AMicroCL1   1         1     1 5.5 5.150000 3.250000      1.25           5.50 8.395000
AMicroCL2   1         1     1 4.0 3.833333 4.333333      1.00           6.00 8.743333
AMicroCL3   1         1     1 5.5 5.150000 3.750000      1.25           5.25 6.630000
          X10ngCONC  DILUTED
AMicroCL1  1.372500 1.250000
AMicroCL2  1.173333 1.666667
AMicroCL3  2.870000 1.500000

The unmerged data look like:


> head(sample_data(Piglet_Data_RF), 3)

Sample Data:        [3 samples by 22 sample variables]:
         X.SampleID BarcodeSequence LinkerPrimerSequence ReverseBarcode
S13.0695   S13.0695            AGTA AGAGTTTGATCCTGGCTCAG          GCAGT
S13.1013   S13.1013            GTCT AGAGTTTGATCCTGGCTCAG          AGTGA
S13.1051   S13.1051            GTCT AGAGTTTGATCCTGGCTCAG        AGCATGT
             ReversePrimer PigletID  Date LocationCode        Location
S13.0695 ATTACCGCGGCTGCTGG     P407 41517    AMicroDM1 small.intestine
S13.1013 ATTACCGCGGCTGCTGG     P409 41525    AMicroIM1 small.intestine
S13.1051 ATTACCGCGGCTGCTGG     P409 41525    AMicroCL4 large.intestine
                 Location.Depth Site      Sub       Sub.Depth  Depth Age AgeRange PCR
S13.0695 small.intestine.mucosa    1 duodenum duodenum.mucosa mucosa   2      1.2   1
S13.1013 small.intestine.mucosa    7    ileum    ileum.mucosa mucosa  10      9.1   0
S13.1051  large.intestine.lumen   13    colon     colon.lumen  lumen  10      9.1   4
         RepeatPCR RepeatPCRValue CONCngul X10ngCONC DILUTED
S13.0695       yes              1     1.33      7.52      no
S13.1013       yes           null     1.28      7.81      no
S13.1051        no           null     8.28      1.21     yes

I am able to successfully assign the levels on the factor from the original data to the new data, but when I try to do that for the other factors that got changed, I created a new problem:

head(sample_data(merged), 3)
Sample Data:        [3 samples by 22 sample variables]:

> sample_data(pigletSite)$Site <- levels(sample_data(Piglet_Data_RF)$Site)

> head(sample_data(pigletSite), 3)
Sample Data:        [3 samples by 21 sample variables]:

It got rid of the factor that I tried to relabel (21 variables now instead of 22).

(4) The final issue that I have is that while I am able to relabel a couple of factors, there are other factors that cannot be relabeled:

> sample_data(merged)$Sub <- levels(sample_data(Piglet_Data_RF)$Sub)
Error in `$<-.data.frame`(structure(x@.Data, names = x@names, row.names = x@row.names,  : 
  replacement has 4 rows, data has 26

Any insight or suggestions would be appreciated.

joey711 / phyloseq

error in reassigning levels after merge_samples #386