metagenlab / zAMP

zAMP is a bioinformatic pipeline designed for convenient, reproducible and scalable amplicon-based metagenomics
https://zamp.readthedocs.io/en/latest/
MIT License
8 stars 5 forks source link

Generate a sequence length plot with assignment for QC purpose #5

Open valscherz opened 5 years ago

valscherz commented 5 years ago

The length filtering is arbitrary: it would be interesting to have a plot showing the length disperstion with the taxonomic assignement. It would allow for instance to see if we could exlcue more sequence based on their length. I had made one previously (draft of code hereunder) but not adapted to the current output of the pipeline.

Dada2 sequences

Filter to have the sequences of all unassigned features to blast them.

Blast commands in the ipynb in the same folder


## Obtain  summary by otu of the frequencey

### read the table created with the barplot function where we kept all unassigned features of all samples
all_unassigned_table <- read.table("r_figures/quantitative_barplots/Absolute/Absolute/Table/Kingdom_Unclassified_Absolute>20_study_name_noyes_CURML_sample_source_abundancy_table.tsv", sep = "\t", header = T)

### Use summarise to have only once each OTU and the sum of the abundance for each
summarized_unassiged_table <- all_unassigned_table %>%
  group_by(OTU) %>%
  summarise(sum_abudance = sum(Abundance))

### Filter to keep only the ones with  more than 20 reads, to reduce the number of sequences to blast
filtered_unassigned_table <- summarized_unassiged_table %>%
  filter(sum_abudance >= 20)

### Read the fasta file of all sequences denoised by dada
all_fasta_seq <- read_fasta(file_path = "1_dada_denoised/dna-sequences.fasta")

### Transform the rownames as first column, needed to join with deplyr
all_fasta_seq <- as.data.frame(all_fasta_seq)
all_fasta_seq <- rownames_to_column(all_fasta_seq, var = "OTU")

### Keep only the sequences in the filtered table of all unassigned sequences
filtered_seq <- semi_join(all_fasta_seq, filtered_unassigned_table, by = c("OTU"))

### Write in into a fasta format table
df.fasta = dataframe2fas(filtered_seq, file="filtered_seq.fasta")

Create a function to have the length of the sequences


## Rename the sequence column in the all_fasta_seq table
names(all_fasta_seq)[2]<-paste("sequences")

## Calculate the sequences length
sequence_length <- all_fasta_seq %>%
  mutate(length = nchar(as.character(all_fasta_seq$sequences)))

## Get a table with the overall abundance of all features et their assignment
all_sequences_counts <- physeq_df %>%
  group_by(OTU, Kingdom) %>%
  summarise(sum_abundance = sum(Abundance))

## Join the two just created table with all sequences and abundance
joined_sequences_tables <- full_join(all_sequences_counts, sequence_length, by = "OTU")

## Join by length and by kingdom retaining the sum of the abundance for plotting
joined_sequences_tables_length_sum <- joined_sequences_tables %>%
  group_by(length, Kingdom) %>%
  summarise(length_sum_abundance = sum(sum_abundance))

## Create a version without the 0
joined_sequences_no_zero <- joined_sequences_tables_length_sum %>%
  ungroup() %>%
  filter(length_sum_abundance > 0)
  ### Write it as tsv
  write.table(x = joined_sequences_no_zero, file = "length_of_sequences.tsv", append = F, quote = F, sep = "\t", row.names = F, col.names = TRUE)

## Plot the size by Kingdom
read_length_plot <- joined_sequences_tables_length_sum %>%
  ggplot(aes(x = length, y = length_sum_abundance, fill = Kingdom)) +
  geom_col(position = "stack", width = 1) 

##Save this plots 
ggsave(read_length_plot, filename = "amplicons_length_distribution.png", width = 10, height = 5)

## Zoom to lower < values for better visualization

read_length_plot_zoomed <- read_length_plot + coord_cartesian(
xlim = c(250, 450), ylim = c(0, 50000))

##Save this plots 
ggsave(read_length_plot_zoomed, filename = "amplicons_length_distribution_zoomed.png", width = 10, height = 5)

To compare sequences length and results, same analysis with vsearch join paired sequences

Load objects into R


## Function to load in R environnement the taxonomy table, the metadata table, the features counts table, the tree, the Shannon and the PCO objects

load_objects_fct <- function(taxonomy_class_table, replace_empty_tax = TRUE, features_counts_table , metadata_table, tree, shannon, pco) {

  ## Qiime2 .qza features counts table

    ### Load Qiime2 .qza counts table into R environment
    features_counts_table<-read_qza(features_counts_table)

    ### Shows the unique identifier of the file
    print("features counts table")
    print(features_counts_table$uuid)

    ### Print information about the object
    names(features_counts_table)

    ### Shows the unique identifier of the file
    print("features count table")
    print(features_counts_table$uuid)

    ### Show the first 5 samples and features
    print(features_counts_table$data[1:5,1:5]) 

    ### Save this table in global environnement
    features_counts_table <<- features_counts_table

}

## Run the function to load all needed object into R

load_objects_fct( features_counts_table =  "1b_vsearch_joining/dereplicate/dereplicated_table.qza")

## Create a phyloseq object from what has been loaded in the previous chunks

  ### Generate a function to create a phyloseq object
  create_phyloseq_fct <- function(physeq_name = "physeq", melted_df_name = "physeq_df", otu_table){

      ### Create the phyloseq object
      physeq_obj<-phyloseq(
      otu_table(otu_table$data, taxa_are_rows = T))

      ### Assign the object into the environnement 
      assign(value = physeq_obj, x = physeq_name, envir =  .GlobalEnv)

      ### Melt the physeq objet into a dataframe with one row per feature.id and per sample, needed later
      physeq_df <- psmelt(physeq_obj)

      ### Assign the object into the environnement 
      assign(value = physeq_df, x = melted_df_name, envir =  .GlobalEnv)

    }

  ### Use the function
  create_phyloseq_fct(physeq_name = "physeq_vsearch", melted_df_name = "physeq_df_vsearch", otu_table = features_counts_table)

Filter to have the sequences of all unassigned features


### Read the fasta file of all sequences denoised by dada
all_fasta_seq_vsearch <- read_fasta(file_path = "1b_vsearch_joining/dereplicate/dna-sequences.fasta")

### Transform the rownames as first column, needed to join with deplyr
all_fasta_seq_vsearch <- as.data.frame(all_fasta_seq_vsearch)
all_fasta_seq_vsearch <- rownames_to_column(all_fasta_seq_vsearch, var = "OTU")

## Rename the sequence column in the all_fasta_seq table
names(all_fasta_seq_vsearch)[2]<-paste("sequences")

## Separate the first column where the feature ID and the sample name are merged
all_fasta_seq_vsearch <- separate(all_fasta_seq_vsearch, sep = " ", col = OTU, into = "OTU")

Create a function to have the length of the sequences


## Calculate the sequences length
sequence_length_vsearch <- all_fasta_seq_vsearch %>%
  mutate(length = nchar(as.character(all_fasta_seq_vsearch$sequences)))

## Get a table with the overall abundance of all features et their assignment
all_sequences_counts_vsearch <- physeq_df_vsearch %>%
  group_by(OTU) %>%
  summarise(sum_abundance = sum(Abundance))

## Join the two just created table with all sequences and abundance
joined_sequences_tables_vsearch <- full_join(all_sequences_counts_vsearch, sequence_length_vsearch, by = "OTU")

### Write it as tsv
  write.table(x = joined_sequences_tables_vsearch, file = "length_of_sequences_vsearch.tsv", append = F, quote = F, sep = "\t", row.names = F, col.names = TRUE)

## Join by length and by kingdom retaining the sum of the abundance for plotting
joined_sequences_tables_length_sum_vsearch <- joined_sequences_tables_vsearch %>%
  group_by(length) %>%
  summarise(length_sum_abundance = sum(sum_abundance))

## Plot the size by Kingdom
read_length_plot <- joined_sequences_tables_length_sum_vsearch %>%
  ggplot(aes(x = length, y = length_sum_abundance)) +
  geom_col(position = "stack", width = 1) 

##Save this plots 
ggsave(read_length_plot, filename = "amplicons_length_distribution_vsearch.png", width = 10, height = 5)

## Zoom to lower < values for better visualization

read_length_plot_zoomed <- read_length_plot + coord_cartesian(
xlim = c(0, 600), ylim = c(0, 50000))

##Save this plots 
ggsave(read_length_plot_zoomed, filename = "amplicons_length_distribution_zoomed_vsearch.png", width = 10, height = 5)