BiocParallel error during QCreport for maize genome

NastiaSkuba commented 6 months ago

Hi,

I am working with maize Ribo-seq data.

All genome files I have got from maizegdb: https://download.maizegdb.org/Zm-B73-REFERENCE-GRAMENE-4.0/

.gff3 file was transformed to .gtf using AGAT. (With original .gff3 file eventually I got the same results, but additionally some errors, like: Warning: gff-version directive indicates version is 3 , not 3C)

I aligned trimmed with Trim_galore! reads to the genome using ORFik:

library(ORFik)

where_to_save_config <- "../ORFik_config.csv"
fastq.dir <- file.path("../data_trimmed_2")
bam.dir <- file.path("../processed_data")
reference.dir <- file.path("../genome")
exp.dir <- file.path("../experiments")
config.save(where_to_save_config,  fastq.dir, bam.dir, reference.dir, exp.dir)

STAR.install(folder="../STAR")

get_silva_rRNA("../genome/Zea_mays_GRAMENE/")

organism <- "Zea mays"
genome <- "../genome/Zea_mays_GRAMENE/Zm-B73-REFERENCE-GRAMENE-4.0.fa"
gtf <- "../genome/Zea_mays_GRAMENE/Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2_mh.gtf"
transposons <- "../genome/Zea_mays_GRAMENE/transposons.fasta"
rRNA <- "../genome/Zea_mays_GRAMENE/rRNA_SSU_LSU_silva_138.1.fasta/SILVA_138.1_SSURef_NR99_tax_silva.fasta"
makeTxdbFromGenome(gtf, genome, organism = organism,  optimize = TRUE)

index <- STAR.index(c("gtf" = gtf, "genome" = genome, "contaminants" = transposons,
                      "rRNA" = rRNA),
                    output.dir = "../STAR_index/",
                    star.path = "../STAR/STAR-2.7.4a/bin/Linux_x86_64/STAR",
                    SAsparse = 2)

rna_alignment <- STAR.align.folder(input.dir = conf["fastq RNA-seq"],
                  output.dir = conf["bam RNA-seq"],
                  index.dir = index,
                  star.path = "../STAR/STAR-2.7.4a/bin/Linux_x86_64/STAR",
                  steps = "co-ge",
                  max.cpus = 16)

ribo_alignment <- STAR.align.folder(input.dir = conf["fastq Ribo-seq"],
                  output.dir = conf["bam Ribo-seq"],
                  index.dir = index,
                  star.path = "../STAR/STAR-2.7.4a/bin/Linux_x86_64/STAR",
                  steps = "co-ge",
                  max.cpus = 16)

Then I created corresponding experiments and trying to perform QCreport() and shiftFootprintsByExperiment()

txdb_file <- "../genome/Zea_mays_GRAMENE/Zm-B73-REFERENCE-GRAMENE-4.0_Zm00001d.2_mh.gtf.db"
fa <- "../genome/Zea_mays_GRAMENE/Zm-B73-REFERENCE-GRAMENE-4.0.fa"
create.experiment(exper = "maize_meiosis_RNA",
                  dir = paste0(conf["bam RNA-seq"], "/aligned/"),
                  saveDir = "../experiments/",
                  txdb = txdb_file,
                  fa = fa,
                  organism = "Zea mays",
                  pairedEndBam = FALSE,
                  stage = rep(c("LEP", "MII", "PACH", "PRE", "ZYG"), each=2),
                  rep = rep(c("2019", "2021"), times=5))

df_RNA <- read.experiment("maize_meiosis_RNA", in.dir = "../experiments")

create.experiment(exper = "maize_meiosis_Ribo",
                  dir = paste0(conf["bam Ribo-seq"], "/aligned/"),
                  saveDir = "../experiments/",
                  txdb = txdb_file,
                  fa = fa,
                  organism = "Zea mays",
                  pairedEndBam = FALSE,
                  stage = rep(c("LEP", "MII", "PACH", "PRE", "ZYG"), each=2),
                  rep = rep(c("2019", "2021"), times=5))

df_Ribo <- read.experiment("maize_meiosis_Ribo", in.dir = "../experiments")

QCreport(df_RNA)
QCreport(df_Ribo)

shiftFootprintsByExperiment(df_Ribo, accepted.lengths = c(27:31))

I am getting pdf files STATS_plot, cor_plot, and PCA_plot. The main problem is that afterwards I am getting a BiocParallel error, for example:

Error: BiocParallel errors
  10 remote errors, element index: 1, 2, 3, 4, 5, 6, ...
  0 unevaluated and other errors
  first remote error:
Error: BiocParallel errors
  1 remote errors, element index: 1
  2 unevaluated and other errors
  first remote error:
Error in covRleFromGR(x, weight = weight, ignore.strand = ignore.strand): Seqlengths of x contains NA values!

or

Error: BiocParallel errors
  1 remote errors, element index: 1
  9 unevaluated and other errors
  first remote error:
Error in covRleFromGR(x, weight = weight, ignore.strand = ignore.strand): Seqlengths of x contains NA values!

I already tried adding BPPARAM = SerialParam(), and reinstalling ORFik package, both did not help.

Command

seqinfo(loadTxdb(df_Ribo))
seqinfo(FaFile(df_Ribo@fafile))

returned

Seqinfo object with 108 sequences (2 circular) from an unspecified genome; no seqlengths:
  NA
Seqinfo object with 266 sequences from an unspecified genome:
  NA

	seqlengths	isCircular	genome
Mt	NA	TRUE	NA
B73V4_ctg10	NA	NA	NA
B73V4_ctg100	NA	NA	NA
B73V4_ctg102	NA	NA	NA
B73V4_ctg103	NA	NA	NA
...	...	...	...
Chr6	NA	NA	NA
Chr7	NA	NA	NA
Chr8	NA	NA	NA
Chr9	NA	NA	NA
Pt	NA	TRUE	NA

	seqlengths	isCircular	genome
Chr10	150982314	NA	NA
Chr1	307041717	NA	NA
Chr2	244442276	NA	NA
Chr3	235667834	NA	NA
Chr4	246994605	NA	NA
...	...	...	...
B73V4_ctg253	82972	NA	NA
B73V4_ctg254	69595	NA	NA
B73V4_ctg255	46945	NA	NA
B73V4_ctg2729	124116	NA	NA
Pt	140447	NA	NA

sessionInfo()

R version 4.3.1 (2023-06-16)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Berlin
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] BiocParallel_1.36.0         GenomicFeatures_1.54.4      AnnotationDbi_1.64.1        fstcore_0.9.18             
 [5] ORFik_1.23.10               GenomicAlignments_1.38.2    Rsamtools_2.18.0            Biostrings_2.70.3          
 [9] XVector_0.42.0              SummarizedExperiment_1.32.0 Biobase_2.62.0              MatrixGenerics_1.14.0      
[13] matrixStats_1.3.0           GenomicRanges_1.54.1        GenomeInfoDb_1.38.8         IRanges_2.36.0             
[17] S4Vectors_0.40.2            BiocGenerics_0.48.1        

loaded via a namespace (and not attached):
 [1] DBI_1.2.2               bitops_1.0-7            gridExtra_2.3           biomaRt_2.58.2          rlang_1.1.3            
 [6] magrittr_2.0.3          compiler_4.3.1          RSQLite_2.3.6           systemfonts_1.0.6       png_0.1-8              
[11] vctrs_0.6.5             stringr_1.5.1           pkgconfig_2.0.3         crayon_1.5.2            fastmap_1.1.1          
[16] dbplyr_2.5.0            labeling_0.4.3          biomartr_1.0.8.9000     utf8_1.2.4              rmarkdown_2.26         
[21] tzdb_0.4.0              ragg_1.3.0              purrr_1.0.2             bit_4.0.5               xfun_0.43              
[26] zlibbioc_1.48.2         cachem_1.0.8            jsonlite_1.8.8          progress_1.2.3          blob_1.2.4             
[31] DelayedArray_0.28.0     parallel_4.3.1          prettyunits_1.2.0       R6_2.5.1                stringi_1.8.3          
[36] rtracklayer_1.62.0      Rcpp_1.0.12             knitr_1.46              R.utils_2.12.3          readr_2.1.5            
[41] Matrix_1.6-5            tidyselect_1.2.1        rstudioapi_0.16.0       abind_1.4-5             yaml_2.3.8             
[46] codetools_0.2-19        curl_5.2.1              lattice_0.21-8          tibble_3.2.1            withr_3.0.0            
[51] KEGGREST_1.42.0         evaluate_0.23           BiocFileCache_2.10.2    xml2_1.3.6              pillar_1.9.0           
[56] filelock_1.0.3          generics_0.1.3          RCurl_1.98-1.14         hms_1.1.3               ggplot2_3.5.0          
[61] munsell_0.5.1           scales_1.3.0            glue_1.7.0              tools_4.3.1             BiocIO_1.12.0          
[66] data.table_1.15.4       BSgenome_1.70.2         locfit_1.5-9.9          XML_3.99-0.16.1         cowplot_1.1.3          
[71] grid_4.3.1              colorspace_2.1-0        GenomeInfoDbData_1.2.11 restfulr_0.0.15         cli_3.6.2              
[76] rappdirs_0.3.3          textshaping_0.3.7       fst_0.9.8               fansi_1.0.6             S4Arrays_1.2.1         
[81] dplyr_1.1.4             gtable_0.3.4            R.methodsS3_1.8.2       DESeq2_1.42.1           digest_0.6.35          
[86] SparseArray_1.2.4       farver_2.1.1            rjson_0.2.21            memoise_2.0.1           htmltools_0.5.8.1      
[91] R.oo_1.26.0             lifecycle_1.0.4         httr_1.4.7              bit64_4.0.5

Roleren commented 6 months ago

Very impressive work here. I think you have a malformed txdb, can you run the ORFik::makeTxdbFromGenome function with optimize as true. Then use that txdb instead, should work. Let me know if it does not.

NastiaSkuba commented 6 months ago

Thank you for your fast response! I tried the original .gff3 file, .gtf files obtained with AGATand gffread with parameter optimize=TRUE Nothing worked, seqinfo(loadTxdb(df_Ribo)) always gives the same result, and the BiocParallel error is also the same.

NastiaSkuba commented 6 months ago

[ ] https://github.com/zhpn1024/ribotish/issues/34

I realized I already had had problems with this genome, see reference. The maize genome has empty lines. Is it possible to work with such a genome in ORFik?

Roleren commented 6 months ago

Could you describe how you run txdb step above for gtf and gff3. And print the seqinfo from each of the txdbs made, I'm quite sure it is there the error is in that it has undefined seqlengths.

NastiaSkuba commented 6 months ago

Today I repeated everything with files from http://oct2017-plants.ensembl.org/info/website/ftp/index.html

But this code is representative for any file I tried and the output is the same.

organism <- "Zea mays"
genome <- "../genome/Zea_mays_Ensembl37/Zea_mays.AGPv4.dna.toplevel.fa"
gtf <- "../genome/Zea_mays_Ensembl37/Zea_mays.AGPv4.37.gtf"     # this string containing path to the file and can be changed accordingly
txdb_file <- ORFik::makeTxdbFromGenome(gtf, genome, organism = organism,  optimize = TRUE, return = TRUE)

Output:

Making txdb of GTF
Import genomic features from the file as a GRanges object ... OK
Prepare the 'metadata' data frame ... OK
Make the TxDb object ... Warning: The "phase" metadata column contains non-NA values for features of type stop_codon. This information
  was ignored.OK
--------------------------
Txdb stored at: ../genome/Zea_mays_Ensembl37/Zea_mays.AGPv4.37.gtf.db
--------------------------
Optimizing annotation, saving to: ../genome/Zea_mays_Ensembl37/ORFik_optimized
Creating fst speedup file for transcript lengths, at location:
../genome/Zea_mays_Ensembl37/ORFik_optimized/Zea_mays.AGPv4.37_2024-05-08094034+0200_txLengths.fst
Creating rds speedup files for transcript regions

Results of seqinfo(loadTxdb(txdb_file)):

Seqinfo object with 121 sequences (2 circular) from an unspecified genome; no seqlengths:
  NA

seqlengths isCircular genome 1 NA NA NA
2 NA NA NA
3 NA NA NA
4 NA NA NA
5 NA NA NA
... ... ... ... B73V4_ctg92 NA NA NA
B73V4_ctg95 NA NA NA
B73V4_ctg97 NA NA NA
B73V4_ctg98 NA NA NA
Pt NA TRUE NA

With any version of MaizeGDB files, I used before I am getting the same seqinfo() result:

Seqinfo object with 108 sequences (2 circular) from an unspecified genome; no seqlengths:
  NA

seqlengths isCircular genome Mt NA TRUE NA
B73V4_ctg10 NA NA NA
B73V4_ctg100 NA NA NA
B73V4_ctg102 NA NA NA
B73V4_ctg103 NA NA NA
... ... ... ... Chr6 NA NA NA
Chr7 NA NA NA
Chr8 NA NA NA
Chr9 NA NA NA
Pt NA TRUE NA

Only exception is .gff3 format, where I additionally got the next table:

seqid start end strand ID Name Parent Parent_type 1 Chr1 572435 572663 + NA Zm00001d022649_T001.exon1 transcript:Zm00001d022649_T001 NA 2 Chr1 1030351 1031090 + NA Zm00001d022650_T001.exon1 transcript:Zm00001d022650_T001 NA 3 Chr1 1540407 1540586 - NA Zm00001d022651_T001.exon1 transcript:Zm00001d022651_T001 NA 4 Chr1 1918444 1918644 - NA Zm00001d022652_T001.exon1 transcript:Zm00001d022652_T001 NA 5 Chr1 2170280 2170492 - NA Zm00001d022653_T001.exon2 transcript:Zm00001d022653_T001 NA 6 Chr1 2170757 2170983 - NA Zm00001d022653_T001.exon1 transcript:Zm00001d022653_T001 NA

Also I found the BioStars with exactly the same issue: https://www.biostars.org/p/9552773/ Was the problem solved in that case?

Another possibly related issue, after running detectRibosomeShifts() function, I immediately received the error:

 Error in loadTxdb(txdb) : txdb must be path, list or TxDb

Roleren commented 6 months ago

Hm, that is not what I get:

library(ORFik)
> annotation <- getGenomeAndAnnotation("Zea mays", output.dir = file.path(config()["ref"], "zea_mays/"), optimize = TRUE)
Loading premade Genome files, do remake = TRUE if you want to run again
> seqinfo(loadTxdb(paste0(annotation["gtf"], ".db")))
Seqinfo object with 685 sequences from an unspecified genome:
  seqnames seqlengths isCircular genome
  1         308452471       <NA>   <NA>
  2         243675191       <NA>   <NA>
  3         238017767       <NA>   <NA>
  4         250330460       <NA>   <NA>
  5         226353449       <NA>   <NA>
  ...             ...        ...    ...
  scaf_691      31489       <NA>   <NA>
  scaf_692      31211       <NA>   <NA>
  scaf_693      30818       <NA>   <NA>
  scaf_694      30512       <NA>   <NA>
  scaf_695      30084       <NA>   <NA>

> anyNA(seqlengths(seqinfo(loadTxdb(paste0(annotation["gtf"], ".db")))))
[1] FALSE

Reference assembly I get is: Zea_mays.Zm-B73-REFERENCE-NAM-5.0.56

NastiaSkuba commented 6 months ago

After R update and all the packages reinstallation, I have got the function working!

So currently I am using R=4.3.3 at Mac M2 laptop. Unfortunately, I could not run STAR there, because ORFik did not detect fastp installation for this processor. Nevertheless, I can execute alignment on Ubuntu machine. But there the annotation is still malformed. Thank you for your help!

Roleren / ORFik

BiocParallel error during QCreport for maize genome #171