YuLab-SMU / ChIPseeker

:dart: ChIP peak Annotation, Comparison and Visualization
https://onlinelibrary.wiley.com/share/author/GYJGUBYCTRMYJFN2JFZZ?target=10.1002/cpz1.585
229 stars 75 forks source link

sameStrand=TRUE in annotatePeak not working properly #58

Open balwierz opened 7 years ago

balwierz commented 7 years ago

steps to reproduce:

TxDb object construction:

library(ChIPseeker)
library(AnnotationDbi)
library(GenomicFeatures)
TxDb.Drerio.ucsc.geneid = makeTxDbFromUCSC(genome = "danRer7", tablename = "refGene")

Annotation of one genomic range on the "+" strand:

as.data.frame(annotatePeak(GRanges("chr25", IRanges(37328949, 37329249), strand="+"), TxDb=TxDb.Drerio.ucsc.geneid, genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon","Downstream", "Intergenic"), sameStrand=TRUE))

Yields:

  seqnames    start      end width strand       annotation geneChr geneStart  geneEnd geneLength geneStrand geneId transcriptId distanceToTSS
1    chr25 37328949 37329249   301      + Promoter (<=1kb)      25  37324380 37328953       4574          2 559470 NM_001089523             0

This is wrong, because this transcript is on the negative strand:

transcripts(TxDb.Drerio.ucsc.geneid)[transcripts(TxDb.Drerio.ucsc.geneid)$tx_name == "NM_001089523"]

GRanges object with 1 range and 2 metadata columns:
      seqnames               ranges strand |     tx_id      tx_name
         <Rle>            <IRanges>  <Rle> | <integer>  <character>
  [1]    chr25 [37324380, 37328953]      - |     16775 NM_001089523

Moreover, the region overlaps a transcript on the "+" strand, NM_001270554.1, which should actually be assigned instead of the one above:

transcripts(TxDb.Drerio.ucsc.geneid)[transcripts(TxDb.Drerio.ucsc.geneid)$tx_name == "NM_001270554"]

GRanges object with 1 range and 2 metadata columns:
      seqnames               ranges strand |     tx_id      tx_name
         <Rle>            <IRanges>  <Rle> | <integer>  <character>
  [1]    chr25 [37329100, 37346650]      + |     16547 NM_001270554

From ?annotatePeak:

sameStrand: logical, whether find nearest/overlap gene in the same strand

Details:

rvcheck::check_bioc("ChIPseeker")
devel branch is used, this function only works for release version...
$package
[1] "ChIPseeker"

$installed_version
[1] "1.13.1"

$latest_version
[1] "1.12.0"

$up_to_date
[1] NA

sessionInfo()
R version 3.4.0 (2017-04-21)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 9 (stretch)

Matrix products: default
BLAS: /usr/lib/openblas-base/libblas.so.3
LAPACK: /usr/lib/libopenblasp-r0.2.19.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_DK.UTF-8        LC_COLLATE=C              
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
[1] GenomicFeatures_1.28.4 GenomicRanges_1.28.3   GenomeInfoDb_1.12.2   
[4] AnnotationDbi_1.38.1   IRanges_2.10.2         S4Vectors_0.14.3      
[7] Biobase_2.36.2         BiocGenerics_0.22.0    ChIPseeker_1.13.1     

loaded via a namespace (and not attached):
 [1] bit64_0.9-7                            
 [2] splines_3.4.0                          
 [3] gtools_3.5.0                           
 [4] assertthat_0.2.0                       
 [5] DO.db_2.9                              
 [6] rvcheck_0.0.8                          
 [7] blob_1.1.0                             
 [8] GenomeInfoDbData_0.99.0                
 [9] Rsamtools_1.28.0                       
[10] RSQLite_2.0                            
[11] lattice_0.20-35                        
[12] glue_1.1.1                             
[13] digest_0.6.12                          
[14] RColorBrewer_1.1-2                     
[15] XVector_0.16.0                         
[16] qvalue_2.8.0                           
[17] colorspace_1.3-2                       
[18] Matrix_1.2-10                          
[19] plyr_1.8.4                             
[20] XML_3.98-1.9                           
[21] pkgconfig_2.0.1                        
[22] biomaRt_2.32.1                         
[23] zlibbioc_1.22.0                        
[24] GO.db_3.4.1                            
[25] scales_0.4.1                           
[26] gdata_2.18.0                           
[27] BiocParallel_1.10.1                    
[28] tibble_1.3.3                           
[29] ggplot2_2.2.1                          
[30] UpSetR_1.3.3                           
[31] SummarizedExperiment_1.6.3             
[32] lazyeval_0.2.0                         
[33] magrittr_1.5                           
[34] memoise_1.1.0                          
[35] DOSE_3.2.0                             
[36] gplots_3.0.1                           
[37] tools_3.4.0                            
[38] data.table_1.10.4                      
[39] gridBase_0.4-7                         
[40] matrixStats_0.52.2                     
[41] stringr_1.2.0                          
[42] munsell_0.4.3                          
[43] plotrix_3.6-5                          
[44] DelayedArray_0.2.7                     
[45] bindrcpp_0.2                           
[46] Biostrings_2.44.1                      
[47] compiler_3.4.0                         
[48] caTools_1.17.1                         
[49] rlang_0.1.1                            
[50] grid_3.4.0                             
[51] RCurl_1.95-4.8                         
[52] igraph_1.0.1                           
[53] bitops_1.0-6                           
[54] boot_1.3-19                            
[55] gtable_0.2.0                           
[56] DBI_0.7                                
[57] reshape2_1.4.2                         
[58] R6_2.2.2                               
[59] GenomicAlignments_1.12.1               
[60] gridExtra_2.2.1                        
[61] dplyr_0.7.1                            
[62] rtracklayer_1.36.3                     
[63] bit_1.1-12                             
[64] bindr_0.1                              
[65] fastmatch_1.1-0                        
[66] fgsea_1.2.1                            
[67] KernSmooth_2.23-15                     
[68] GOSemSim_2.2.0                         
[69] stringi_1.1.5                          
[70] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
[71] Rcpp_0.12.11 
GuangchuangYu commented 7 years ago

The sameStrand was introduced for considering the orientation of the peak (see 1 and 2).

Not for identifying nearest gene that is exactly located at the same strand.

Maybe I should deprecate the sameStrand parameter and use peak_strandness instead.

balwierz commented 7 years ago

Thanks for clarification @GuangchuangYu. Literally what I need is that my "peaks" have strand information. And I want to annotate them with genes only on the same strand, while ignoring genes on the other strand. An easy implementation would be to separate my peaks to two groups on both strands, do the same to the TxDb, and use annotatePeak twice for (+,+) and (-,-) pairs and finally merge the results. However, I don't know how to subdivide TxDb object into two based on strand....

Do you know the way? Or can you recommend (or ideally implement) a solution?

Cheers, Piotr

HaleDM commented 3 years ago

Hi @balwierz, I'm dealing with this exact issue and I'm curious whether you found a satisfactory solution. I've been playing with your idea of splitting the peak file by strand, but doing a similar split for the TxDb doesn't seem to be tenable. I posed this question on Bioconductor, and the best solution I received was to slim the TxDb into a GRanges of transcripts, which can then be subset by strand, but this seemed to impair some of ChIPseeker's functionality in annotatePeak (addFlankGeneInfo = TRUE stopped returning all nearby genes).

mirkocelii commented 3 years ago

Hi @balwierz ! I guess you have already found a way in the meanwhile :D !

I had the same problem, so I created two separate TxDb objects, one for "+ genes" and one for "- genes". I then ran two separate annotations, kept in each output only the rows whose strand matches the strand of the TxDb used, and merged the two tables. I hope this is correct.

# Strand-specific peak annotation workaround.
#
# Splits a TxDb into a "+"-strand and a "-"-strand database, annotates the
# same peak file against each, keeps from each run only the rows whose
# "strand.1" value matches the strand of the database used, and merges the
# two filtered tables into one sorted table (peakAnno.str.spe).

# data.table is attached first so that the Bioconductor packages attached
# afterwards take precedence for any function names both provide.
library(data.table)       # as.data.table()
library(ChIPseeker)       # annotatePeak()
library(GenomicFeatures)  # makeTxDb()
library("org.Hs.eg.db")
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
TxDb <- TxDb.Hsapiens.UCSC.hg38.knownGene

# convert TxDb to a list of data frames
aa <- as.list(TxDb)
sapply(aa, nrow)
lapply(aa, head)

# create strand-specific lists (start as full copies, filtered below)
a.fw <- aa
a.rv <- aa

table(aa[["transcripts"]]$"tx_strand")
table(aa[["splicings"]]$"exon_strand")

# identify "+" and "-" transcripts
w_aa.trans.fw <- which(aa[["transcripts"]]$"tx_strand" == "+")
w_aa.trans.rv <- which(aa[["transcripts"]]$"tx_strand" == "-")

tx_id.fw <- aa[["transcripts"]]$"tx_id"[w_aa.trans.fw]
tx_id.rv <- aa[["transcripts"]]$"tx_id"[w_aa.trans.rv]

# Select splicing rows by the transcript they belong to rather than by
# exon_strand, so every retained splicing row refers to a retained transcript
# (makeTxDb() rejects splicings whose tx_id is absent from transcripts).
w_aa.splic.fw <- which(aa[["splicings"]]$"tx_id" %in% tx_id.fw)
w_aa.splic.rv <- which(aa[["splicings"]]$"tx_id" %in% tx_id.rv)

# sanity checks: the counts per strand should match the tables
table(aa[["transcripts"]]$"tx_strand")
length(tx_id.fw)
length(tx_id.rv)

table(aa[["splicings"]]$"exon_strand")
length(w_aa.splic.fw)
length(w_aa.splic.rv)

# keep "+" rows only in the "FW" TxDb list and "-" rows only in the "RV" one
a.fw[["transcripts"]] <- a.fw[["transcripts"]][w_aa.trans.fw, ]
a.rv[["transcripts"]] <- a.rv[["transcripts"]][w_aa.trans.rv, ]

a.fw[["splicings"]] <- a.fw[["splicings"]][w_aa.splic.fw, ]
a.rv[["splicings"]] <- a.rv[["splicings"]][w_aa.splic.rv, ]

a.fw[["genes"]] <- a.fw[["genes"]][a.fw[["genes"]]$"tx_id" %in% tx_id.fw, ]
a.rv[["genes"]] <- a.rv[["genes"]][a.rv[["genes"]]$"tx_id" %in% tx_id.rv, ]

sapply(aa, nrow)
sapply(a.fw, nrow)
sapply(a.rv, nrow)

# create the 2 strand-specific databases
TxDb.fw <- do.call(makeTxDb, a.fw)
TxDb.rv <- do.call(makeTxDb, a.rv)

# run ChIPseeker once per strand-specific database
peakAnno.fw <- annotatePeak("My_file", tssRegion = c(-3000, 3000),
                            TxDb = TxDb.fw, annoDb = "org.Hs.eg.db")
peakAnno.rv <- annotatePeak("My_file", tssRegion = c(-3000, 3000),
                            TxDb = TxDb.rv, annoDb = "org.Hs.eg.db")

peakAnno.fw <- as.data.table(peakAnno.fw)
peakAnno.rv <- as.data.table(peakAnno.rv)

nrow(peakAnno.fw)
nrow(peakAnno.rv)

# NOTE(review): presumably "strand.1" is the strand column read from the peak
# file -- confirm against names(peakAnno.fw) for your input before relying on
# this filter.
peakAnno.fw_filt <- peakAnno.fw[peakAnno.fw$"strand.1" == "+"]
peakAnno.rv_filt <- peakAnno.rv[peakAnno.rv$"strand.1" == "-"]

# merge and sort
peakAnno.str.spe <- rbind(peakAnno.fw_filt, peakAnno.rv_filt)
peakAnno.str.spe <- peakAnno.str.spe[order(peakAnno.str.spe$"seqnames",
                                           peakAnno.str.spe$"start",
                                           peakAnno.str.spe$"geneStart"), ]
MWSchmid commented 5 months ago

mirkocelii, thanks for the code. I encountered the same problem. The argument and its description are indeed very misleading...