Closed ixxmu closed 1 year ago
我们知道真核生物的基因组DNA以染色质的形式存在。真核生物DNA上基因表达是一系列与DNA互作的蛋白质介导的,研究蛋白质与DNA在染色质环境下的相互作用是阐明核生物DNA上基因表达机制的基本途径。染色质免疫共沉淀(chromatin immunoprecipitation assay, ChIP)是目前研究体内DNA与蛋白质相互作用的一种常用的实验方法。它的基本原理是首先固定在活细胞状态下蛋白质-DNA复合物,并将其随机切断为一定长度范围内的染色质小片段,然后通过免疫学方法沉淀此复合体,特异性地富集目的蛋白结合的DNA片段,通过对目的片断的纯化与检测,从而获得蛋白质与DNA相互作用的信息。该技术主要应用于:组蛋白的修饰研究;染色质表观遗传修饰研究;转录调控分析;药物开发研究;有丝分裂研究;DNA损伤与凋亡分析等。
ChIP常常与其他技术结合使用:
染色质免疫共沉淀测序(ChIP-seq):将ChIP与第二代测序技术相结合的ChIP-Seq技术,能够高效地在全基因组范围内检测与组蛋白、转录因子等互作的DNA区段。ChIP-Seq的原理是:首先通过染色质免疫共沉淀技术(ChIP)特异性地富集目的蛋白结合的DNA片段,并对其进行纯化与文库构建;然后对富集得到的DNA片段进行高通量测序。通过将获得的数百万条序列标签精确定位到基因组上,从而获得全基因组范围内与组蛋白、转录因子等互作的DNA区段信息。
实验流程:
生物信息分析流程
::::
Polycomb associates genome-wide with a specific RNA polymerase II variant, and regulates metabolic genes in ES cells PMID: 22305566 DOI: 10.1016/j.stem.2011.12.017
conda create -n chipseq python=2
conda info --envs
source activate chipseq
conda install -y bwa -c bioconda #比对工具
conda install -y trim-galore samtools -c bioconda # 质控和SAM,BAM文件工具
conda install -y deeptools homer meme -c bioconda #查看reads分布特征工具
conda install -y macs2 bowtie bowtie2 -c bioconda #call peak工具
R
options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor/")
options("repos"=c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
install.packages("devtools")
library(devtools)
BiocManager::install
BiocManager::install(c('airway','DESeq2','edgeR','limma'))
BiocManager::install(c('ChIPpeakAnno','ChIPseeker'))
BiocManager::install('TxDb.Hsapiens.UCSC.hg19.knownGene',
ask=F,suppressUpdates=T)
BiocManager::install('TxDb.Hsapiens.UCSC.hg38.knownGene',
ask=F,suppressUpdates=T)
BiocManager::install('TxDb.Mmusculus.UCSC.mm10.knownGene',
ask=F,suppressUpdates=T)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(ChIPpeakAnno)
library(ChIPseeker)
# 安装数据下载工具
mkdit SRAToolkit
cd SRAToolkit
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz
tar -zxvf sratoolkit.current-centos_linux64.tar.gz
echo 'export export PATH=$PATH:/home/SRAToolkit/sratoolkit.2.10.9-centos_linux64/bin' >> ~/.bashrc
source ~/.bashrc
数据描述:
## 注意组织好项目文件夹
mkdir -p ~/project/chip-seq/
cd ~/project/chip-seq/
mkdir {sra,raw,clean,align,peaks,motif,qc}
cd sra
## 根据SraRunTable创建 sra.list 文件, 保存数据ID信息
cat>config.sra
RNAPII_S5P_1 SRR391032
RNAPII_S5P_2 SRR391033
RNAPII_S2P_1 SRR391034
RNAPII_S7P_1 SRR391035
RNAPII_8WG16_1 SRR391036
RNAPII_8WG16_2 SRR391037
RNAPII_S2P_2 SRR391038
RNAPII_S2P_3 SRR391039
RNAPII_S7P_2 SRR391040
H2Aub1_1 SRR391041
H2Aub1_2 SRR391042
H3K36me3_1 SRR391043
H3K36me3_2 SRR391044
Control_1 SRR391045
Control_2 SRR391046
Ring1B_1 SRR391047
Ring1B_2 SRR391048
Ring1B_3 SRR391049
RNAPII_S5PRepeat_1 SRR391050
#创建sra.list文件
cat>sra.list
SRR391032
SRR391033
SRR391034
SRR391035
SRR391036
SRR391037
SRR391038
SRR391039
SRR391040
SRR391041
SRR391042
SRR391043
SRR391044
SRR391045
SRR391046
SRR391047
SRR391048
SRR391049
SRR391050
#批量下载sra
conda activate chipseq
cat sra.list |while read id;do ( nohup prefetch $id & );done
## 下面需要用循环
cd ~/project/chip-seq
conda activate chip-seq
## 下面用到的 config.sra 文件,就是上面自行制作的。
# 创建01_fastq-dump.sh脚本
vi 01_fastq-dump.sh
cat sra/config.sra |while read id;
do
arr=($id)
srr=${arr[1]}
sample=${arr[0]}
nohup fastq-dump sra/$srr.sra --gzip --split-3 -A $sample -O raw &
done
# 运行脚本
bash 01_fastq-dump.sh
fastq文件
创建02_trim_galore.sh脚本
vi 02_trim_galore.sh
cd ~/project/chip-seq/
conda activate chipseq
ls ../raw/*gz | while read fq1;
do
nohup trim_galore -q 25 --phred33 --length 25 -e 0.1 --stringency 4 -o ~/project/chip-seq/clean $fq1 &
done
sh 02_trim_galore.sh
过滤后fq文件
创建03_fastqc_multiqc.sh脚本
vi 03_fastqc_multiqc.sh
cd qc
mkdir -p clean
fastqc -t 5 ../clean/*gz -o clean/
mkdir -p raw
fastqc -t 5 ../raw/*gz -o raw/
multiqc raw/*zip
multiqc clean/*zip
sh 03_fastqc_multiqc.sh
过滤前
过滤后
下载索引文件
# 小鼠bowtie2索引大小为3.2GB,可以直接下载bowtie2索引文件,代码如下:
mkdir referece && cd reference
wget -c https://genome-idx.s3.amazonaws.com/bt/GRCm38.zip
unzip GRCm38.zip
创建04_align_bowtie2.sh脚本
vi 04_align_bowtie2.sh
cd ~/project/chip-seq/align
bowtie2_index=/reference/GRCm38/Mus_musculus.GRCm38 ## 索引文件
ls ../clean/*gz |while read id;
do
file=$(basename $id )
sample=${file%%.*}
echo $file $sample
bowtie2 -p 5 -x $bowtie2_index -U $id | samtools sort -O bam -@ 5 -o - > ${sample}.bam
done
sh 04_align_bowtie2.sh
对bam文件进行QC
cd ~/project/chip-seq/align
ls *.bam |xargs -i samtools index {}
ls *.bam | while read id ;do (nohup samtools flagstat $id > $(basename $id ".bam").stat & );done
合并bam文件
该数据一个样品分成了多个lane进行测序,所以在进行peaks calling前,需要把bam进行合并。
cd ~/project/chip-seq/
mkdir mergeBam
source activate chipseq
cd ~/project/chip-seq/align
ls *.bam|sed 's/_[0-9]_trimmed.bam//g' |sort -u |while read id;do samtools merge ../mergeBam/$id.merge.bam $id*.bam ;done
cd ~/project/chip-seq/mergeBam
ls *merge.bam |cut -d"." -f 1 |while read id;
do
if [ ! -s ${id}_summits.bed ];
then
echo $id
nohup macs2 callpeak -c Control.merge.bam -t $id.merge.bam -f BAM -B -g mm -n $id --outdir ../peaks 2> $id.log &
fi
done
首先把bam文件转为bw文件
cd ~/project/chip-seq/mergeBam
ls *.bam |xargs -i samtools index {}
ls *.bam |while read id;do
nohup bamCoverage --normalizeUsing CPM -b $id -o ${id%%.*}.bw &
done
查看单一样本TSS附近信号强度:
mkdir -p ~/project/chip-seq/tss
cd ~/project/chip-seq/tss
computeMatrix reference-point --referencePoint TSS -p 15 \
-b 1000 -a 1000 \
-R /home/Ensemble/Annotation/Mouse/ucsc.refseq.bed \
-S ~/project/chip-seq/mergeBam/H2Aub1.bw \
--skipZeros -o matrix1_test_TSS.gz \
--outFileSortedRegions regions1_test_genes.bed
## both plotHeatmap and plotProfile will use the output from computeMatrix
plotHeatmap -m matrix1_test_TSS.gz -out test_Heatmap.png
plotHeatmap -m matrix1_test_TSS.gz -out test_Heatmap.pdf --plotFileFormat pdf --dpi 720
plotProfile -m matrix1_test_TSS.gz -out test_Profile.png
plotProfile -m matrix1_test_TSS.gz -out test_Profile.pdf --plotFileFormat pdf --perGroup --dpi 720
10K附近的信号强度
创建 10k.sh
bed=/home/Ensemble/Annotation/Mouse/ucsc.refseq.bed
for id in ~/project/chip-seq/mergeBam/*bw ;
do
echo $id
file=$(basename $id )
sample=${file%%.*}
echo $sample
computeMatrix reference-point --referencePoint TSS -p 15 \
-b 10000 -a 10000 \
-R $bed \
-S $id \
--skipZeros -o matrix1_${sample}_TSS_10K.gz \
--outFileSortedRegions regions1_${sample}_TSS_10K.bed
# 输出的gz为文件用于plotHeatmap, plotProfile
## both plotHeatmap and plotProfile will use the output from computeMatrix
plotHeatmap -m matrix1_${sample}_TSS_10K.gz -out ${sample}_Heatmap_10K.png
plotHeatmap -m matrix1_${sample}_TSS_10K.gz -out ${sample}_Heatmap_10K.pdf --plotFileFormat pdf --dpi 720
plotProfile -m matrix1_${sample}_TSS_10K.gz -out ${sample}_Profile_10K.png
plotProfile -m matrix1_${sample}_TSS_10K.gz -out ${sample}_Profile_10K.pdf --plotFileFormat pdf --perGroup --dpi 720
done
nohup bash 10k.sh 1>10k.log &
2K附近的信号强度
创建 2k.sh
bed=/home/Ensemble/Annotation/Mouse/ucsc.refseq.bed
for id in ~/project/chip-seq/mergeBam/*bw ;
do
echo $id
file=$(basename $id )
sample=${file%%.*}
echo $sample
computeMatrix reference-point --referencePoint TSS -p 15 \
-b 2000 -a 2000 \
-R $bed \
-S $id \
--skipZeros -o matrix1_${sample}_TSS_2K.gz \
--outFileSortedRegions regions1_${sample}_TSS_2K.bed
## both plotHeatmap and plotProfile will use the output from computeMatrix
plotHeatmap -m matrix1_${sample}_TSS_2K.gz -out ${sample}_Heatmap_2K.png
plotHeatmap -m matrix1_${sample}_TSS_2K.gz -out ${sample}_Heatmap_2K.pdf --plotFileFormat pdf --dpi 720
plotProfile -m matrix1_${sample}_TSS_2K.gz -out ${sample}_Profile_2K.png
plotProfile -m matrix1_${sample}_TSS_2K.gz -out ${sample}_Profile_2K.pdf --plotFileFormat pdf --perGroup --dpi 720
done
nohup bash 2k.sh 1>2k.log &
ChIPpeakAnno注释peaks区间分布
R
bedPeaksFile = 'RNAPII_8WG16_summits.bed';
require(ChIPseeker)
require(TxDb.Mmusculus.UCSC.mm10.knownGene)
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
require(clusterProfiler)
peak <- readPeakFile( bedPeaksFile )
#BiocManager::install('diffloop')
peak <- diffloop::addchr(peak)
peakAnno <- annotatePeak(peak, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Mm.eg.db")
peakAnno_df <- as.data.frame(peakAnno)
head(peakAnno_df)
write.csv(peakAnno_df, "peakAnno_df.csv")
library(Cairo)
CairoPNG(file="AnnoPie.png",width=500,height=300)
plotAnnoPie(peakAnno)
dev.off()
CairoPNG(file="AnnoPie.png",width=500,height=300)
plotAnnoPie(peakAnno)
dev.off()
用homer软件来寻找motif
# 下载数据库
perl ~/miniconda3/envs/chipseq/share/homer-4.9.1-5/configureHomer.pl -install mm10
创建 runMotif.sh 脚本
cd ~/project/chip-seq/motif
for id in ~/project/chip-seq/peaks/*.bed;
do
echo $id
file=$(basename $id )
sample=${file%%.*}
echo $sample
awk '{print $4"\t"$1"\t"$2"\t"$3"\t+"}' $id >homer_peaks.tmp
findMotifsGenome.pl homer_peaks.tmp mm10 ${sample}_motifDir -len 8,10,12
annotatePeaks.pl homer_peaks.tmp mm10 1>${sample}.peakAnn.xls 2>${sample}.annLog.txt
done
nohup bash runMotif.sh 1>motif.log &
参考
https://mp.weixin.qq.com/s/UXVdCQQnEhWCd6X7mr2NoQ