Open yctuga opened 3 years ago
result='/home/yct/BINF8211/HW6/result/' genome='/work/binf8211/instructor_data/HomeWork4/hg38.fa' snp='/work/binf8211/instructor_data/HomeWork6/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf' bam='/home/yct/BINF8211/HW4/result/SRR13187475.bam'
module load SAMtools/1.10-iccifort-2019.5.281 module load GATK/4.1.6.0-GCCcore-8.3.0-Java-1.8 module load picard/2.21.6-Java-11
cd $result
samtools view $bam | awk 'BEGIN{FS="\t";OFS="\t"}{if ($2%16==3){print $0}}' > SRR13187475_conc.sam samtools view -H $bam > SRR13187475_header cat SRR13187475_header SRR13187475_conc.sam >foo mv foo SRR13187475_conc.sam
samtools view -bS SRR13187475_conc.sam > SRR13187475_conc.bam rm SRR13187475_conc.sam
java -jar $EBROOTPICARD/picard.jar AddOrReplaceReadGroups \ I=SRR13187475_conc.bam \ O=HW6.bam \ RGID=4 \ RGLB=lib1 \ RGPL=ILLUMINA \ RGPU=unit1 \ RGSM=20
result='/home/yct/BINF8211/HW6/result/' genome='/work/binf8211/instructor_data/HomeWork4/hg38.fa' snp='/work/binf8211/instructor_data/HomeWork6/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf' bam='/home/yct/BINF8211/HW4/result/SRR13187475.bam'
module load SAMtools/1.10-iccifort-2019.5.281 module load GATK/4.1.6.0-GCCcore-8.3.0-Java-1.8 module load picard/2.21.6-Java-11
cd $result
mkdir tmp samtools sort -T tmp/ -o HW6_sort.bam HW6.bam rm -r tmp mv HW6_sort.bam HW6.bam
java -jar $EBROOTPICARD/picard.jar MarkDuplicates \ I=HW6.bam \ O=marked_duplicates.bam \ M=marked_dup_metrics.txt
mkdir tmp samtools sort -T tmp/ -o marked_duplicates_sort.bam marked_duplicates.bam rm -r tmp mv marked_duplicates_sort.bam marked_duplicates.bam
gatk BaseRecalibrator -I marked_duplicates.bam -R $genome --known-sites $snp -O recal_data.table
gatk ApplyBQSR -R $genome -I marked_duplicates.bam --bqsr-recal-file $result/recal_data.table -O $result/r_gatk.bam
result='/home/yct/BINF8211/HW6/result/' genome='/work/binf8211/instructor_data/HomeWork4/hg38.fa' snp='/work/binf8211/instructor_data/HomeWork6/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf' bam='/home/yct/BINF8211/HW4/result/r_gatk.bam'
module load SAMtools/1.10-iccifort-2019.5.281 module load GATK/4.1.6.0-GCCcore-8.3.0-Java-1.8 module load picard/2.21.6-Java-11
cd $result
gatk --java-options "-Xmx4g" HaplotypeCaller \ -R $genome \ -I $bam \ -O $result/HW6.output.g.vcf.gz \ -ERC GVCF
gatk --java-options "-Xmx4g" GenotypeGVCFs \ -R $genome \ -V $result/HW6.output.g.vcf.gz \ -O $result/HW6.output.vcf.gz
HW6 stuffs located at /work/binf8211/yct/HW6 file downloaded from NCBI at /work/binf8211/yct/HW6/ncbi
two way to identify variant
by click left panel to examine the answer for each question
use following procedure to extract specific information a. download data file from NCBI (https://www.ncbi.nlm.nih.gov/variation/view/?q=rs1042522; with single nucleotide variant option on left panel) b. copy file on teaching cluster, example: SNPs.txt c. use command to get the answer $ cat SNPs.txt |awk 'BEGIN{FS="\t";OFS="\t"}{if($3=="single nucleotide variant"){print $0}}' | cut -f4 |grep ^TP53 | wc -l SNPs.txt: input file {if($3=="single nucleotide variant"){print $0}}: search line include specific variant cut -f4: cut column 4 grep ^TP53: just double check each line have specific gene wc -l: count the total number
I use method 1 to get the answer (please see the attached photo https://drive.google.com/file/d/1f_DTfvEaV7P-9Jfp1_kEIdeqt7gG6R2I/view?usp=sharing https://drive.google.com/file/d/1XdCiWJb6rpxKDIuEbw-K0ajzfu1aRM7U/view?usp=sharing
Q1.1 How many SNPs are identified? 5124 single nucleotide variant are identified
Q1.2 exons:5124-4564=560 introns: 4564 5'-UTR: 456 3'-UTR: 471
Q1.3 synonymous: 238 nonsynomymous:5124-238=4886 missense: 608 nonsense: 74 frameshift changes: 201
Q2 GATK files are located at /work/binf8211/yct/HW6 all result files are located at /work/binf8211/yct/HW6/results
2.generate 3 log file log.14984 log.14985 log.14989
gunzip result file
use the following command to calculate $ cat HW6.output.vcf | grep ^chr | wc -l There are 191426 mutation