yctuga / BINF8211

0 stars 0 forks source link

HW6 #6

Open yctuga opened 3 years ago

yctuga commented 3 years ago

HW6 stuffs located at /work/binf8211/yct/HW6 file downloaded from NCBI at /work/binf8211/yct/HW6/ncbi

two way to identify variant

  1. by click left panel to examine the answer for each question

  2. use following procedure to extract specific information a. download data file from NCBI (https://www.ncbi.nlm.nih.gov/variation/view/?q=rs1042522; with single nucleotide variant option on left panel) b. copy file on teaching cluster, example: SNPs.txt c. use command to get the answer $ cat SNPs.txt |awk 'BEGIN{FS="\t";OFS="\t"}{if($3=="single nucleotide variant"){print $0}}' | cut -f4 |grep ^TP53 | wc -l SNPs.txt: input file {if($3=="single nucleotide variant"){print $0}}: search line include specific variant cut -f4: cut column 4 grep ^TP53: just double check each line have specific gene wc -l: count the total number

I use method 1 to get the answer (please see the attached photo https://drive.google.com/file/d/1f_DTfvEaV7P-9Jfp1_kEIdeqt7gG6R2I/view?usp=sharing https://drive.google.com/file/d/1XdCiWJb6rpxKDIuEbw-K0ajzfu1aRM7U/view?usp=sharing

Q1.1 How many SNPs are identified? 5124 single nucleotide variant are identified

Q1.2 exons:5124-4564=560 introns: 4564 5'-UTR: 456 3'-UTR: 471

Q1.3 synonymous: 238 nonsynomymous:5124-238=4886 missense: 608 nonsense: 74 frameshift changes: 201

Q2 GATK files are located at /work/binf8211/yct/HW6 all result files are located at /work/binf8211/yct/HW6/results

  1. run 3 scripts by following order pre.GATK.sh GATK.sh post.GATK.sh

2.generate 3 log file log.14984 log.14985 log.14989

gunzip result file

use the following command to calculate $ cat HW6.output.vcf | grep ^chr | wc -l There are 191426 mutation

yctuga commented 3 years ago

!/bin/bash

SBATCH --job-name=GATK # Job name

SBATCH --partition=batch # Partition (queue) name

SBATCH --ntasks=1 # Single task job

SBATCH --cpus-per-task=4 # Number of cores per task

SBATCH --mem=20gb # Total memory for job

SBATCH --time=48:00:00 # Time limit hrs:min:sec

SBATCH --output=log.%j # Standard output and error log

SBATCH --mail-user=yct@uga.edu # Where to send mail

SBATCH --mail-type=END,FAIL # Mail events (BEGIN, END, FAIL, ALL)

define directory

result='/home/yct/BINF8211/HW6/result/' genome='/work/binf8211/instructor_data/HomeWork4/hg38.fa' snp='/work/binf8211/instructor_data/HomeWork6/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf' bam='/home/yct/BINF8211/HW4/result/SRR13187475.bam'

bwa_source='/work/binf8211/instructor_data/HomeWork4'

gtf='/work/binf8211/instructor_data/Homo_sapiens.GRCh38.102.chrV1_TSS.gtf'

data='/home/yct/BINF8211/HW4/data/'

module load

module load SAMtools/1.10-iccifort-2019.5.281 module load GATK/4.1.6.0-GCCcore-8.3.0-Java-1.8 module load picard/2.21.6-Java-11

data preprocessing

cd $result

step1: selecting only the concordantly mapped pairs

samtools view $bam | awk 'BEGIN{FS="\t";OFS="\t"}{if ($2%16==3){print $0}}' > SRR13187475_conc.sam samtools view -H $bam > SRR13187475_header cat SRR13187475_header SRR13187475_conc.sam >foo mv foo SRR13187475_conc.sam

step2: convert sam to bam

samtools view -bS SRR13187475_conc.sam > SRR13187475_conc.bam rm SRR13187475_conc.sam

step3: picard to add readsgroups

java -jar $EBROOTPICARD/picard.jar AddOrReplaceReadGroups \ I=SRR13187475_conc.bam \ O=HW6.bam \ RGID=4 \ RGLB=lib1 \ RGPL=ILLUMINA \ RGPU=unit1 \ RGSM=20

yctuga commented 3 years ago

!/bin/bash

SBATCH --job-name=GATK.2 # Job name

SBATCH --partition=batch # Partition (queue) name

SBATCH --ntasks=1 # Single task job

SBATCH --cpus-per-task=4 # Number of cores per task

SBATCH --mem=20gb # Total memory for job

SBATCH --time=48:00:00 # Time limit hrs:min:sec

SBATCH --output=log.%j # Standard output and error log

SBATCH --mail-user=yct@uga.edu # Where to send mail

SBATCH --mail-type=END,FAIL # Mail events (BEGIN, END, FAIL, ALL)

define directory

result='/home/yct/BINF8211/HW6/result/' genome='/work/binf8211/instructor_data/HomeWork4/hg38.fa' snp='/work/binf8211/instructor_data/HomeWork6/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf' bam='/home/yct/BINF8211/HW4/result/SRR13187475.bam'

module load

module load SAMtools/1.10-iccifort-2019.5.281 module load GATK/4.1.6.0-GCCcore-8.3.0-Java-1.8 module load picard/2.21.6-Java-11

data preprocessing

cd $result

sort

mkdir tmp samtools sort -T tmp/ -o HW6_sort.bam HW6.bam rm -r tmp mv HW6_sort.bam HW6.bam

java -jar $EBROOTPICARD/picard.jar MarkDuplicates \ I=HW6.bam \ O=marked_duplicates.bam \ M=marked_dup_metrics.txt

sort

mkdir tmp samtools sort -T tmp/ -o marked_duplicates_sort.bam marked_duplicates.bam rm -r tmp mv marked_duplicates_sort.bam marked_duplicates.bam

step Base (Quality Score) Recalibration

gatk BaseRecalibrator -I marked_duplicates.bam -R $genome --known-sites $snp -O recal_data.table

gatk ApplyBQSR -R $genome -I marked_duplicates.bam --bqsr-recal-file $result/recal_data.table -O $result/r_gatk.bam

yctuga commented 3 years ago

!/bin/bash

SBATCH --job-name=GATK.3 # Job name

SBATCH --partition=batch # Partition (queue) name

SBATCH --ntasks=1 # Single task job

SBATCH --cpus-per-task=4 # Number of cores per task

SBATCH --mem=20gb # Total memory for job

SBATCH --time=48:00:00 # Time limit hrs:min:sec

SBATCH --output=log.%j # Standard output and error log

SBATCH --mail-user=yct@uga.edu # Where to send mail

SBATCH --mail-type=END,FAIL # Mail events (BEGIN, END, FAIL, ALL)

define directory

result='/home/yct/BINF8211/HW6/result/' genome='/work/binf8211/instructor_data/HomeWork4/hg38.fa' snp='/work/binf8211/instructor_data/HomeWork6/resources_broad_hg38_v0_Homo_sapiens_assembly38.dbsnp138.vcf' bam='/home/yct/BINF8211/HW4/result/r_gatk.bam'

module load

module load SAMtools/1.10-iccifort-2019.5.281 module load GATK/4.1.6.0-GCCcore-8.3.0-Java-1.8 module load picard/2.21.6-Java-11

cd $result

haplotypecaller

gatk --java-options "-Xmx4g" HaplotypeCaller \ -R $genome \ -I $bam \ -O $result/HW6.output.g.vcf.gz \ -ERC GVCF

genotypeGVCFs

gatk --java-options "-Xmx4g" GenotypeGVCFs \ -R $genome \ -V $result/HW6.output.g.vcf.gz \ -O $result/HW6.output.vcf.gz