This github repository includes jupyter notebooks (and associated static HTML files) for reproducing all plots in the paper, and R scripts for sequencing data analysis, in scripts/
The following bowtie indices / annotation files should be generated in new folders called bowtie_indices
and sequence_annotation_files
before running the script below. These files are not provided in the repository due to their large size, but instructions for generating them are included below.
hg38.rrna
genome_rsem/hg38.gencode.v24.rsem/bowtie
hg38.fa.masked
gencode.v24.canonical_ccds_transcripts.20170315.gff3
gencode.vM9.annotation.gff3.gz
rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.masked.gz .
gunzip hg38.fa.masked.gz
samtools faidx hg38.fa.masked
wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_24/gencode.v24.annotation.gff3.gz
gunzip gencode.v24.annotation.gff3.gz
Rscript make_unique_ccds_transcripts.R
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M9/gencode.vM9.annotation.gff3.gz
from Bio import Entrez
from Bio import SeqIO
Entrez.email = 'YOUR_EMAIL_ADDRESS@XYZ.EDU'
refseqids = {'28S' : 'NR_003287.2',
'18S' : 'NR_003286.2',
'5.8S' : 'NR_003285.2',
'5S' : 'NR_023363.1',
}
outputlist = []
for (name,ID) in refseqids.items():
# identify ids by searching
rec = Entrez.read(Entrez.esearch(db="nucleotide", term=ID))
# retrieve full record for the found id
handle = Entrez.efetch(db="nucleotide", id=rec["IdList"][0], rettype="fasta")
output = SeqIO.read(handle, 'fasta')
# clean up record to be minimalistic
output.id = ID
output.description = output.name = ''
outputlist.append(output)
SeqIO.write(outputlist, 'sequence_annotation_files/rrna.fa', 'fasta')
/app/bowtie/1.1.1/bowtie-build sequence_annotation_files/rrna.fa hg38.rrna
rsem-prepare-reference --bowtie-path /app/bowtie/1.1.1 --bowtie --gff3 sequence_annotation_files/gencode.v24.annotation.gff3 sequence_annotation_files/hg38.fa.masked bowtie
Starting with .fastq
files.
for SAMPLE in `ls rawdata/*.fq.gz | cut -f1 -d. | cut -f2 -d/`;
do
mkdir processeddata/$SAMPLE;
echo $SAMPLE;
done
for SAMPLE in `ls rawdata/*.fq.gz | cut -f1 -d. | cut -f2 -d/`;
do
cutadapt --adapter=AAAAAAAAAA --minimum-length=13 --discard-untrimmed --output processeddata/$SAMPLE/$SAMPLE.trim.fq rawdata/$SAMPLE.R1.fq.gz;
done
for SAMPLE in `ls processeddata/*/*.trim.fq | cut -f1 -d. | cut -f2 -d/`;
do
bowtie --seedlen=23 --threads=8 --un processeddata/$SAMPLE/norrna.fq --sam bowtie_indices/hg38.rrna processeddata/$SAMPLE/$SAMPLE.trim.fq;
done
for SAMPLE in `ls processeddata/*/*.norrna.fq | cut -f1 -d. | cut -f2 -d/`;
do
rsem-calculate-expression --num-threads 8 --output-genome-bam --sort-bam-by-coordinate processeddata/$SAMPLE/$SAMPLE.norrna.fq /n/osheafs1/LAB/alicia/bioinformatics/bowtie_indices/genome_rsem/hg38.gencode.v24.rsem/bowtie processeddata/$SAMPLE/gencode;
done
for SAMPLE in `ls processeddata/*/*norrna* | cut -f1 -d. | cut -f2 -d/`;
do
Rscript scripts/calculate_codon_ribo_density_150overhang_20170807.R $SAMPLE; echo $SAMPLE;
done
for SAMPLE in `ls processeddata/*/*norrna* | cut -f1 -d. | cut -f2 -d/`;
do
Rscript scripts/calculate_gene_fold_change_20170615.R $SAMPLE; echo $SAMPLE;
done