Open alexramos opened 9 years ago
def determine_exon_index_offset_from_exon_position(tx_coords, tx_pos):
"""Takes in position in exon space and determines exon index (0-based)
and offset from exon start
"""
if tx_pos < tx_coords[0][0][0] or tx_pos > tx_coords[-1][-1][-1]:
raise Exception('input tx_pos is outside of tx_coords bounds!')
for i, exon in enumerate(tx_coords):
if tx_pos >= exon[0][0] and tx_pos <= exon[0][1]:
return i, tx_pos - exon[0][0]
def determine_genomic_position_from_exon_index_offset(genomics_coords, exon_idx, exon_pos_offset, tx_strand):
exon_start, exon_end, exon_n_str = genomics_coords[exon_idx]
if tx_strand == '+':
return exon_start + exon_pos_offset
else:
return exon_end - exon_pos_offset
def convert_exon_space_to_genomic_space(exon_pos, exon_genomic_coords, exon_tx_coords, tx_strand):
exon_idx, exon_pos_offset = determine_exon_index_offset_from_exon_position(exon_tx_coords, exon_pos)
return determine_genomic_position_from_exon_index_offset(exon_genomic_coords, exon_idx, exon_pos_offset, tx_strand)
def translate_codon(codon_seq_list, tx_strand):
codon_seq_list = ''.join(codon_seq_list)
return Seq.translate(Seq.reverse_complement(codon_seq_list)) if tx_strand == '-' else Seq.translate(codon_seq_list)
def get_possible_mutations(gene, prot_pos, prot_ref, prot_alt, oncotator):
txs = oncotator.annotator.retrieve_transcripts_by_genes([gene])
valid_txs = list()
for tx in txs:
prot_seq = tx.get_protein_seq()
if prot_pos > len(prot_seq):
continue
if prot_seq[prot_pos - 1] == prot_ref:
valid_txs.append(tx)
if not valid_txs:
raise Exception("No transcripts found with ref allele '%s' at position '%d'" % (prot_ref, prot_pos))
codon_cds_start, codon_cds_end = prot_pos * 3 - 2, prot_pos * 3
codon_cds_start, codon_cds_end = codon_cds_start - 1, codon_cds_end - 1 #for 0-based indexing
tx = valid_txs[0] #only need one tx that is valid to get genomic positions
cds_offset = oncotator.TranscriptProviderUtils.determine_cds_in_exon_space(tx)[0]
codon_exon_start, codon_exon_end = codon_cds_start + cds_offset, codon_cds_end + cds_offset
tx_strand = tx.get_strand()
tx_contig = tx.get_contig()
genomic_coords = tx.get_exons()
if tx_strand == '-':
tx_coords = [[oncotator.TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0]+1, exon[1], tx)] for exon in tx.get_exons()]
else:
tx_coords = [[oncotator.TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0], exon[1]-1, tx)] for exon in tx.get_exons()]
codon_coords = list()
for coord in [codon_exon_start, codon_exon_start + 1, codon_exon_end]:
# get genomic position for each base in codon
codon_coords.append(convert_exon_space_to_genomic_space(coord, genomic_coords, tx_coords, tx_strand))
if tx_strand == '-':
codon_coords = codon_coords[::-1] # flip them around!
else:
codon_coords = [c + 1 for c in codon_coords]
#not sure why this is necessary for positive strand transcripts but it is
hg19 = oncotator.annotator._datasources[0] # We know that first element is a ReferenceDatasource from the way DatasourceFactory sorts datasources
#get reference sequence from codon coords
ref_codon_seq = list()
for codon_base_coord in codon_coords:
ref_codon_seq.append(hg19.getRange(tx_contig, codon_base_coord - 1, codon_base_coord - 1))
#do sanity check that this codon translates to prot_ref
if prot_ref != translate_codon(ref_codon_seq, tx_strand):
raise Exception('We have a big problem here sir.')
possible_mutations = list()
for i, codon_base in enumerate(ref_codon_seq):
#for each base in codon, determine AA if mutated
other_bases = [b for b in 'ACTG' if b != codon_base]
for other_base in other_bases:
mutant_codon = list(ref_codon_seq) #copy
mutant_codon[i] = other_base
if prot_alt == translate_codon(mutant_codon, tx_strand):
#create mutation object if mutation should lead to prot_alt
mut = oncotator.MutationData(chr=tx_contig, start=codon_coords[i], end=codon_coords[i],
ref_allele=codon_base, alt_allele=other_base, build='h19')
possible_mutations.append(mut)
for m in possible_mutations:
m.print_annotations()
return possible_mutations
Wouldn't it be neat if one could go from protein position to genomic nucleotide position?