broadinstitute / oncotator

Other
67 stars 33 forks source link

Reverse Oncotator #284

Open alexramos opened 9 years ago

alexramos commented 9 years ago

Wouldn't it be neat if one could go from protein position to genomic nucleotide position?

alexramos commented 9 years ago
def determine_exon_index_offset_from_exon_position(tx_coords, tx_pos):
    """Takes in position in exon space and determines exon index (0-based)
    and offset from exon start
    """
    if tx_pos < tx_coords[0][0][0] or tx_pos > tx_coords[-1][-1][-1]:
        raise Exception('input tx_pos is outside of tx_coords bounds!')
    for i, exon in enumerate(tx_coords):
        if tx_pos >= exon[0][0] and tx_pos <= exon[0][1]:
            return i, tx_pos - exon[0][0]

def determine_genomic_position_from_exon_index_offset(genomics_coords, exon_idx, exon_pos_offset, tx_strand):
    exon_start, exon_end, exon_n_str = genomics_coords[exon_idx]
    if tx_strand == '+':
        return exon_start + exon_pos_offset
    else:
        return exon_end - exon_pos_offset

def convert_exon_space_to_genomic_space(exon_pos, exon_genomic_coords, exon_tx_coords, tx_strand):
    exon_idx, exon_pos_offset = determine_exon_index_offset_from_exon_position(exon_tx_coords, exon_pos)
    return determine_genomic_position_from_exon_index_offset(exon_genomic_coords, exon_idx, exon_pos_offset, tx_strand)

def translate_codon(codon_seq_list, tx_strand):
    codon_seq_list = ''.join(codon_seq_list)
    return Seq.translate(Seq.reverse_complement(codon_seq_list)) if tx_strand == '-' else Seq.translate(codon_seq_list)

def get_possible_mutations(gene, prot_pos, prot_ref, prot_alt, oncotator):
    txs = oncotator.annotator.retrieve_transcripts_by_genes([gene])
    valid_txs = list()
    for tx in txs:
        prot_seq = tx.get_protein_seq()
        if prot_pos > len(prot_seq):
            continue
        if prot_seq[prot_pos - 1] == prot_ref:
            valid_txs.append(tx)

    if not valid_txs:
        raise Exception("No transcripts found with ref allele '%s' at position '%d'" % (prot_ref, prot_pos))

    codon_cds_start, codon_cds_end = prot_pos * 3 - 2, prot_pos * 3
    codon_cds_start, codon_cds_end = codon_cds_start - 1, codon_cds_end - 1 #for 0-based indexing

    tx = valid_txs[0] #only need one tx that is valid to get genomic positions
    cds_offset = oncotator.TranscriptProviderUtils.determine_cds_in_exon_space(tx)[0]

    codon_exon_start, codon_exon_end = codon_cds_start + cds_offset, codon_cds_end + cds_offset

    tx_strand = tx.get_strand()
    tx_contig = tx.get_contig()
    genomic_coords = tx.get_exons()
    if tx_strand == '-':
        tx_coords = [[oncotator.TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0]+1, exon[1], tx)] for exon in tx.get_exons()]
    else:
        tx_coords = [[oncotator.TranscriptProviderUtils.convert_genomic_space_to_exon_space(exon[0], exon[1]-1, tx)] for exon in tx.get_exons()]

    codon_coords = list()
    for coord in [codon_exon_start, codon_exon_start + 1, codon_exon_end]:
        # get genomic position for each base in codon
        codon_coords.append(convert_exon_space_to_genomic_space(coord, genomic_coords, tx_coords, tx_strand))

    if tx_strand == '-':
        codon_coords = codon_coords[::-1] # flip them around!
    else:
        codon_coords = [c + 1 for c in codon_coords]
        #not sure why this is necessary for positive strand transcripts but it is

    hg19 = oncotator.annotator._datasources[0] # We know that first element is a ReferenceDatasource from the way DatasourceFactory sorts datasources

    #get reference sequence from codon coords
    ref_codon_seq = list()
    for codon_base_coord in codon_coords:
        ref_codon_seq.append(hg19.getRange(tx_contig, codon_base_coord - 1, codon_base_coord - 1))

    #do sanity check that this codon translates to prot_ref
    if prot_ref != translate_codon(ref_codon_seq, tx_strand):
        raise Exception('We have a big problem here sir.')

    possible_mutations = list()
    for i, codon_base in enumerate(ref_codon_seq):
        #for each base in codon, determine AA if mutated
        other_bases = [b for b in 'ACTG' if b != codon_base]
        for other_base in other_bases:
            mutant_codon = list(ref_codon_seq) #copy
            mutant_codon[i] = other_base
            if prot_alt == translate_codon(mutant_codon, tx_strand):
                #create mutation object if mutation should lead to prot_alt
                mut = oncotator.MutationData(chr=tx_contig, start=codon_coords[i], end=codon_coords[i],
                    ref_allele=codon_base, alt_allele=other_base, build='h19')
                possible_mutations.append(mut)

    for m in possible_mutations:
        m.print_annotations()

    return possible_mutations