broadinstitute / seqr-loading-pipelines

hail-based pipelines for annotating variant callsets and exporting them to elasticsearch
MIT License
22 stars 20 forks source link

update biotype and consequence enums ids #825

Closed bpblanken closed 3 months ago

bpblanken commented 3 months ago

The plan is to merge this PR and then run the following for 37 & 38 SNV_INDEL + MITO (since this enum is shared):

from v03_pipeline.lib.annotations.enums import (
    BIOTYPES,
    FIVEUTR_CONSEQUENCES,
    LOF_FILTERS,
    TRANSCRIPT_CONSEQUENCE_TERMS,
)
# Enum lists currently stored in the table globals; ids already written to the
# rows index into these lists.
_CURRENT_ENUMS = ht.enums.sorted_transcript_consequences

# name -> new id: position of each name in the updated enum lists.
BIOTYPE_LOOKUP = hl.dict(
    hl.enumerate(BIOTYPES, index_first=False),
)
TRANSCRIPT_CONSEQUENCE_TERMS_LOOKUP = hl.dict(
    hl.enumerate(TRANSCRIPT_CONSEQUENCE_TERMS, index_first=False),
)

# old id -> name: decodes the ids currently stored on the table.
BIOTYPE_REVERSE_LOOKUP = hl.dict(
    hl.enumerate(_CURRENT_ENUMS.biotype, index_first=True),
)
TRANSCRIPT_CONSEQUENCE_TERMS_REVERSE_LOOKUP = hl.dict(
    hl.enumerate(_CURRENT_ENUMS.consequence_term, index_first=True),
)

# Re-encode the enum ids on every transcript consequence: decode each stored id
# to its name with the *_REVERSE_LOOKUP dict (old id -> name), then encode the
# name with the *_LOOKUP dict (name -> new id).
ht = ht.annotate(
    sorted_transcript_consequences=ht.sorted_transcript_consequences.map(
        lambda c: c.annotate(
            biotype_id=BIOTYPE_LOOKUP[BIOTYPE_REVERSE_LOOKUP[c.biotype_id]],
            # Write back to the same field that is read (`consequence_term_ids`)
            # so the existing ids are overwritten instead of adding a new,
            # misspelled field alongside the stale one.
            consequence_term_ids=c.consequence_term_ids.map(
                lambda t: TRANSCRIPT_CONSEQUENCE_TERMS_LOOKUP[
                    TRANSCRIPT_CONSEQUENCE_TERMS_REVERSE_LOOKUP[t]
                ],
            ),
        ),
    ),
)

# Replace the stored enum lists with the updated ones so the globals match the
# re-encoded ids on the rows. The struct being updated is the *global*
# `enums.sorted_transcript_consequences`, not the row field of the same name;
# its other fields are carried over unchanged by `annotate`.
ht = ht.annotate_globals(
    enums=ht.enums.annotate(
        sorted_transcript_consequences=ht.enums.sorted_transcript_consequences.annotate(
            biotype=BIOTYPES,
            consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS,
        ),
    ),
)