broadinstitute / gnomad_methods

Hail helper functions for the gnomAD project and Translational Genomics Group
https://gnomad.broadinstitute.org
MIT License
89 stars 29 forks source link

Add `assemble_constraint_context_ht` function to create a fully annotated context HT for computing constraint on #733

Open jkgoodrich opened 1 month ago

jkgoodrich commented 1 month ago

Also adds some functions to help transform the methylation data for annotation onto the context HT:

Tested with:

from gnomad.resources.grch38.gnomad import coverage, all_sites_an, public_release
from gnomad.resources.grch38.reference_data import methylation_sites, vep_context

context_ht = vep_context.ht()
coverage_hts = {
    "exomes": coverage("exomes").ht(),
    "genomes": coverage("genomes").ht(),
}
an_hts = {
    "exomes": all_sites_an("exomes").ht(),
    "genomes": all_sites_an("genomes").ht(),
}
exome_ht = public_release("exomes").ht()
genomes_ht = public_release("genomes").ht()
freq_hts = {
    "exomes": exome_ht.select("freq"),
    "genomes": genomes_ht.select("freq"),
}
filter_hts = {
    "exomes": exome_ht.select("filters"),
    "genomes": genomes_ht.select("filters"),
}
methylation_ht = methylation_sites.ht()
gerp_ht = hl.experimental.load_dataset(name="gerp_scores", version="hg19", reference_genome="GRCh38")

annotated_context_ht = assemble_constraint_context_ht(
    context_ht,
    coverage_hts=coverage_hts,
    an_hts=an_hts,
    freq_hts=freq_hts,
    filter_hts=filter_hts,
    methylation_ht=methylation_ht,
    gerp_ht=gerp_ht,
    transformation_funcs=None,
)

annotated_context_ht.describe()
annotated_context_ht.show()

and

from gnomad.resources.grch37.gnomad import coverage, public_release
from gnomad.resources.grch37.reference_data import methylation_sites, vep_context

context_ht = vep_context.ht()
coverage_hts = {
    "exomes": coverage("exomes").ht(),
    "genomes": coverage("genomes").ht(),
}
an_hts = None
exome_ht = public_release("exomes").ht()
genomes_ht = public_release("genomes").ht()
freq_hts = {
    "exomes": exome_ht.select("freq"),
    "genomes": genomes_ht.select("freq"),
}
filter_hts = {
    "exomes": exome_ht.select("filters"),
    "genomes": genomes_ht.select("filters"),
}
methylation_ht = methylation_sites.ht()
gerp_ht = hl.experimental.load_dataset(name="gerp_scores", version="hg19", reference_genome="GRCh37")

annotated_context_ht = assemble_constraint_context_ht(
    context_ht,
    coverage_hts=coverage_hts,
    an_hts=an_hts,
    freq_hts=freq_hts,
    filter_hts=filter_hts,
    methylation_ht=methylation_ht,
    gerp_ht=gerp_ht,
    transformation_funcs=None,
)

annotated_context_ht.describe()
annotated_context_ht.show()