Open danking opened 11 months ago
Here's an implementation of a split SVCF-VCF courtesy of Tim P:
import hail as hl
from constants import *
hl.init(log='/tmp/hail.log')
vds = hl.vds.read_vds(vds_path)
hl.get_reference('GRCh38').add_sequence(get_fasta())
# load metadata structure from arbitrary input GVCF
metadata = hl.get_vcf_metadata(gvcf_paths[0])
metadata['format']['LEN'] = {
'Description': 'Reference block length',
'Number': '1',
'Type': 'Integer',
}
# create reference VCF
rd = vds.reference_data
rd = rd.key_rows_by(locus=rd.locus, alleles=[rd.locus.sequence_context()])
rd = rd.transmute_entries(LEN=rd.END - rd.locus.position + 1)
hl.export_vcf(rd, reference_svcr_vcf_path, metadata=metadata,
tabix=True)
# create variant VCF
vd = vds.variant_data
# recode gvcf_info struct to top-level fields for compatibility with VCF format limitations
info_fields = list(vd.gvcf_info)
mt = vd.transmute_entries(**{f'INFO_{x}': vd.gvcf_info[x] for x in info_fields})
# recode boolean info fields as integers to support VCF spec
bool_fields = [fd for fd in mt.entry if mt[fd].dtype == hl.tbool]
mt = mt.transmute_entries(**{fd: hl.int(mt[fd]) for fd in bool_fields})
def transform_number(number):
if number in {'A', 'R', 'G'}:
return f'LOCAL_{number}'
return number
for info_fd in info_fields:
info = metadata['info'][info_fd]
if info['Type'] == 'Flag':
info['Type'] = 'Integer'
info['Number'] = '1'
metadata['format'][f'INFO_{info_fd}'] = info
for d in metadata['format'].values():
d['Number'] = transform_number(d.get('Number'))
hl.export_vcf(mt, variant_svcr_vcf_path, metadata=metadata,
tabix=True)
What happened?
This issue is complete when:
hl.vds.export_svcr_vcf
with clear documentation and examples. The examples should include ashow
of the VDS variant and reference data as well as acat
of the SVCR VCF.hl.vds.import_svcf_vcf
split=True
/=False
parameter which allow the user to choose between a two VCF and one VCF representation.NB: We have decided that SVCR-VCF should use "LEN" not "END"
Version
0.2.126
Relevant log output
No response