from sgkit.io.vcf import vcf_to_zarr
import sgkit as sg
import tsinfer
import numpy as np
from dask.diagnostics import ProgressBar
with ProgressBar():
vcf_to_zarr(
"Downloads/ALL.chr21.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz",
"output.zarr",
regions=["21:1-10000000"],
fields = ["INFO/AA", "FORMAT/GT"], # saves the AA field for ancestral alleles into variant_AA
)
# load and specify the ancestral allele for tsinfer
ds = sg.load_dataset("output.zarr")
ancestral = np.char.partition(ds.variant_AA.astype(str), sep="|")
ds.update({'variant_ancestral_allele': ancestral[:,0]})
# Append only the new variables to the existing dataset (
sg.save_dataset(ds.drop_vars(set(ds.data_vars)- {"variant_ancestral_allele"}), "output.zarr", mode="a")
sd = tsinfer.SgkitSampleData(path="output.zarr")
ts = tsinfer.infer(sd)
--> 486 generator.add_sites(exclude_positions)
487 ancestor_data = generator.run()
488 for timestamp, record in sample_data.provenances():
File ~/mambaforge/lib/python3.10/site-packages/tsinfer/inference.py:1080, in AncestorsGenerator.add_sites(self, exclude_positions)
1078 progress = self.progress_monitor.get("ga_add_sites", self.max_sites)
1079 inference_site_id = []
-> 1080 for variant in self.sample_data.variants(recode_ancestral=True):
1081 # If there's missing data the last allele is None
1082 num_alleles = len(variant.alleles) - int(variant.alleles[-1] is None)
1084 counts = allele_counts(variant.genotypes)
File ~/mambaforge/lib/python3.10/site-packages/tsinfer/formats.py:2604, in SgkitSampleData.variants(self, sites, recode_ancestral)
2602 if allele != b"" and allele != "":
2603 if empty_seen:
-> 2604 raise ValueError("Empty alleles must be at the end")
2605 non_empty_alleles.append(allele)
2606 else:
ValueError: Empty alleles must be at the end
This seems to be a site at pos 9412076 where alleles = ('CTT', 'C', '', ''), but where the ancestral allele is ''. This is a reasonable value for an ancestral allele, I suppose (ancestral for a deletion)
E.g. using the new SGKit pipeline (files from http://hgdownload.cse.ucsc.edu/gbdb/hg19/1000Genomes/phase3/). I think we should allow reading of these files by default.