sgkit-dev / sgkit

Scalable genetics toolkit
https://sgkit-dev.github.io/sgkit
Apache License 2.0

Division by zero error with clinvar vcf #1067

Closed: sofroniewn closed this issue 1 year ago

sofroniewn commented 1 year ago

Hello

I am fairly new to working with VCF files, so this may be a basic mistake on my part, but I would have expected the following to work.

I am trying to read the ClinVar VCF file from the ClinVar download site. I downloaded both clinvar.vcf.gz and clinvar.vcf.gz.tbi and then ran the following with sgkit 0.6.0:

from sgkit.io.vcf import vcf_to_zarr
vcf_to_zarr("clinvar.vcf.gz", "clinvar.zarr")

and got the following error:

---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
Cell In[11], line 2
      1 from sgkit.io.vcf import vcf_to_zarr
----> 2 vcf_to_zarr("clinvar.vcf.gz", "clinvar.zarr")

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/sgkit/io/vcf/vcf_reader.py:959, in vcf_to_zarr(input, output, target_part_size, regions, chunk_length, chunk_width, compressor, encoding, temp_chunk_length, tempdir, tempdir_storage_options, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles, fields, exclude_fields, field_defs)
    952 else:
    953     convert_func = functools.partial(
    954         vcf_to_zarr_parallel,
    955         temp_chunk_length=temp_chunk_length,
    956         tempdir=tempdir,
    957         tempdir_storage_options=tempdir_storage_options,
    958     )
--> 959 convert_func(
    960     input,  # type: ignore
    961     output,
    962     regions,  # type: ignore
    963     chunk_length=chunk_length,
    964     chunk_width=chunk_width,
    965     compressor=compressor,
    966     encoding=encoding,
    967     ploidy=ploidy,
    968     mixed_ploidy=mixed_ploidy,
    969     truncate_calls=truncate_calls,
    970     max_alt_alleles=max_alt_alleles,
    971     fields=fields,
    972     exclude_fields=exclude_fields,
    973     field_defs=field_defs,
    974 )
    976 # Issue a warning if max_alt_alleles caused data to be dropped
    977 ds = zarr.open(output)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/sgkit/io/vcf/vcf_reader.py:639, in vcf_to_zarr_parallel(input, output, regions, chunk_length, chunk_width, compressor, encoding, temp_chunk_length, tempdir, tempdir_storage_options, ploidy, mixed_ploidy, truncate_calls, max_alt_alleles, fields, exclude_fields, field_defs)
    617 with temporary_directory(
    618     prefix="vcf_to_zarr_", dir=tempdir, storage_options=tempdir_storage_options
    619 ) as tmpdir:
    621     paths = vcf_to_zarrs(
    622         input,
    623         tmpdir,
   (...)
    636         field_defs=field_defs,
    637     )
--> 639     concat_zarrs(
    640         paths,
    641         output,
    642         storage_options=tempdir_storage_options,
    643     )

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/sgkit/io/vcf/vcf_reader.py:817, in concat_zarrs(urls, output, storage_options)
    814     else:
    815         vars_to_copy.append(var)
--> 817 concat_zarrs_optimized(urls, output, vars_to_rechunk, vars_to_copy)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/sgkit/io/vcfzarr_reader.py:410, in concat_zarrs_optimized(zarr_files, output, vars_to_rechunk, vars_to_copy, fix_strings)
    407     d = _fuse_delayed(d)  # type: ignore[no-untyped-call]
    408     delayed.append(d)
--> 410 da.compute(*delayed)
    412 # copy unchanged variables and top-level metadata
    413 with zarr.open_group(output) as output_zarr:
    414
    415     # copy top-level attributes

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/base.py:599, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
    596     keys.append(x.__dask_keys__())
    597     postcomputes.append(x.__dask_postcompute__())
--> 599 results = schedule(dsk, keys, **kwargs)
    600 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/threaded.py:89, in get(dsk, keys, cache, num_workers, pool, **kwargs)
     86     elif isinstance(pool, multiprocessing.pool.Pool):
     87         pool = MultiprocessingPoolExecutor(pool)
---> 89 results = get_async(
     90     pool.submit,
     91     pool._max_workers,
     92     dsk,
     93     keys,
     94     cache=cache,
     95     get_id=_thread_get_id,
     96     pack_exception=pack_exception,
     97     **kwargs,
     98 )
    100 # Cleanup pools associated to dead threads
    101 with pools_lock:

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/local.py:511, in get_async(submit, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, chunksize, **kwargs)
    509         _execute_task(task, data)  # Re-execute locally
    510     else:
--> 511         raise_exception(exc, tb)
    512 res, worker_id = loads(res_info)
    513 state["cache"][key] = res

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/local.py:319, in reraise(exc, tb)
    317 if exc.__traceback__ is not tb:
    318     raise exc.with_traceback(tb)
--> 319 raise exc

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/local.py:224, in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
    222 try:
    223     task, data = loads(task_info)
--> 224     result = _execute_task(task, data)
    225     id = get_id()
    226     result = dumps((result, id))

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/core.py:113, in _execute_task(arg, cache, dsk)
     83 """Do the actual work of collecting data and executing a function
     84
     85 Examples
   (...)
    110 'foo'
    111 """
    112 if isinstance(arg, list):
--> 113     return [_execute_task(a, cache) for a in arg]
    114 elif istask(arg):
    115     func, args = arg[0], arg[1:]

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/core.py:113, in <listcomp>(.0)
     83 """Do the actual work of collecting data and executing a function
     84
     85 Examples
   (...)
    110 'foo'
    111 """
    112 if isinstance(arg, list):
--> 113     return [_execute_task(a, cache) for a in arg]
    114 elif istask(arg):
    115     func, args = arg[0], arg[1:]

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/core.py:119, in _execute_task(arg, cache, dsk)
    115     func, args = arg[0], arg[1:]
    116     # Note: Don't assign the subtask results to a variable. numpy detects
    117     # temporaries by their reference count and can execute certain
    118     # operations in-place.
--> 119     return func(*(_execute_task(a, cache) for a in args))
    120 elif not ishashable(arg):
    121     return arg

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/array/core.py:4388, in store_chunk(x, out, index, lock, return_stored)
   4385 def store_chunk(
   4386     x: ArrayLike, out: ArrayLike, index: slice, lock: Any, return_stored: bool
   4387 ):
-> 4388     return load_store_chunk(x, out, index, lock, return_stored, False)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/dask/array/core.py:4370, in load_store_chunk(x, out, index, lock, return_stored, load_stored)
   4368 if x is not None:
   4369     if is_arraylike(x):
-> 4370         out[index] = x
   4371     else:
   4372         out[index] = np.asanyarray(x)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/zarr/core.py:1373, in Array.__setitem__(self, selection, value)
   1371     self.vindex[selection] = value
   1372 else:
-> 1373     self.set_basic_selection(pure_selection, value, fields=fields)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/zarr/core.py:1468, in Array.set_basic_selection(self, selection, value, fields)
   1466     return self._set_basic_selection_zd(selection, value, fields=fields)
   1467 else:
-> 1468     return self._set_basic_selection_nd(selection, value, fields=fields)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/zarr/core.py:1770, in Array._set_basic_selection_nd(self, selection, value, fields)
   1766 def _set_basic_selection_nd(self, selection, value, fields=None):
   1767     # implementation of __setitem__ for array with at least one dimension
   1768
   1769     # setup indexer
-> 1770     indexer = BasicIndexer(selection, self)
   1772     self._set_selection(indexer, value, fields=fields)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/zarr/indexing.py:347, in BasicIndexer.__init__(self, selection, array)
    344     dim_indexer = IntDimIndexer(dim_sel, dim_len, dim_chunk_len)
    346 elif is_slice(dim_sel):
--> 347     dim_indexer = SliceDimIndexer(dim_sel, dim_len, dim_chunk_len)
    349 else:
    350     raise IndexError('unsupported selection item for basic indexing; '
    351                      'expected integer or slice, got {!r}'
    352                      .format(type(dim_sel)))

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/zarr/indexing.py:176, in SliceDimIndexer.__init__(self, dim_sel, dim_len, dim_chunk_len)
    174 self.dim_chunk_len = dim_chunk_len
    175 self.nitems = max(0, ceildiv((self.stop - self.start), self.step))
--> 176 self.nchunks = ceildiv(self.dim_len, self.dim_chunk_len)

File ~/opt/anaconda3/envs/multiomics/lib/python3.9/site-packages/zarr/indexing.py:160, in ceildiv(a, b)
    159 def ceildiv(a, b):
--> 160     return math.ceil(a / b)

ZeroDivisionError: division by zero

Do I need to specify more input arguments? Any help reading the ClinVar VCF file would be much appreciated. Thanks!!

benjeffery commented 1 year ago

Thanks for the great bug report @sofroniewn! I've managed to recreate it locally, and am looking for a workaround or fix.

benjeffery commented 1 year ago

Ah! This VCF has no samples in it! What were you planning to do with it in sgkit?

I'll write up an issue for sgkit to deal with this situation more gracefully.
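
For anyone who wants to check this themselves, here is a minimal sketch, assuming cyvcf2 (the parser sgkit's VCF reader builds on) is installed:

from cyvcf2 import VCF

vcf = VCF("clinvar.vcf.gz")
print(len(vcf.samples))  # prints 0: ClinVar distributes a sites-only VCF with no genotype columns

With no samples, the samples dimension has length 0, and a zero chunk length along it is presumably what trips zarr's ceildiv in the traceback above.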

benjeffery commented 1 year ago

There is now a PR to fix this behaviour: https://github.com/pystatgen/sgkit/pull/1069

For now, you should be able to work around this by using vcf_to_zarr("clinvar.vcf.gz", "clinvar.zarr", regions=["1"])

sofroniewn commented 1 year ago

> I've managed to recreate it locally, and am looking for a workaround or fix.

Ok great!

> For now, you should be able to work around this by using vcf_to_zarr("clinvar.vcf.gz", "clinvar.zarr", regions=["1"])

This actually still seems to give me the same error.

> Ah! This VCF has no samples in it! What were you planning to do with it in sgkit?

Huh, OK. I am very new to VCF files (this is the first one I've ever tried to load!), so I'm not even sure what that really means. It seems to have variants inside it.

As to my goals: I have a BAM file specifying some genome intervals, and a FASTA file for the hg38 reference. My goal is to take an interval in the BAM file, find which ClinVar SNP variants fall inside that region, and then for each one substitute that SNP into the DNA sequence from the reference to generate an alternative sequence.

Right now I was actually able to read the file with pysam via vcf = pysam.VariantFile(vcf_file, 'r') and can go from there, but I'd prefer to use sgkit if possible, as I expect to prefer its more Pythonic API and metadata handling.
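
For reference, a minimal pysam sketch of the workflow described above (the interval, contig name, and file paths are hypothetical placeholders):

import pysam

vcf = pysam.VariantFile("clinvar.vcf.gz")  # region queries use the .tbi index
ref = pysam.FastaFile("hg38.fa")           # hypothetical path to the hg38 reference

contig, start, end = "1", 69000, 70000     # hypothetical interval taken from the BAM file
seq = ref.fetch(contig, start, end)        # pysam coordinates are 0-based, half-open

for rec in vcf.fetch(contig, start, end):
    # consider single-nucleotide variants only
    if rec.alts and len(rec.ref) == 1 and len(rec.alts[0]) == 1:
        offset = rec.pos - 1 - start       # rec.pos is 1-based
        alt_seq = seq[:offset] + rec.alts[0] + seq[offset + 1:]

One caveat: ClinVar names contigs "1", "2", ..., while many hg38 FASTA files use "chr1", so the names may need translating between the two.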

Thanks for your prompt bug fix. I can test again in the next release. Feel free to close this issue when you want.

benjeffery commented 1 year ago

Ah, sorry, my bad! Try this workaround, which bypasses the parallel loading code:

from sgkit.io.vcf.vcf_reader import vcf_to_zarr_sequential
vcf_to_zarr_sequential("clinvar.vcf.gz", "clinvar.zarr")

You should then be able to use the arrays of positions and alleles to get your analysis done:

>>> ds = sgkit.load_dataset("clinvar.zarr")
>>> ds
<xarray.Dataset>
Dimensions:           (contigs: 29, filters: 1, samples: 0, variants: 2174000,
                       alleles: 4)
Dimensions without coordinates: contigs, filters, samples, variants, alleles
Data variables:
    contig_id         (contigs) <U14 dask.array<chunksize=(29,), meta=np.ndarray>
    filter_id         (filters) object dask.array<chunksize=(1,), meta=np.ndarray>
    sample_id         (samples) float64 dask.array<chunksize=(0,), meta=np.ndarray>
    variant_allele    (variants, alleles) object dask.array<chunksize=(10000, 4), meta=np.ndarray>
    variant_contig    (variants) int8 dask.array<chunksize=(10000,), meta=np.ndarray>
    variant_filter    (variants, filters) bool dask.array<chunksize=(10000, 1), meta=np.ndarray>
    variant_id        (variants) object dask.array<chunksize=(10000,), meta=np.ndarray>
    variant_id_mask   (variants) bool dask.array<chunksize=(10000,), meta=np.ndarray>
    variant_position  (variants) int32 dask.array<chunksize=(10000,), meta=np.ndarray>
    variant_quality   (variants) float32 dask.array<chunksize=(10000,), meta=np.ndarray>
Attributes:
    contigs:               ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'...
    filters:               ['PASS']
    max_alt_alleles_seen:  1
    source:                sgkit-0.5.1.dev153+g8179307.d20230301
    vcf_header:            ##fileformat=VCFv4.1\n##FILTER=<ID=PASS,Descriptio...
    vcf_zarr_version:      0.2
>>> ds.variant_position.values
array([ 69134,  69581,  69682, ..., 274366, 275068,  83614], dtype=int32)
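
To find the variants inside an interval from a dataset like this, something along these lines should work (a minimal sketch; the contig and coordinates are hypothetical):

import sgkit

ds = sgkit.load_dataset("clinvar.zarr")
contig_idx = ds.attrs["contigs"].index("1")  # variant_contig holds integer codes into this list
pos = ds.variant_position.values
in_region = (ds.variant_contig.values == contig_idx) & (pos >= 69000) & (pos < 70000)
alleles = ds.variant_allele.values[in_region]  # column 0 is REF, the remaining columns are ALTs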

tomwhite commented 1 year ago

Closing, as this should be fixed by #1069. @sofroniewn feel free to open a new issue if you have more questions.