hringbauer / ancIBD

Detecting IBD within low-coverage ancient DNA data. Development repository for the software package; contains the code for the manuscript.
GNU General Public License v3.0

Converting example vcf to HDF5 #5

Closed: batelz closed this issue 1 year ago

batelz commented 1 year ago

Hi, first of all, thank you for your detailed and easy-to-follow instructions in the notebook.

Regarding the VCF-to-HDF5 conversion function: to process the data obtained from Dropbox, it is necessary to unzip the .vcf.gz files before executing the code in the notebook.
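
For reference, a Python equivalent of gunzip for a single file; the path is one of the example files from the vignette data:

import gzip
import shutil

# Decompress example_hazelton_chr22.vcf.gz -> example_hazelton_chr22.vcf,
# equivalent to running gunzip on that file.
src = "./data/vcf.raw/example_hazelton_chr22.vcf.gz"
with gzip.open(src, "rb") as fin, open(src[:-3], "wb") as fout:
    shutil.copyfileobj(fin, fout)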

Second, after unzipping using gunzip *.gz, I get this error: UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

This is the full error print:

Print downsampling to 1240K...
Finished BCF tools filtering.
Converting to HDF5...
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<timed exec> in <module>

~/anaconda3/lib/python3.9/site-packages/ancIBD/IO/prepare_h5.py in vcf_to_1240K_hdf(in_vcf_path, path_vcf, path_h5, marker_path, map_path, af_path, col_sample_af, chunk_length, chunk_width, buffer_size, ch)
    114 
    115     print("Converting to HDF5...")
--> 116     allel.vcf_to_hdf5(input=path_vcf, output=path_h5, 
    117                       fields = ['variants/*', 'calldata/*', "samples"],
    118                       types = {"samples":"S60", "calldata/GT":np.int8,

~/anaconda3/lib/python3.9/site-packages/allel/io/vcf_read.py in vcf_to_hdf5(input, output, group, compression, compression_opts, shuffle, overwrite, vlen, fields, exclude_fields, rename_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length, chunk_width, log)
    691 
    692     # setup chunk iterator
--> 693     fields, samples, headers, it = iter_vcf_chunks(
    694         input, fields=fields, exclude_fields=exclude_fields, types=types,
    695         numbers=numbers, alt_number=alt_number, buffer_size=buffer_size,

~/anaconda3/lib/python3.9/site-packages/allel/io/vcf_read.py in iter_vcf_chunks(input, fields, exclude_fields, types, numbers, alt_number, fills, region, tabix, samples, transformers, buffer_size, chunk_length)
   1136 
   1137     # setup iterator
-> 1138     fields, samples, headers, it = _iter_vcf_stream(stream, **kwds)
   1139 
   1140     # setup transformers

~/anaconda3/lib/python3.9/site-packages/allel/io/vcf_read.py in _iter_vcf_stream(stream, fields, exclude_fields, types, numbers, alt_number, chunk_length, fills, region, samples)
   1634 
   1635     # read VCF headers
-> 1636     headers = _read_vcf_headers(stream)
   1637 
   1638     # setup samples

~/anaconda3/lib/python3.9/site-packages/allel/io/vcf_read.py in _read_vcf_headers(stream)
   1709     # read first header line
   1710     header = stream.readline()
-> 1711     header = str(header, 'utf8')
   1712 
   1713     while header and header[0] == '#':

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
batelz commented 1 year ago

And actually, even when I run the code without unzipping, like so:

%%time
from ancIBD.IO.prepare_h5 import vcf_to_1240K_hdf  # import needed for the call below

ch = 22

base_path = "/home/batel/ancIBD/"
vcf_to_1240K_hdf(in_vcf_path = f"./data/vcf.raw/example_hazelton_chr{ch}.vcf.gz",
                 path_vcf = f"./data/vcf.1240k/example_hazelton_chr{ch}.vcf",
                 path_h5 = f"./data/hdf5/example_hazelton_chr{ch}.h5",
                 marker_path = f"./data/filters/snps_bcftools_ch{ch}.csv",
                 map_path = "./data/v51.1_1240k.snp",
                 af_path = f"./data/afs/v51.1_1240k_AF_ch{ch}.tsv",
                 col_sample_af = "",
                 buffer_size=20000, chunk_width=8, chunk_length=20000,
                 ch=ch)

I still get the same error...

Print downsampling to 1240K...
Finished BCF tools filtering.
Converting to HDF5...
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

Any ideas?

hringbauer commented 1 year ago

Thank you for that bug report in our Vignette!

One suspicion is that the intermediate VCF is output by bcftools as a compressed file (we use bcftools view -Oz), but the file ending of this output file is ".vcf" rather than ".vcf.gz". Depending on the combination of the bcftools and allel versions you have installed, this mismatch can then lead to an input error in the allel Python package that we call to create the HDF5 file from this intermediate VCF.
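
This would also explain the exact error message: a gzip stream starts with the magic bytes 0x1f 0x8b, and the 0x8b at position 1 is precisely the byte the UnicodeDecodeError complains about. A minimal sketch to check this, using the intermediate path from the call above:

# A gzip stream begins with the two magic bytes 0x1f 0x8b.
path = "./data/vcf.1240k/example_hazelton_chr22.vcf"  # intermediate file written by bcftools

with open(path, "rb") as f:
    magic = f.read(2)

print("gzip-compressed" if magic == b"\x1f\x8b" else "plain text")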

To check whether this is the issue, could you try to update the function call of vcf_to_1240K_hdf to

path_vcf = f"./data/vcf.1240k/example_hazelton_chr{ch}.vcf.gz"
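
That is, the full example call from above would become the following, with only path_vcf changed and all other arguments as before:

from ancIBD.IO.prepare_h5 import vcf_to_1240K_hdf

ch = 22
vcf_to_1240K_hdf(in_vcf_path = f"./data/vcf.raw/example_hazelton_chr{ch}.vcf.gz",
                 path_vcf = f"./data/vcf.1240k/example_hazelton_chr{ch}.vcf.gz",  # .vcf.gz instead of .vcf
                 path_h5 = f"./data/hdf5/example_hazelton_chr{ch}.h5",
                 marker_path = f"./data/filters/snps_bcftools_ch{ch}.csv",
                 map_path = "./data/v51.1_1240k.snp",
                 af_path = f"./data/afs/v51.1_1240k_AF_ch{ch}.tsv",
                 col_sample_af = "",
                 buffer_size=20000, chunk_width=8, chunk_length=20000,
                 ch=ch)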

We would be grateful if you could report back whether that fixes the bug, so that we can update our default vignette as well.

batelz commented 1 year ago

Thank you for your fast reply. This indeed solved the issue.

Another minor note: the file referenced by map_path = f"./data/v51.1_1240k.snp" is actually located at "./data/afs/v51.1_1240k.snp".

batelz commented 1 year ago

Sorry, I'm still having trouble when I try transforming my data.

My data looks like this:

chr22 15203963 22:15203963:C:T C T . . RAF=0.0326359;AF=0.0424611;INFO=1 GT:DS:GP 0|0:0.085:0.916,0.081,0.003
chr22 15203968 22:15203968:G:A G A . . RAF=0.000312305;AF=0.000294199;INFO=1 GT:DS:GP 0|0:0.001:0.999,0.001,0
chr22 15203978 22:15203978:T:C T C . . RAF=0.000312305;AF=0.00044355;INFO=1 GT:DS:GP 0|0:0.001:0.999,0.001,0
chr22 15203998 22:15203998:C:A C A . . RAF=0.0791693;AF=0.0667859;INFO=1 GT:DS:GP 0|0:0.134:0.871,0.124,0.005

It has one sample and GT:DS:GP format. It was output by the standard GLIMPSE ligate command ./ligate/bin/GLIMPSE2_ligate --input ${LST} --output chr22_ligated.vcf.gz, where LST is a list of subsets of VCF chunks.

When I run the same command, just using my VCF,

from ancIBD.IO.prepare_h5 import vcf_to_1240K_hdf  # import needed for the call below

ch = 22
base_path = "/home/batel/ancIBD/"
vcf_to_1240K_hdf(in_vcf_path = "/home/batel/glimpse/Levant_BA/GLIMPSE_ligate/chr22_ligated.vcf.gz",
                 path_vcf = "./data/vcf.1240k/chr22_ligated.vcf.gz",
                 path_h5 = "./data/hdf5/chr22_ligated.h5",
                 marker_path = f"./data/filters/snps_bcftools_ch{ch}.csv",
                 map_path = "./data/afs/v51.1_1240k.snp",
                 af_path = f"./data/afs/v51.1_1240k_AF_ch{ch}.tsv",
                 col_sample_af = "",
                 buffer_size=20000, chunk_width=8, chunk_length=20000,
                 ch=ch)

I get:

Print downsampling to 1240K...
Finished BCF tools filtering.
Converting to HDF5...
Finished conversion to hdf5!
Merging in LD Map..
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<timed exec> in <module>

~/anaconda3/lib/python3.9/site-packages/ancIBD/IO/prepare_h5.py in vcf_to_1240K_hdf(in_vcf_path, path_vcf, path_h5, marker_path, map_path, af_path, col_sample_af, chunk_length, chunk_width, buffer_size, ch)
    125     print("Merging in LD Map..")
    126     if len(map_path)>0:
--> 127         merge_in_ld_map(path_h5=path_h5, 
    128                     path_snp1240k=map_path,
    129                     chs=[ch])

~/anaconda3/lib/python3.9/site-packages/ancIBD/IO/h5_modify.py in merge_in_ld_map(path_h5, path_snp1240k, chs, write_mode)
    115     chs: Which Chromosomes to merge in HDF5 [list].
    116     write_mode: Which mode to use on hdf5. a: New field. r+: Change Field"""
--> 117     with h5py.File(path_h5, "r") as f:
    118         print("Lifting LD Map from eigenstrat to HDF5...")
    119         print("Loaded %i variants." % np.shape(f["calldata/GT"])[0])

~/anaconda3/lib/python3.9/site-packages/h5py/_hl/files.py in __init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, **kwds)
    531                                  fs_persist=fs_persist, fs_threshold=fs_threshold,
    532                                  fs_page_size=fs_page_size)
--> 533                 fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    534 
    535             if isinstance(libver, tuple):

~/anaconda3/lib/python3.9/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
    224         if swmr and swmr_support:
    225             flags |= h5f.ACC_SWMR_READ
--> 226         fid = h5f.open(name, flags, fapl=fapl)
    227     elif mode == 'r+':
    228         fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5f.pyx in h5py.h5f.open()

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = './data/hdf5/chr22_ligated.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
hringbauer commented 1 year ago

As it says, it cannot find ./data/hdf5/chr22_ligated.h5 at this relative path - there seems to be no such file there.

You should check whether this relative path is correct, or substitute it with the absolute path. Note that the working directory of the vignette notebook is set earlier in the notebook via os.chdir(path).
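
A quick way to check both from inside the notebook, using only the standard library:

import os

# Relative paths resolve against the current working directory:
print(os.getcwd())

# Does the expected HDF5 output actually exist there?
print(os.path.exists("./data/hdf5/chr22_ligated.h5"))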

hringbauer commented 1 year ago

Also, IBD is a pairwise property, so you will need at least two samples in your VCF (and hence in the HDF5) to run ancIBD. :)
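
For reference, a minimal sketch to count the samples in a gzipped VCF by parsing its #CHROM header line (the path is the file from this thread; Python's gzip module can also read bgzipped files):

import gzip

# The #CHROM header line has 9 fixed columns (CHROM..FORMAT),
# followed by one column per sample.
samples = []
with gzip.open("chr22_ligated.vcf.gz", "rt") as f:
    for line in f:
        if line.startswith("#CHROM"):
            samples = line.rstrip("\n").split("\t")[9:]
            break

print(f"{len(samples)} sample(s): {samples}")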