quinlan-lab / vcf2db

create a gemini-compatible database from a VCF
MIT License
55 stars 13 forks source link

unicode not supported? #18

Closed davemcg closed 7 years ago

davemcg commented 7 years ago
40000 variant_impacts:65121 effects time: 13.9  chunk time:26.2 375.24 variants/second
Traceback (most recent call last):
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 808, in <module>
    VCFDB(a.VCF, a.db, a.ped, black_list=a.info_exclude, expand=a.expand, blobber=main_blobber)
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 214, in __init__
    self.load()
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 281, in load
    self._load(self.vcf, create=False, start=i+1)
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 265, in _load
    self.insert(variants, expanded, keys, i)
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 300, in insert
    v in variants)
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 743, in gene_info
    v = encode(d.get(k))
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 778, in encode
    v = from_bytes(v)
  File "/home/mcgaugheyd/git/vcf2db/vcf2db.py", line 73, in from_bytes
    return s.decode(ENC)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 60: ordinal not in range(128)

Had a custom field with ClinVar disease names. Lots of weird characters in there.

Tried to force unicode on vcf2db.py by adding this:

#http://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
reload(sys)  
sys.setdefaultencoding('utf8')

Which gave a more informative(?) error.

40000 variant_impacts:65121 effects time: 24.6  chunk time:43.8 222.53 variants/second
bad record: {u'clinvar_sig': None, 'gt_phases': <read-only buffer for 0x7f8498e68960, size -1, offset 0 at 0x7f8498e659f0>, u'cse-hiseq': None, u'af_exac_nfe': -1.0, u'OLD_VARIANT': None, u'an_exac_fin': -1.0, 'variant_id': 43647, u'an_exac_amr': -1.0, 'alt': u'A', u'pLI': None, 'aa_change': '', 'impact': u'3_prime_UTR_variant', u'af_exac_sas': -1.0, u'inbreedingcoeff': -0.021299999207258224, u'an_exac_oth': -1.0, 'codon_change': '', 'gt_types': <read-only buffer for 0x7f8498e65970, size -1, offset 0 at 0x7f8498e5fef0>, 'is_lof': False, 'gts': 'S\xaf\x01\x0cT/T\x00\xfe\x04\x00v\x04\x00\x08A\x00T\x05\x04\xfeh\x00\x05h', u'an_exac_sas': -1.0, u'phenotypes': u'', 'is_exonic': True, 'exon': u'6/6', 'chrom': u'1', u'ac_exac_fin': None, u'dp': 2236, u'readposranksum': 0.07500000298023224, u'fitcons_float': 0.11389999836683273, u'aaf_1kg_amr_float': 0.0, u'rmsk': None, u'encode_consensus_k562': 'unknown', 'num_het': 2, u'old_variant': 'None', 'gt_depths': <read-only buffer for 0x7f8499040a40, size -1, offset 0 at 0x7f8498e65a30>, u'aaf_1kg_all_float': 0.0017970999469980597, u'qd': 14.050000190734863, 'effect_severity': 'LOW', u'maxentscan': u'', u'gene_eyediseaseclass': 'None', u'encode_consensus_helas3': 'R', u'max_aaf_all': -1.0, u'set': 'variant', u'mleaf': 0.023000000044703484, 'vcf_id': None, u'af_exac_all': -1.0, u'ac_exac_amr': None, u'an_exac_eas': -1.0, u'tfbs': None, u'ac_exac_oth': None, u'gene_pheno': u'1', u'mqranksum': 0.7649999856948853, u'aaf_1kg_eas_float': 0.0, u'canonical': u'', u'ExcessHet': 3.0102999210357666, 'gt_ref_depths': <read-only buffer for 0x7f8499040760, size -1, offset 0 at 0x7f8498e65a70>, u'exome_chip': None, 'call_rate': 1.0, u'encode_consensus_gm12878': 'PF', 'polyphen_score': None, u'n_syn': None, 'sift_pred': u'', u'ClippingRankSum': 0.37599998712539673, u'stam_mean': None, u'lof_z': None, u'an_exac_all': -1.0, u'Gene_EyeDiseaseClass': None, u'af_exac_afr': -1.0, u'af_exac_amr': -1.0, u'old_multiallelic': None, u'CSQ': None, 
'sub_type': 'tv', u'pli': 'None', u'excesshet': 3.0102999210357666, u'adj_exp_lof': None, u'mis_z': None, u'stam_names': None, u'baseqranksum': 0.7250000238418579, 'so': u'3_prime_UTR_variant', 'filter': None, u'pheno': u'', u'MQ': 59.939998626708984, u'aaf_esp_ea': -1.0, u'n_lof': None, u'hapmap1': 26.881099700927734, u'aaf_esp_all': -1.0, u'af_exac_eas': -1.0, 'sift_score': None, u'ac_exac_nfe': None, u'gerp_elements': 0.0, 'top_consequence': u'3_prime_UTR_variant', 'num_hom_ref': 42, 'is_splicing': False, u'sor': 0.4880000054836273, u'cosmic_ids': None, 'gt_quals': <read-only buffer for 0x7f8498fdfc70, size -1, offset 0 at 0x7f8498e65b30>, u'ac_exac_all': None, u'syn_z': None, u'MLEAC': 2, u'MLEAF': 0.023000000044703484, u'encode_consensus_hepg2': 'unknown', 'aa_length': u'-/258', u'FS': 2.3420000076293945, u'clinvar_diseases': 'Dejerine-Sottas_disease|Charcot-Marie-Tooth_disease|Roussy-L\xc3\xa9vy_syndrome|Congenital_hypomyelinating_neuropathy|Charcot-Marie-Tooth_disease\\x2c_type_1b\\x2c_with_focally_folded_myelin_sheaths|Dejerine-Sottas_syndrome\\x2c_autosomal_dominant|Charcot-Marie-Tooth_disease_type_2I|Dejerine-sottas_syndrome\\x2c_sporadic|Charcot-Marie-Tooth_disease_type_2J|Charcot-Marie-Tooth_disease_dominant_intermediate_3|Charcot-Marie-Tooth_disease_type_1B|Charcot-Marie-Tooth_disease\\x2c_type_I|Multiple_congenital_anomalies|Neuropathy\\x2c_congenital_hypomyelinating\\x2c_autosomal_dominant', 'polyphen_pred': u'', 'start': 161274617, u'clin_sig': u'', u'ac_exac_eas': None, u'OLD_MULTIALLELIC': None, 'gene': u'MPZ', u'an_exac_afr': -1.0, 'type': 'snp', u'mleac': 2, u'in_esp': None, u'MQRankSum': 0.7649999856948853, 'impact_severity': 'LOW', u'pNull': None, 'qual': 2079.199951171875, u'hgvsp': u'', u'domains': u'', u'precessive': 'None', u'aaf_1kg_sas_float': 0.0, 'aaf': 0.022727272727272728, u'encode_consensus_huvec': 'unknown', 'gt_alt_depths': <read-only buffer for 0x7f8498e65af0, size -1, offset 0 at 0x7f8498e65ab0>, u'mq': 59.939998626708984, 
u'in_1kg': None, 'num_hom_alt': 0, u'aaf_esp_aa': -1.0, u'clinvar_pathogenic': None, u'ac': 2, u'BaseQRankSum': 0.7250000238418579, u'af': 0.023000000044703484, u'an': 88, u'encode_consensus_h1hesc': 'unknown', u'pRecessive': None, u'grantham': u'', u'pubmed': u'', u'DP': 2236, u'ac_exac_afr': None, 'end': 161274618, 'ref': u'T', u'gwas_pubmed_trait': None, u'cpg_island': None, u'n_mis': None, u'af_exac_oth': -1.0, u'SOR': 0.4880000054836273, u'clippingranksum': 0.37599998712539673, u'adj_exp_mis': None, u'csq': None, u'AC': 2, u'fs': 2.3420000076293945, 'is_coding': False, u'AF': 0.023000000044703484, u'ClinVar_Diseases': 'Dejerine-Sottas_disease|Charcot-Marie-Tooth_disease|Roussy-L\xc3\xa9vy_syndrome|Congenital_hypomyelinating_neuropathy|Charcot-Marie-Tooth_disease\\x2c_type_1b\\x2c_with_focally_folded_myelin_sheaths|Dejerine-Sottas_syndrome\\x2c_autosomal_dominant|Charcot-Marie-Tooth_disease_type_2I|Dejerine-sottas_syndrome\\x2c_sporadic|Charcot-Marie-Tooth_disease_type_2J|Charcot-Marie-Tooth_disease_dominant_intermediate_3|Charcot-Marie-Tooth_disease_type_1B|Charcot-Marie-Tooth_disease\\x2c_type_I|Multiple_congenital_anomalies|Neuropathy\\x2c_congenital_hypomyelinating\\x2c_autosomal_dominant', u'AN': 88, u'hapmap2': 173.90139770507812, u'dgv': None, 'biotype': u'protein_coding', 'transcript': u'ENST00000360451', u'aaf_1kg_eur_float': 0.00800000037997961, u'rs_ids': 'rs71639057', u'ReadPosRankSum': 0.07500000298023224, u'adj_exp_syn': None, u'InbreedingCoeff': -0.021299999207258224, u'in_exac': None, u'ac_exac_sas': None, u'pnull': 'None', u'hgvsc': u'ENST00000360451.6:c.*1048A>T', u'EXOME_CHIP': None, u'aaf_1kg_afr_float': 0.0007999999797903001, u'QD': 14.050000190734863, 'impact_so': u'3_prime_UTR_variant', u'an_exac_nfe': -1.0}
davemcg commented 7 years ago

Using iconv on that BED file to remove non-ASCII characters and re-annotating with vcfanno 'fixed' the problem.

brentp commented 7 years ago

Can you tell me the output of:

python -c "import locale;print locale.getpreferredencoding()"
brentp commented 7 years ago

Can you check the latest commit? I pushed a change that I believe will fix this. Your default encoding must be ascii.

davemcg commented 7 years ago
$ python -c "import locale;print locale.getpreferredencoding()"
ANSI_X3.4-1968
davemcg commented 7 years ago

I'll check that tomorrow. Thanks Brent.

davemcg commented 7 years ago

IndentationError: unindent does not match any outer indentation level

davemcg commented 7 years ago

Once I fixed the tab/space mixup it works

brentp commented 7 years ago

doh! thanks.