Closed pyrevo closed 11 months ago
ERROR in get_target_variants.py:
Traceback (most recent call last):
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 158, in <module>
main()
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 153, in main
var_collect = VariantQCCollection(bed_f, vcf_f)
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 112, in __init__
self.vcf = VCF(vcf_filename)
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 62, in __init__
self.read_vcf(filename)
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 87, in read_vcf
format_columns = self.data.FORMAT[max_len_idx].split(":")
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/series.py", line 945, in __getitem__
key = com.apply_if_callable(key, self)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/common.py", line 358, in apply_if_callable
return maybe_callable(obj, **kwargs)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/series.py", line 2404, in idxmax
i = self.argmax(axis, skipna, *args, **kwargs)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/base.py", line 646, in argmax
nv.validate_minmax_axis(axis)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/compat/numpy/function.py", line 408, in validate_minmax_axis
if axis >= ndim or (axis < 0 and ndim + axis < 0):
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/ops/common.py", line 70, in new_method
return method(self, other)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/arraylike.py", line 60, in __ge__
return self._cmp_method(other, operator.ge)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/series.py", line 5623, in _cmp_method
res_values = ops.comparison_op(lvalues, rvalues, op)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/ops/array_ops.py", line 283, in comparison_op
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/ops/array_ops.py", line 73, in comp_method_OBJECT_ARRAY
result = libops.scalar_compare(x.ravel(), y, op)
File "pandas/_libs/ops.pyx", line 107, in pandas._libs.ops.scalar_compare
TypeError: '>=' not supported between instances of 'str' and 'int'
Solution:
#max_len_idx = self.data.FORMAT.str.len().idxmax
#format_columns = self.data.FORMAT[max_len_idx].split(":")
format_columns=self.data.FORMAT[0].split(":")
Comment: It should work since if vcf is not empty you should have at least the first entry. But to be tested further.
if not self.data.empty:
print(self.data)
sample_column = self.data.columns.values[-1]
max_len_idx = self.data.FORMAT.str.len().idxmax
print(max_len_idx)
format_columns = self.data.FORMAT[max_len_idx].split(":")
print(format_columns)
format_columns=self.data.FORMAT[0].split(":")
format_split = pd.DataFrame(self.data[sample_column].apply(lambda x: x.split(":")).to_list(),
columns=format_columns)
self.data = pd.concat([self.data, format_split], axis=1)
Generates:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 12878_1_S1
0 chr1 97543752 rs291593 G A 818.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=0.383;DB;DP=41;E... GT:AD:AF:DP:GQ:PL 0/1:14,24:0.632:38:99:826,0,405
1 chr1 97543764 rs291592 C T 846.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=1.583;DB;DP=40;E... GT:AD:AF:DP:GQ:PL 0/1:15,25:0.625:40:99:854,0,444
2 chr1 97544771 rs290855 T C 2030.06 DP100 AC=2;AF=1;AN=2;DB;DP=52;ExcessHet=3.0103;FS=0;... GT:AD:AF:DP:GQ:PL 1/1:0,50:1:50:99:2044,150,0
3 chr1 97547831 rs290854 G A 1895.06 DP100 AC=2;AF=1;AN=2;DB;DP=46;ExcessHet=3.0103;FS=0;... GT:AD:AF:DP:GQ:PL 1/1:0,46:1:46:99:1909,138,0
4 chr1 97771947 rs7556439 C A 2135.06 DP100 AC=2;AF=1;AN=2;DB;DP=54;ExcessHet=3.0103;FS=0;... GT:AD:AF:DP:GQ:PL 1/1:0,54:1:54:99:2149,157,0
5 chr1 97839016 rs1890138 A G 934.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=0.711;DB;DP=52;E... GT:AD:AF:DP:GQ:PL 0/1:23,27:0.54:50:99:942,0,744
6 chr1 97847874 rs72728438 T C 1042.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=-0.484;DB;DP=56;... GT:AD:AF:DP:GQ:PL 0/1:26,30:0.536:56:99:1050,0,905
7 chr1 97981242 rs2811178 T C 1150.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=0;DB;DP=53;Exces... GT:AD:AF:DP:GQ:PL 0/1:20,29:0.592:49:99:1158,0,753
8 chr1 97981243 rs2786783 G A 1150.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=0.407;DB;DP=52;E... GT:AD:AF:DP:GQ:PL 0/1:20,29:0.592:49:99:1158,0,753
9 chr1 97981395 rs1801159 T C 1066.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=-0.732;DB;DP=51;... GT:AD:AF:DP:GQ:PL 0/1:18,30:0.625:48:99:1074,0,608
10 chr1 97981421 rs1801158 C T 640.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=-0.897;DB;DP=53;... GT:AD:AF:DP:GQ:PL 0/1:29,21:0.42:50:99:648,0,1015
11 chr1 98015406 rs61789183 A T 277.64 DP100 AC=1;AF=0.5;AN=2;BaseQRankSum=0.588;DB;DP=47;E... GT:AD:AF:DP:GQ:PL 0/1:35,10:0.222:45:99:285,0,1270
12 chr1 98205973 rs72549309 G A 2237.06 PASS AC=2;AF=1;AN=2;DB;DP=56;ExcessHet=3.0103;FS=0;... GT:AD:AF:DP:GQ:PL 1/1:0,55:1:55:99:2251,165,0
<bound method Series.idxmax of 0 17
1 17
2 17
3 17
4 17
5 17
6 17
7 17
8 17
9 17
10 17
11 17
12 17
Name: FORMAT, dtype: int64>
Traceback (most recent call last):
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 162, in <module>
main()
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 157, in main
var_collect = VariantQCCollection(bed_f, vcf_f)
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 116, in __init__
self.vcf = VCF(vcf_filename)
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 62, in __init__
self.read_vcf(filename)
File "/home/massi/Documents/GMS/hydra_genetics/hydra_test/hydra_0.15.0/pgx/test.py", line 89, in read_vcf
format_columns = self.data.FORMAT[max_len_idx].split(":")
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/series.py", line 945, in __getitem__
key = com.apply_if_callable(key, self)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/common.py", line 358, in apply_if_callable
return maybe_callable(obj, **kwargs)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/series.py", line 2404, in idxmax
i = self.argmax(axis, skipna, *args, **kwargs)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/base.py", line 646, in argmax
nv.validate_minmax_axis(axis)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/compat/numpy/function.py", line 408, in validate_minmax_axis
if axis >= ndim or (axis < 0 and ndim + axis < 0):
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/ops/common.py", line 70, in new_method
return method(self, other)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/arraylike.py", line 60, in __ge__
return self._cmp_method(other, operator.ge)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/series.py", line 5623, in _cmp_method
res_values = ops.comparison_op(lvalues, rvalues, op)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/ops/array_ops.py", line 283, in comparison_op
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)
File "/opt/sw/bioinfo-tools/sources/anaconda3/envs/hydra-genetics_0.15.0/lib/python3.10/site-packages/pandas/core/ops/array_ops.py", line 73, in comp_method_OBJECT_ARRAY
result = libops.scalar_compare(x.ravel(), y, op)
File "pandas/_libs/ops.pyx", line 107, in pandas._libs.ops.scalar_compare
TypeError: '>=' not supported between instances of 'str' and 'int'
Replacing with this code:
#max_len_idx = self.data.FORMAT.str.len().idxmax
#format_columns = self.data.FORMAT[max_len_idx].split(":")
format_columns=self.data.FORMAT[0].split(":")
print(format_columns)
Generates:
['GT', 'AD', 'AF', 'DP', 'GQ', 'PL']
And possibly solves the issue.
It looks like this bugfix is working well so I will close this issue.
rule that will be converted:
Other
Please follow hydra-genetics guidelines