Zero division error in rule `analyze_variant_counts`

This error is similar to issue #140. That issue was resolved when I ran a limited set of samples on branch `spike-in-tests`. Now that I'm running the pipeline with all selection samples (branch `finalized-selections`), I'm getting a similar error again.

I initially thought the error was re-introduced when I updated the pipeline to version 2.4.0 (which incorporates antibody count thresholds). However, the error persisted after I reverted the pipeline, so it is probably caused by the full set of samples I'm now running instead.

@jbloom, the commit for recreating this error is here. Let me know if any additional information would be helpful!

Full error message:

---------------------------------------------------------------------------
Exception encountered at "In [24]":
---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
Cell In[24], line 15
      1 variant_counts = (
      2     variants.variant_count_df[
      3         ["library", "sample", "target", "barcode", "count", "aa_substitutions"]
      4     ]
      5     .merge(
      6         barcode_runs.drop(
      7             columns=[
      8                 "fastq_R1",
      9                 "notes",
     10                 "antibody_concentration",
     11                 "exclude_after_counts",
     12             ]
     13         )
     14     )
---> 15     .assign(
     16         percent=lambda x: 100
     17         * x["count"]
     18         / x.groupby(["library_sample", "target"])["count"].transform("sum")
     19     )
     20     .sort_values("percent", ascending=False)
     21 )

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/frame.py:4889, in DataFrame.assign(self, **kwargs)
   4886 data = self.copy()
   4888 for k, v in kwargs.items():
-> 4889     data[k] = com.apply_if_callable(v, data)
   4890 return data

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/common.py:374, in apply_if_callable(maybe_callable, obj, **kwargs)
    363 """
    364 Evaluate possibly callable input using obj and kwargs if it is callable,
    365 otherwise return as it is.
   (...)
    371 **kwargs
    372 """
    373 if callable(maybe_callable):
--> 374     return maybe_callable(obj, **kwargs)
    376 return maybe_callable

Cell In[24], line 16, in <lambda>(x)
      1 variant_counts = (
      2     variants.variant_count_df[
      3         ["library", "sample", "target", "barcode", "count", "aa_substitutions"]
      4     ]
      5     .merge(
      6         barcode_runs.drop(
      7             columns=[
      8                 "fastq_R1",
      9                 "notes",
     10                 "antibody_concentration",
     11                 "exclude_after_counts",
     12             ]
     13         )
     14     )
     15     .assign(
---> 16         percent=lambda x: 100
     17         * x["count"]
     18         / x.groupby(["library_sample", "target"])["count"].transform("sum")
     19     )
     20     .sort_values("percent", ascending=False)
     21 )

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/ops/common.py:72, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     68             return NotImplemented
     70 other = item_from_zerodim(other)
---> 72 return method(self, other)

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/arraylike.py:126, in OpsMixin.__truediv__(self, other)
    124 @unpack_zerodim_and_defer("__truediv__")
    125 def __truediv__(self, other):
--> 126     return self._arith_method(other, operator.truediv)

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/series.py:6259, in Series._arith_method(self, other, op)
   6257 def _arith_method(self, other, op):
   6258     self, other = ops.align_method_SERIES(self, other)
-> 6259     return base.IndexOpsMixin._arith_method(self, other, op)

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/base.py:1325, in IndexOpsMixin._arith_method(self, other, op)
   1322 rvalues = ensure_wrapped_if_datetimelike(rvalues)
   1324 with np.errstate(all="ignore"):
-> 1325     result = ops.arithmetic_op(lvalues, rvalues, op)
   1327 return self._construct_result(result, name=res_name)

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:226, in arithmetic_op(left, right, op)
    222     _bool_arith_check(op, left, right)
    224     # error: Argument 1 to "_na_arithmetic_op" has incompatible type
    225     # "Union[ExtensionArray, ndarray[Any, Any]]"; expected "ndarray[Any, Any]"
--> 226     res_values = _na_arithmetic_op(left, right, op)  # type: ignore[arg-type]
    228 return res_values

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:165, in _na_arithmetic_op(left, right, op, is_cmp)
    162     func = partial(expressions.evaluate, op)
    164 try:
--> 165     result = func(left, right)
    166 except TypeError:
    167     if not is_cmp and (is_object_dtype(left.dtype) or is_object_dtype(right)):
    168         # For object dtype, fallback to a masked operation (only operating
    169         #  on the non-missing values)
    170         # Don't do this for comparisons, as that will handle complex numbers
    171         #  incorrectly, see GH#32047

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/computation/expressions.py:241, in evaluate(op, a, b, use_numexpr)
    238 if op_str is not None:
    239     if use_numexpr:
    240         # error: "None" not callable
--> 241         return _evaluate(op, op_str, a, b)  # type: ignore[misc]
    242 return _evaluate_standard(op, op_str, a, b)

File /fh/fast/bloom_j/computational_notebooks/fwelsh/2022/flu_h3_hk19_dms/.snakemake/conda/d40c1eb554a2599b4d0a1fa4a36f5a46_/lib/python3.11/site-packages/pandas/core/computation/expressions.py:70, in _evaluate_standard(op, op_str, a, b)
     68 if _TEST_MODE:
     69     _store_test_result(False)
---> 70 return op(a, b)

ZeroDivisionError: division by zero

dms-vep / dms-vep-pipeline

Zero division error in rule `analyze_variant_counts` #146