tidypyverse / tidypandas

A grammar of data manipulation for pandas inspired by tidyverse
https://tidypyverse.github.io/tidypandas/
MIT License
93 stars 7 forks source link

Error in summarise if the function's return value is a singleton that is not of a numeric or string type. #37

Closed grahitr closed 1 year ago

grahitr commented 1 year ago

>>> df = pd.DataFrame({"snapshot_date": pd.Series(data=["2020-01-01", "2020-02-01", "2020-03-01"]
                                                                       , dtype="datetime64[ms]")})
>>> df.tp.summarise({"min_snapshot_date": (min, "snapshot_date")
                , "max_snapshot_date": (max, "snapshot_date")})
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [7], line 2
      1 df = pd.DataFrame({"snapshot_date": pd.Series(data=["2020-01-01", "2020-02-01", "2020-03-01"], dtype="datetime64[ms]")})
----> 2 df.tp.summarise({"min_snapshot_date": (min, "snapshot_date")
      3                 , "max_snapshot_date": (max, "snapshot_date")})

File ~/codes/tidypandas/src/tidypandas/tidy_accessor.py:180, in tp.summarise(self, dictionary, func, column_names, predicate, prefix, by, **kwargs)
    170 def summarise(self
    171               , dictionary = None
    172               , func = None
   (...)
    177               , **kwargs
    178               ):
    179     tf = tidyframe(self._obj, copy = False, check = False)
--> 180     return tf.summarise(dictionary = dictionary
    181                         , func = func
    182                         , column_names = column_names
    183                         , predicate = predicate
    184                         , prefix = prefix
    185                         , by = by
    186                         , **kwargs
    187                         ).to_pandas(copy = False)

File ~/codes/tidypandas/src/tidypandas/tidyframe_class.py:2675, in tidyframe.summarise(self, dictionary, func, column_names, predicate, prefix, by, **kwargs)
   2673 if by is None:
   2674     if dictionary is not None:
-> 2675         res = self._summarise(dictionary, **kwargs)
   2676     else:
   2677         res = (self.__data[column_names]
   2678                    .agg(func, **kwargs)
   2679                    .to_frame()
   2680                    .T
   2681                    .rename(columns = dict(zip(column_names, prefixed_names)))
   2682                    )

File ~/codes/tidypandas/src/tidypandas/tidyframe_class.py:2414, in tidyframe._summarise(self, dictionary, **kwargs)
   2411     else:
   2412         rhs_val = rhs[0](*[self.__data[acol] for acol in cols])
-> 2414     _validate_rhs_val(akey, rhs_val)
   2415     summary_dict[akey] = rhs_val
   2416 else:
   2417     # string case

File ~/codes/tidypandas/src/tidypandas/tidyframe_class.py:2340, in tidyframe._summarise.<locals>._validate_rhs_val(akey, rhs_val)
   2338 if not pd.isna(rhs_val):
   2339     if not np.isscalar(rhs_val):
-> 2340         if not (len(rhs_val) == 1 and np.isscalar(rhs_val[0])):
   2341             raise Exception((f"Summarised value for key {akey} does not"
   2342                              " turn out to be a scalar or cannot be "
   2343                              "converted to a scalar")
   2344                             )
   2345 return None

TypeError: object of type 'Timestamp' has no len()
grahitr commented 1 year ago

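np.isscalar works in the intended way only on numeric, string and buffer objects (see https://numpy.org/doc/stable/reference/generated/numpy.isscalar.html). For a pandas Timestamp it returns False, so _validate_rhs_val falls through to the len(rhs_val) check, which raises the TypeError shown above.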

IMO, np.isscalar should be replaced with a custom function (e.g. is_singleton) that checks whether the returned value is a single value, regardless of its type.