pandas-dev / pandas

Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
https://pandas.pydata.org
BSD 3-Clause "New" or "Revised" License
43.86k stars 18.01k forks source link

BUG: Resampler agg cannot apply on list of functions #44400

Open Mengda-Li opened 3 years ago

Mengda-Li commented 3 years ago

Reproducible Example

import pandas as pd

# loading a dataFrame as variable csv with attributes `trade Id`, `price`, `qty` like
#                            trade Id     price    qty  quoteQty  isBuyerMaker
# time                                                                        
# 2021-09-01 00:00:00.023  1391646824  47150.32  0.002     94.30          True
# 2021-09-01 00:00:00.093  1391646825  47150.33  0.002     94.30         False

# NOTE: `csv` is the DataFrame sketched in the comment above; it is not
# defined in this snippet. Its first 10 rows are resampled into 1-second
# bins, each labelled with the right edge of the bin.
r = csv.head(10).resample('1s', label='right')

# define a function vwap with some debug prints

def vwap(x):
    """Return the quantity-weighted average price of ``x``.

    ``x`` must expose ``price`` and ``qty`` attributes (e.g. a DataFrame with
    those columns). The input and both columns are printed for debugging
    before the result, ``(price @ qty) / qty.sum()``, is returned.
    """
    print("it's vwap", x, sep="\n")

    # Look up `price` before announcing it, so a missing attribute fails
    # right after the input has been printed.
    prices = x.price
    print("it's p", prices, sep="\n")

    quantities = x.qty
    print("it's q", quantities, sep="\n")

    weighted_total = prices @ quantities
    return weighted_total / quantities.sum()

def sum_qty(x):
    """Print ``x`` for debugging and return the sum of its ``qty`` attribute."""
    print(x)
    quantities = x.qty
    return quantities.sum()

def sum_quoteQty(x):
    """Return the sum of the ``quoteQty`` attribute of ``x``."""
    quote_quantities = x.quoteQty
    return quote_quantities.sum()

# then apply a dict mapping column names to functions on the Resampler
# (this is the call that raises the AttributeError reported below)
r.apply({"price" : vwap, "qty": sum_qty, "quoteQty": sum_quoteQty})

Issue Description

An AttributeError is raised because the attribute `price` cannot be located: AttributeError: 'Series' object has no attribute 'price'

When executing

r.apply({"price" : vwap, "qty": sum_qty, "quoteQty": sum_quoteQty})

the following debug output is printed by the function vwap:

it's vwap
2021-09-01 00:00:00.023    47150.32
2021-09-01 00:00:00.093    47150.33
2021-09-01 00:00:00.994    47150.33
2021-09-01 00:00:00.994    47150.33
2021-09-01 00:00:00.994    47152.97
2021-09-01 00:00:00.994    47153.48
Name: price, dtype: float64

Then the error message is

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_939/4117684543.py in <module>
----> 1 r.apply({"price" : vwap, "qty": sum_qty, "quoteQty": sum_quoteQty})

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/resample.py in aggregate(self, func, *args, **kwargs)
    332     def aggregate(self, func, *args, **kwargs):
    333 
--> 334         result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
    335         if result is None:
    336             how = func

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/apply.py in agg(self)
    159 
    160         if is_dict_like(arg):
--> 161             return self.agg_dict_like()
    162         elif is_list_like(arg):
    163             # we require a list, but not a 'str'

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/apply.py in agg_dict_like(self)
    433         else:
    434             # key used for column selection and output
--> 435             results = {
    436                 key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
    437             }

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/apply.py in <dictcomp>(.0)
    434             # key used for column selection and output
    435             results = {
--> 436                 key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
    437             }
    438 

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/groupby/generic.py in aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    263 
    264             try:
--> 265                 return self._python_agg_general(func, *args, **kwargs)
    266             except KeyError:
    267                 # TODO: KeyError is raised in _python_agg_general,

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in _python_agg_general(self, func, *args, **kwargs)
   1308             try:
   1309                 # if this function is invalid for this dtype, we will ignore it.
-> 1310                 result = self.grouper.agg_series(obj, f)
   1311             except TypeError:
   1312                 warnings.warn(

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/groupby/ops.py in agg_series(self, obj, func, preserve_dtype)
   1026 
   1027         else:
-> 1028             result = self._aggregate_series_fast(obj, func)
   1029 
   1030         npvalues = lib.maybe_convert_objects(result, try_float=False)

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/groupby/ops.py in _aggregate_series_fast(self, obj, func)
   1249         #  - len(self.bins) > 0
   1250         sbg = libreduction.SeriesBinGrouper(obj, func, self.bins)
-> 1251         result, _ = sbg.get_result()
   1252         return result
   1253 

/SSD/lime/conda/lib/python3.9/site-packages/pandas/_libs/reduction.pyx in pandas._libs.reduction.SeriesBinGrouper.get_result()

/SSD/lime/conda/lib/python3.9/site-packages/pandas/_libs/reduction.pyx in pandas._libs.reduction._BaseGrouper._apply_to_group()

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/groupby/groupby.py in <lambda>(x)
   1294     def _python_agg_general(self, func, *args, **kwargs):
   1295         func = com.is_builtin_func(func)
-> 1296         f = lambda x: func(x, *args, **kwargs)
   1297 
   1298         # iterate through "columns" ex exclusions to populate output dict

/tmp/ipykernel_939/2003501728.py in vwap(x)
      2     print("it's vwap")
      3     print(x)
----> 4     p = x.price
      5     print("it's p")
      6     print(p)

/SSD/lime/conda/lib/python3.9/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5485         ):
   5486             return self[name]
-> 5487         return object.__getattribute__(self, name)
   5488 
   5489     def __setattr__(self, name: str, value) -> None:

AttributeError: 'Series' object has no attribute 'price'

Expected Behavior

Resampler.apply can locate the attributes when it is given a single function:

r.apply(vwap)

which returns

time
2021-09-01 00:00:01    47150.777407
2021-09-01 00:00:02             NaN
2021-09-01 00:00:03    47153.470000
Freq: S, dtype: float64

and the debug prints in vwap show that it can locate the attributes price and qty:

it's vwap
2021-09-01 00:00:00.023    1391646824
2021-09-01 00:00:00.093    1391646825
2021-09-01 00:00:00.994    1391646826
2021-09-01 00:00:00.994    1391646827
2021-09-01 00:00:00.994    1391646828
2021-09-01 00:00:00.994    1391646829
Name: trade Id, dtype: int64
it's vwap
                           trade Id     price    qty  quoteQty  isBuyerMaker
time                                                                        
2021-09-01 00:00:00.023  1391646824  47150.32  0.002     94.30          True
2021-09-01 00:00:00.093  1391646825  47150.33  0.002     94.30         False
2021-09-01 00:00:00.994  1391646826  47150.33  0.021    990.15         False
2021-09-01 00:00:00.994  1391646827  47150.33  0.021    990.15         False
2021-09-01 00:00:00.994  1391646828  47152.97  0.002     94.30         False
2021-09-01 00:00:00.994  1391646829  47153.48  0.006    282.92         False
it's p
time
2021-09-01 00:00:00.023    47150.32
2021-09-01 00:00:00.093    47150.33
2021-09-01 00:00:00.994    47150.33
2021-09-01 00:00:00.994    47150.33
2021-09-01 00:00:00.994    47152.97
2021-09-01 00:00:00.994    47153.48
Name: price, dtype: float64
it's q
time
2021-09-01 00:00:00.023    0.002
2021-09-01 00:00:00.093    0.002
2021-09-01 00:00:00.994    0.021
2021-09-01 00:00:00.994    0.021
2021-09-01 00:00:00.994    0.002
2021-09-01 00:00:00.994    0.006
Name: qty, dtype: float64
it's vwap
Empty DataFrame
Columns: [trade Id, price, qty, quoteQty, isBuyerMaker]
Index: []
it's p
Series([], Name: price, dtype: float64)
it's q
Series([], Name: qty, dtype: float64)
it's vwap
                           trade Id     price    qty  quoteQty  isBuyerMaker
time                                                                        
2021-09-01 00:00:02.050  1391646830  47153.47  0.006    282.92          True
2021-09-01 00:00:02.889  1391646831  47153.47  0.054   2546.28          True
2021-09-01 00:00:02.889  1391646832  47153.47  0.050   2357.67          True
2021-09-01 00:00:02.889  1391646833  47153.47  0.050   2357.67          True
it's p
time
2021-09-01 00:00:02.050    47153.47
2021-09-01 00:00:02.889    47153.47
2021-09-01 00:00:02.889    47153.47
2021-09-01 00:00:02.889    47153.47
Name: price, dtype: float64
it's q
time
2021-09-01 00:00:02.050    0.006
2021-09-01 00:00:02.889    0.054
2021-09-01 00:00:02.889    0.050
2021-09-01 00:00:02.889    0.050
Name: qty, dtype: float64

Installed Versions

INSTALLED VERSIONS
------------------
commit : 5f648bf1706dd75a9ca0d29f26eadfbb595fe52b
python : 3.9.5.final.0
python-bits : 64
OS : Linux
OS-release : 3.10.0-1160.42.2.el7.x86_64
Version : #1 SMP Tue Sep 7 14:49:57 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : fr_CA.UTF-8
LOCALE : fr_CA.UTF-8
pandas : 1.3.2
numpy : 1.20.3
pytz : 2021.1
dateutil : 2.8.2
pip : 21.1.3
setuptools : 52.0.0.post20210125
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.6.3
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.0.1
IPython : 7.27.0
pandas_datareader: 0.10.0
bs4 : None
bottleneck : 1.3.2
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : 3.4.3
numexpr : 2.7.3
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyxlsb : None
s3fs : None
scipy : 1.7.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
numba : None
mroeschke commented 3 years ago

Could you post a minimal, fully copy-pastable, reproducible example? https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports

Mengda-Li commented 3 years ago
import pandas as pd
from pandas import Timestamp
# Minimal sample data: five rows indexed by Timestamp, with the columns
# `price`, `qty` and `quoteQty`.
df = pd.DataFrame({'price': {Timestamp('2021-09-01 00:00:00.023000'): 47150.32,
  Timestamp('2021-09-01 00:00:00.093000'): 47150.33,
  Timestamp('2021-09-01 00:00:00.994000'): 47153.48,
  Timestamp('2021-09-01 00:00:02.050000'): 47153.47,
  Timestamp('2021-09-01 00:00:02.889000'): 47153.47},
 'qty': {Timestamp('2021-09-01 00:00:00.023000'): 0.002,
  Timestamp('2021-09-01 00:00:00.093000'): 0.002,
  Timestamp('2021-09-01 00:00:00.994000'): 0.006,
  Timestamp('2021-09-01 00:00:02.050000'): 0.006,
  Timestamp('2021-09-01 00:00:02.889000'): 0.05},
 'quoteQty': {Timestamp('2021-09-01 00:00:00.023000'): 94.3,
  Timestamp('2021-09-01 00:00:00.093000'): 94.3,
  Timestamp('2021-09-01 00:00:00.994000'): 282.92,
  Timestamp('2021-09-01 00:00:02.050000'): 282.92,
  Timestamp('2021-09-01 00:00:02.889000'): 2357.67}})

# Resample into 1-second bins; head(10) keeps every row here because the
# frame only has five.
r = df.head(10).resample('1s')

def vwap(x):
    """Return the quantity-weighted average price of ``x``.

    ``x`` must expose ``price`` and ``qty`` attributes (e.g. a DataFrame with
    those columns). The input and both columns are printed for debugging
    before the result, ``(price @ qty) / qty.sum()``, is returned.
    """
    print("it's vwap", x, sep="\n")

    # Look up `price` before announcing it, so a missing attribute fails
    # right after the input has been printed.
    prices = x.price
    print("it's p", prices, sep="\n")

    quantities = x.qty
    print("it's q", quantities, sep="\n")

    weighted_total = prices @ quantities
    return weighted_total / quantities.sum()

def sum_qty(x):
    """Print ``x`` for debugging and return the sum of its ``qty`` attribute."""
    print(x)
    quantities = x.qty
    return quantities.sum()

def sum_quoteQty(x):
    """Return the sum of the ``quoteQty`` attribute of ``x``."""
    quote_quantities = x.quoteQty
    return quote_quantities.sum()

# Reproduces the reported AttributeError: with a dict argument, vwap is
# called with only the `price` Series, so `x.price` cannot be resolved.
r.apply({"price" : vwap, "qty": sum_qty, "quoteQty": sum_quoteQty})
sappersapper commented 2 years ago

@Mengda-Li

In my understanding, if the func in Resampler.apply(func, *args, **kwargs) is a function, pandas will try to pass either each column (as a Series) of each grouped DataFrame, or each grouped DataFrame itself, into the function. However, if func is a list or a dict, pandas will only pass each column of each grouped DataFrame into the functions. That is to say, if func is a list or a dict, the functions in it can only accept a Series as input. In your example, r.apply(vwap) executes correctly because the x passed into vwap is a DataFrame, while r.apply({"price": vwap}) raises an error because the x passed into vwap is only the column 'price'.

Maybe the wap_func defined as follows suits your needs:

def wap_func(x):
    """Summarise one group ``x`` as a Series of three statistics.

    ``x`` must expose ``price``, ``qty`` and ``quoteQty`` attributes. The
    returned Series has the entries ``price_dot_qty`` (the ``price``/``qty``
    dot product divided by the ``qty`` sum), ``qty_sum`` and ``quoteQty_sum``.
    """
    prices, quantities = x.price, x.qty
    total_qty = quantities.sum()
    summary = {
        'price_dot_qty': (prices @ quantities) / total_qty,
        'qty_sum': total_qty,
        'quoteQty_sum': x.quoteQty.sum(),
    }
    return pd.Series(summary)

# A single callable is passed here rather than a dict, which is the form the
# issue reports as able to see all columns of each group.
r.apply(wap_func)
sappersapper commented 2 years ago

Could it be a pandas enhancement to accept func = [vwap, sum_qty, sum_quoteQty] in Resampler.apply(func, *args, **kwargs)?

Mengda-Li commented 2 years ago

Could it be a pandas enhancement to accept func = [vwap, sum_qty, sum_quoteQty] in Resampler.apply(func, *args, **kwargs)?

I think so. It would avoid calling Resampler.apply multiple times when we don't know that a function like wap_func can return a pd.Series. (For me, it would save a lot of computing time.)