Closed: auderson closed this issue 3 years ago
confirmed...
ser = pd.Series(np.zeros(5))
ser.iloc[0] = 1000
ser.rolling(3).std()
0 NaN
1 NaN
2 577.350269
3 0.000008
4 0.000008
dtype: float64
This is actually a result of the rolling.std method using the roll_var method, which returns
[nan nan 3.333e+05 5.821e-11 5.821e-11]
and then taking the numpy square root,
[nan nan 5.774e+02 7.629e-06 7.692e-06]
which is what we see in the result.
This has to do with floating point precision and the implementation of roll_var, whose docstring reads: "Numerically stable implementation using Welford's method." So maybe Welford can be improved here? :)
this was fixed for 1.3
@jreback I got my results, and the same problem, on: 1.4.0.dev0+41.gd67f6a678c.dirty
Note: this is called out in the user guide in the warning block https://pandas.pydata.org/docs/user_guide/window.html#overview
Even though 1.3 does have a change in rolling.std/var, we note that there will be numerical imprecision as well: https://pandas.pydata.org/pandas-docs/dev/whatsnew/v1.3.0.html#removed-artificial-truncation-in-rolling-variance-and-standard-deviation
There will likely be no definitive fix for this given the algorithm we're using to calculate rolling aggregations.
Just ran into this bug as well. Kind of nasty, given that you'd totally expect var(same values) to be 0.0 exactly.
Thanks for the report, but since this is an implementation detail related to our algorithm (and it's been documented), I think this issue can be closed.
@mroeschke Hi! I came across a way to fix this today and I'll explain it below. If you think this is helpful enough, I'm willing to make a PR.
The code below is a numba version of rolling var, mostly copied from https://github.com/pandas-dev/pandas/blob/227892332ff058efe8af31a1f61a96ae5aaa0d7a/pandas/_libs/window/aggregations.pyx#L277-L405 except that I modified it to work on 2-d inputs.
import timeit
import numpy as np
import pandas as pd
from numba import njit
@njit(nogil=True, inline='always')
def add_var_old(row, nobs, mean_x,
ssqdm_x, compensation):
""" add a value from the var calc """
# cdef:
# float64_t delta, prev_mean, y, t
for j, val in enumerate(row):
# GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
if val != val:
continue
nobs[j] += 1
# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
prev_mean = mean_x[j] - compensation[j]
y = val - compensation[j]
t = y - mean_x[j]
compensation[j] = t + mean_x[j] - y
delta = t
if nobs[j]:
mean_x[j] = mean_x[j] + delta / nobs[j]
else:
mean_x[j] = 0
ssqdm_x[j] = ssqdm_x[j] + (val - prev_mean) * (val - mean_x[j])
@njit(nogil=True, inline='always')
def calc_var_old(minp, ddof, nobs, ssqdm_x):
# cdef:
# float64_t result
result = np.empty(len(nobs))
for i, (_nobs, _ssqdm_x) in enumerate(zip(nobs, ssqdm_x)):
if (_nobs >= minp) and (_nobs > ddof):
# pathological case
if _nobs == 1:
result[i] = 0
else:
result[i] = _ssqdm_x / (_nobs - ddof)
else:
result[i] = np.nan
return result
@njit(nogil=True)
def roll_var_old(values, wind, minp, ddof=1) -> np.ndarray:
"""
Numerically stable implementation using Welford's method.
"""
# cdef:
# float64_t mean_x, ssqdm_x, nobs, compensation_add,
# float64_t compensation_remove,
# float64_t val, prev, delta, mean_x_old
# int64_t s, e
# Py_ssize_t i, j, N = len(start)
# ndarray[float64_t] output
# bint is_monotonic_increasing_bounds
minp = max(minp, 1)
# is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
# start, end
# )
n, m = values.shape
output = np.empty((n, m), dtype=np.float64)
mean_x = np.zeros(m)
ssqdm_x = np.zeros(m)
nobs = np.zeros(m)
compensation_add = np.zeros(m)
compensation_remove = np.zeros(m)
for i in range(n - wind + 1):
s = i
e = wind + s
# Over the first window, observations can only be added
# never removed
if i == 0:
for j in range(s, e):
add_var_old(values[j], nobs, mean_x, ssqdm_x, compensation_add)
output[j] = np.nan
else:
# After the first window, observations can both be added
# and removed
# calculate deletes
remove_var(values[s - 1], nobs, mean_x, ssqdm_x, compensation_remove)
# calculate adds
add_var_old(values[e - 1], nobs, mean_x, ssqdm_x, compensation_add)
output[e - 1] = calc_var_old(minp, ddof, nobs, ssqdm_x)
return output
@njit(nogil=True, inline='always')
def add_var_new(row, nobs, mean_x,
ssqdm_x, compensation, n_same_value, prev_value):
""" add a value from the var calc """
# cdef:
# float64_t delta, prev_mean, y, t
for j, val in enumerate(row):
# GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
if val != val:
continue
nobs[j] += 1
if val == prev_value[j]:
n_same_value[j] += 1 # incr count
else:
n_same_value[j] = 1 # if not same, set count to 1 (include itself)
prev_value[j] = val # store prev value
# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
prev_mean = mean_x[j] - compensation[j]
y = val - compensation[j]
t = y - mean_x[j]
compensation[j] = t + mean_x[j] - y
delta = t
if nobs[j]:
mean_x[j] = mean_x[j] + delta / nobs[j]
else:
mean_x[j] = 0
ssqdm_x[j] = ssqdm_x[j] + (val - prev_mean) * (val - mean_x[j])
# no change here
@njit(nogil=True, inline='always')
def remove_var(row, nobs, mean_x,
ssqdm_x, compensation):
""" remove a value from the var calc """
# cdef:
# float64_t delta, prev_mean, y, t
for j, val in enumerate(row):
if val == val:
nobs[j] -= 1
if nobs[j]:
# Welford's method for the online variance-calculation
# using Kahan summation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
prev_mean = mean_x[j] - compensation[j]
y = val - compensation[j]
t = y - mean_x[j]
compensation[j] = t + mean_x[j] - y
delta = t
mean_x[j] = mean_x[j] - delta / nobs[j]
ssqdm_x[j] = ssqdm_x[j] - (val - prev_mean) * (val - mean_x[j])
else:
mean_x[j] = 0
ssqdm_x[j] = 0
@njit(nogil=True, inline='always')
def calc_var_new(minp, ddof, nobs,
ssqdm_x, n_same_value):
# cdef:
# float64_t result
result = np.empty(len(nobs))
for i, (_nobs, _ssqdm_x, _nsv) in enumerate(zip(nobs, ssqdm_x, n_same_value)):
if (_nobs >= minp) and (_nobs > ddof):
# pathological case
if _nobs == 1 or _nsv >= _nobs: # if num same value >= num obs
result[i] = 0
else:
result[i] = _ssqdm_x / (_nobs - ddof)
else:
result[i] = np.nan
return result
@njit(nogil=True)
def roll_var_new(values, wind, minp, ddof=1) -> np.ndarray:
"""
Numerically stable implementation using Welford's method.
"""
# cdef:
# float64_t mean_x, ssqdm_x, nobs, compensation_add,
# float64_t compensation_remove,
# float64_t val, prev, delta, mean_x_old
# int64_t s, e
# Py_ssize_t i, j, N = len(start)
# ndarray[float64_t] output
# bint is_monotonic_increasing_bounds
minp = max(minp, 1)
# is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
# start, end
# )
n, m = values.shape
output = np.empty((n, m), dtype=np.float64)
mean_x = np.zeros(m)
ssqdm_x = np.zeros(m)
nobs = np.zeros(m)
compensation_add = np.zeros(m)
compensation_remove = np.zeros(m)
n_same_value = np.zeros(m, dtype=np.int64)
prev_value = values[0].copy() # record first value
for i in range(n - wind + 1):
s = i
e = wind + s
# Over the first window, observations can only be added
# never removed
if i == 0:
for j in range(s, e):
add_var_new(values[j], nobs, mean_x, ssqdm_x, compensation_add, n_same_value, prev_value)
output[j] = np.nan
else:
# After the first window, observations can both be added
# and removed
# calculate deletes
remove_var(values[s - 1], nobs, mean_x, ssqdm_x, compensation_remove)
# calculate adds
add_var_new(values[e - 1], nobs, mean_x, ssqdm_x, compensation_add, n_same_value, prev_value)
output[e - 1] = calc_var_new(minp, ddof, nobs, ssqdm_x, n_same_value)
return output
I record the previous value in add_var_new and count the number of consecutive identical values (n_same_value). If n_same_value >= the number of observations in the window (nobs), then I set the result to zero in calc_var_new.
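The counting rule in isolation looks like this (just a sketch of the rule described above, not the PR code): track how many consecutive equal values end at each position; a window is all-equal exactly when that count reaches the window length.
import numpy as np

def all_same_in_window(values, wind):
    # count consecutive equal values ending at each position; a window of
    # length `wind` ending there is constant iff that count reaches `wind`
    n_same, prev, out = 0, np.nan, []
    for val in values:
        n_same = n_same + 1 if val == prev else 1
        prev = val
        out.append(n_same >= wind)
    return out

print(all_same_in_window([1000., 0., 0., 0., 0., 0.], wind=3))
# [False, False, False, True, True, True] -> these windows get an exact 0.0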
Now let's test it. First, a df with a big number at the top:
df = pd.DataFrame(np.zeros(10))
df.iloc[0] = 1000
df.rolling(5).std()
Out[32]:
0
0 NaN
1 NaN
2 NaN
3 NaN
4 2.000000e+05
5 2.910383e-11
6 2.910383e-11
7 2.910383e-11
8 2.910383e-11
9 2.910383e-11
This is the same as the numba version of the current implementation (roll_var_old):
roll_var_old(df.values, 5, 5, 1)
Out[39]:
array([[ nan],
[ nan],
[ nan],
[ nan],
[2.00000000e+05],
[2.91038305e-11],
[2.91038305e-11],
[2.91038305e-11],
[2.91038305e-11],
[2.91038305e-11]])
My version:
roll_var_new(df.values, 5, 5, 1)
Out[40]:
array([[ nan],
[ nan],
[ nan],
[ nan],
[200000.],
[ 0.],
[ 0.],
[ 0.],
[ 0.],
[ 0.]])
array = np.random.randn(1000, 10)
times_old = timeit.repeat('roll_var_old(array, 10, 10, 1)', number=1000, globals=globals())
np.sum(times_old)
# 0.1767864041030407
times_new = timeit.repeat('roll_var_new(array, 10, 10, 1)', number=1000, globals=globals())
np.sum(times_new)
# 0.20479978621006012
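As a quick sanity check (using the roll_var_old and roll_var_new functions defined above): on random data there are effectively no repeated values, so n_same_value stays at 1, the new guard never triggers, and the two versions should agree.
check = np.random.randn(1000, 10)
old = roll_var_old(check, 10, 10, 1)
new = roll_var_new(check, 10, 10, 1)
print(np.allclose(old, new, equal_nan=True))  # expected: True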
So the 2D input is the reason why there are no floating point artifacts? I can't easily discern what the old vs new difference is based on all the code.
But generally, if different processing removes these floating point artifacts (and passes all the tests), IMO a performance hit is okay.
So in short, the new algo will record and count how many consecutive identical values have appeared:
# in `add_var`
if val == prev_value[0] and val != MAXfloat64 and val != MINfloat64:
n_same_value[0] += 1 # incr count
else:
n_same_value[0] = 1 # if not same, reset count to 1 (include itself)
prev_value[0] = val # store prev value
If the values in the window are all the same, we use 0 as the result instead of _ssqdm_x / (_nobs - ddof):
# in `calc_var`
if (nobs >= minp) and (nobs > ddof):
# pathological case & repeatedly same values case
if nobs == 1 or num_consecutive_same_value >= nobs:
result = 0
else:
result = ssqdm_x / (nobs - <float64_t>ddof)
else:
result = NaN
The floating point artifacts come from _ssqdm_x / (_nobs - ddof), and if we know the values are actually the same, I think it's safe to use 0 as the result (instead of 2.91038305e-11 from the example).
[x] I have checked that this issue has not already been reported.
[x] I have confirmed this bug exists on the latest version of pandas.
[ ] (optional) I have confirmed this bug exists on the master branch of pandas.
Problem description
If we have a relatively big number at the top and the remaining values are zeros, then the rolling result is not exactly zero.
Expected Output
Output of pd.show_versions()