Deltares-research / kenmerkendewaarden

Derive indicators from waterlevel measurements
https://deltares-research.github.io/kenmerkendewaarden/
GNU General Public License v3.0
1 stars 0 forks source link

add support for pandas>=2.2.0 #53

Closed veenstrajelmer closed 2 months ago

veenstrajelmer commented 2 months ago

With pandas 2.2.0 we get different havengetallen values, because all culmination hours seem to be shifted. This code will help to debug it further. All assertions will succeed with pandas<=2.1.0, but some will fail with pandas>=2.2.0.

### HAVENGETALLEN
import kenmerkendewaarden as kw
import hatyan
hatyan.close("all")
import numpy as np
import logging
logging.basicConfig(format='%(message)s')
logging.getLogger("kenmerkendewaarden").setLevel(level="INFO")
import datetime as dt
import pandas as pd

# dir_meas = r"c:\Users\veenstra\Downloads\data_temp"
dir_meas = r"p:\11210325-005-kenmerkende-waarden\work\measurements_wl_18700101_20240101"
dir_meas = r"c:\Users\veenstra\Downloads"

df_ext = kw.read_measurements(dir_output=dir_meas, station="HOEKVHLD", extremes=True)
df_ext_12 = hatyan.calc_HWLW12345to12(df_ext)
tstop_str = "2010" # "2010" "2020"
df_ext_12 = df_ext_12.loc["2010":tstop_str]

# df_havengetallen = kw.calc_havengetallen(df_ext=df_ext_12)
current_station = df_ext_12.attrs["station"]
# TODO: check culm_addtime and HWLWno+4 offsets. culm_addtime could also be 2 days or 2days +1h GMT-MET correction. 20 minutes seems odd since moonculm is about tidal wave from ocean
# culm_addtime is a 2d and 2u20min correction, this shifts the x-axis of aardappelgrafiek
# HW is 2 days after culmination (so 4x25min difference between length of avg moonculm and length of 2 days)
# 1 hour (GMT to MET, alternatively we can also account for timezone differences elsewhere)
# 20 minutes (0 to 5 meridian)
culm_addtime = 4*dt.timedelta(hours=12,minutes=25) + dt.timedelta(hours=1) - dt.timedelta(minutes=20)

# TODO: move calc_HWLW_moonculm_combi() to top since it is the same for all stations
# TODO: we added tz_localize on 29-5-2024 (https://github.com/Deltares-research/kenmerkendewaarden/issues/30)
# this means we pass a UTC+1 timeseries as if it were a UTC timeseries
if df_ext.index.tz is not None:
    df_ext_12 = df_ext_12.tz_localize(None)

# kw.havengetallen.get_moonculm_idxHWLWno()
data_pd_moonculm = hatyan.astrog.astrog_culminations(tFirst=df_ext_12.index.min()-dt.timedelta(days=3),tLast=df_ext_12.index.max()) # in UTC, which is important since data_pd_HWLW['culm_hr']=range(12) hourvalues should be in UTC since that relates to the relation dateline/sun
data_pd_moonculm['datetime'] = data_pd_moonculm['datetime'].dt.tz_convert('UTC') #convert to UTC (is already)
data_pd_moonculm['datetime'] = data_pd_moonculm['datetime'].dt.tz_localize(None) #remove timezone
data_pd_moonculm = data_pd_moonculm.set_index('datetime',drop=False)
data_pd_moonculm['values'] = data_pd_moonculm['type'] #dummy values for TA in hatyan.calc_HWLWnumbering()
data_pd_moonculm['HWLWcode'] = 1 #all HW values since one every ~12h25m
data_pd_moonculm = hatyan.calc_HWLWnumbering(data_pd_moonculm,doHWLWcheck=False) #TODO: currently w.r.t. cadzd, is that an issue? With DELFZL the matched culmination is incorrect (since far away), but that might not be a big issue
moonculm_idxHWLWno = data_pd_moonculm.set_index('HWLWno')

df_ext_12_culm = kw.havengetallen.calc_HWLW_moonculm_combi(data_pd_HWLW_12=df_ext_12, culm_addtime=culm_addtime) #culm_addtime=None provides the same gemgetijkromme now delay is not used for scaling anymore
# moonculm_idxHWLWno = kw.havengetallen.get_moonculm_idxHWLWno(tstart=df_ext_12.index.min()-dt.timedelta(days=3),tstop=df_ext_12.index.max())
# moonculm_idxHWLWno.index = moonculm_idxHWLWno.index + 4 #correlate HWLW to moonculmination 2 days before. TODO: check this offset in relation to culm_addtime.

# data_pd_HWLW_idxHWLWno = hatyan.calc_HWLWnumbering(df_ext_12)
# data_pd_HWLW_idxHWLWno['times'] = data_pd_HWLW_idxHWLWno.index
# data_pd_HWLW_idxHWLWno = data_pd_HWLW_idxHWLWno.set_index('HWLWno',drop=False)

# HW_bool = data_pd_HWLW_idxHWLWno['HWLWcode']==1
# data_pd_HWLW_idxHWLWno.loc[HW_bool,'getijperiod'] = data_pd_HWLW_idxHWLWno.loc[HW_bool,'times'].iloc[1:].values - data_pd_HWLW_idxHWLWno.loc[HW_bool,'times'].iloc[:-1] #this works properly since index is HWLW
# data_pd_HWLW_idxHWLWno.loc[HW_bool,'duurdaling'] = data_pd_HWLW_idxHWLWno.loc[~HW_bool,'times'] - data_pd_HWLW_idxHWLWno.loc[HW_bool,'times']
# data_pd_HWLW_idxHWLWno['culm_time'] = moonculm_idxHWLWno['datetime'] #couple HWLW to moonculminations two days earlier (this works since index is HWLWno)
# data_pd_HWLW_idxHWLWno['culm_hr'] = (data_pd_HWLW_idxHWLWno['culm_time'].round('h').dt.hour)%12
# data_pd_HWLW_idxHWLWno['HWLW_delay'] = data_pd_HWLW_idxHWLWno['times'] - data_pd_HWLW_idxHWLWno['culm_time']
# if culm_addtime is not None:
#     data_pd_HWLW_idxHWLWno['HWLW_delay'] -= culm_addtime
# df_ext_12_culm = data_pd_HWLW_idxHWLWno.set_index('times')

# TODO: this plot is shifted between pandas 2.1.0 and pandas 2.2.0
df_ext_12_culm["culm_hr"].plot()

if tstop_str=="2010":
    # df_ext_12_culm times/length seem identical between pandas versions
    assert df_ext_12_culm.index[0] == pd.Timestamp('2010-01-01 02:35:00')
    assert df_ext_12_culm.index[-1] == pd.Timestamp('2010-12-31 23:50:00')
    assert len(df_ext_12_culm) == 1411

    # moonculm_idxHWLWno seem identical between pandas versions
    assert len(moonculm_idxHWLWno) == 711
    assert (moonculm_idxHWLWno.index[:8].values == np.array([7053, 7054, 7055, 7056, 7057, 7058, 7059, 7060])).all() # do note `moonculm_idxHWLWno.index + 4`
    assert (moonculm_idxHWLWno.index[-8:].values == np.array([7756, 7757, 7758, 7759, 7760, 7761, 7762, 7763])).all() # do note `moonculm_idxHWLWno.index + 4`
    assert moonculm_idxHWLWno["datetime"].min() == pd.Timestamp('2009-12-29 09:37:07.049591019')
    assert moonculm_idxHWLWno["datetime"].max() == pd.Timestamp('2010-12-31 20:57:15.209495865')
    assert moonculm_idxHWLWno["datetime"].iloc[0] == pd.Timestamp('2009-12-29 09:37:07.049591019')
    assert moonculm_idxHWLWno["datetime"].iloc[-1] == pd.Timestamp('2010-12-31 20:57:15.209495865')

    # df_ext_12_culm culm_hr seems shifted between pandas version, also check the plot created above
    assert np.isclose(df_ext_12_culm["culm_hr"].min(), 0)
    assert np.isclose(df_ext_12_culm["culm_hr"].max(), 11)
    assert np.isclose(df_ext_12_culm["culm_hr"].iloc[0], 10)
    assert np.isclose(df_ext_12_culm["culm_hr"].iloc[-1], 7)
    count_hr = df_ext_12_culm.groupby("culm_hr").count()["values"].values
    assert (count_hr == np.array([116, 112, 116, 122, 118, 116, 124, 123, 116, 112, 110, 126])).all() # pandas 2.1.0
    # assert (count_hr == np.array([112, 116, 120, 122, 120, 120, 118, 121, 110, 112, 120, 120])).all() # pandas 2.2.0
    assert df_ext_12_culm["HWLW_delay"].min() == pd.Timedelta('0 days 00:09:26.723818778')
    assert df_ext_12_culm["HWLW_delay"].max() == pd.Timedelta('0 days 10:58:41.926870968')
    assert df_ext_12_culm["HWLW_delay"].iloc[0] == pd.Timedelta('0 days 02:06:52.757993453')
    assert df_ext_12_culm["HWLW_delay"].iloc[-1] == pd.Timedelta('0 days 02:21:37.475543290')
elif tstop_str=="2020":
    assert df_ext_12_culm.index[0] == pd.Timestamp('2010-01-01 02:35:00')
    assert df_ext_12_culm.index[-1] == pd.Timestamp('2020-12-31 22:59:00')
    assert len(df_ext_12_culm) == 15528
    assert np.isclose(df_ext_12_culm["culm_hr"].min(), 0)
    assert np.isclose(df_ext_12_culm["culm_hr"].max(), 11)
    assert np.isclose(df_ext_12_culm["culm_hr"].iloc[0], 10)
    assert np.isclose(df_ext_12_culm["culm_hr"].iloc[-1], 11)
    count_hr = df_ext_12_culm.groupby("culm_hr").count()["values"].values
    assert (count_hr == np.array([1238, 1288, 1292, 1298, 1304, 1288, 1352, 1290, 1332, 1290, 1300, 1256])).all() # pandas 2.1.0
    # assert (count_hr == np.array([1258, 1314, 1280, 1306, 1292, 1348, 1298, 1304, 1312, 1296, 1290, 1230])).all() # pandas 2.2.0
    assert df_ext_12_culm["HWLW_delay"].min() == pd.Timedelta('0 days 00:05:31.289305443')
    assert df_ext_12_culm["HWLW_delay"].max() == pd.Timedelta('0 days 12:17:03.623306535')
    assert df_ext_12_culm["HWLW_delay"].iloc[0] == pd.Timedelta('0 days 02:06:52.757993453')
    assert df_ext_12_culm["HWLW_delay"].iloc[-1] == pd.Timedelta('0 days 09:10:15.536207002')

Todo:

veenstrajelmer commented 2 months ago

This happens because of a bug in pandas>=2.2.0: https://github.com/pandas-dev/pandas/issues/57781 It is solved by adding the .dt accessor before rounding.