Refactor and reproduce overschrijdingsfrequenties method

Todo:

[x] import hatyan in overschrijding.py, if not compute_overschrijding() will fail >> removed hatyan from commented code
[x] rename compute_overschrijding() to calc_overschrijding() for consistency, also rename plot_distributions() to plot_overschrijding()
[x] no prints within modules, use logging.info() instead (or different level)
[x] use physical break function from https://github.com/Deltares-research/kenmerkendewaarden/pull/61
[x] make interpolate_interested_Tfreqs_to_csv() modular by calling interpolate_interested_Tfreqs()
[x] gecombineerd line does not follow validation line anymore >> fixed with "blend again section" in example code below, but kw.overschrijding.blend_distributions() is not a public function >> solved by adding dist optional argument with which Hydra-NL can already be included in dataframe upon calling calc_overschrijding
[x] make blend_distributions public if required >> not required anymore
[x] include interpolation in calc_overschrijding()
[x] add check for aggers that should not be present with kenmerkendewaarden.utils.raise_extremes_with_aggers (including pytest.raises testcase)
[x] add tests for overschrijdingsfreqs with df_ext_2020_2014 as input
[x] add docstrings

Reproducible code:

import os
import pandas as pd
import matplotlib.pyplot as plt
plt.close('all')
import hatyan
import kenmerkendewaarden as kw # pip install git+https://github.com/Deltares-research/kenmerkendewaarden
from kenmerkendewaarden.overschrijding import blend_distributions # TODO: make public function if required

tstop_dt = "2021-01-01"

dir_base = r'p:\11210325-005-kenmerkende-waarden\work'
dir_meas = os.path.join(dir_base,'measurements_wl_18700101_20240101') # this data is retrieved from the DDL with ddlpy
dir_vali_overschr = os.path.join(dir_base,'data_overschrijding') # TODO: this data is not reproducible yet

stat_list = ['HOEKVHLD']

for current_station in stat_list:

    print(f'loading data for {current_station}')
    data_pd_HWLW_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=True)

    data_pd_HWLW_all_12 = hatyan.calc_HWLW12345to12(data_pd_HWLW_all) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater)
    data_pd_measext = data_pd_HWLW_all_12.loc[:tstop_dt] # only include data up to year_slotgem

    ###OVERSCHRIJDINGSFREQUENTIES
    Tfreqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200, #overschrijdingsfreqs
                         1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000]

    print(f'overschrijdingsfrequenties for {current_station}')
    data_pd_HW = data_pd_measext.loc[data_pd_measext['HWLWcode']==1]
    data_pd_LW = data_pd_measext.loc[data_pd_measext['HWLWcode']!=1]

    #get Hydra-NL and KWK-RMM validation data (only for HOEKVHLD)
    dist_vali_exc = {}
    dist_vali_dec = {}
    if current_station =='HOEKVHLD':
        stat_name = 'Hoek_van_Holland'
        print('Load Hydra-NL distribution data and other validation data')
        dist_vali_exc['Hydra-NL'] = pd.read_csv(os.path.join(dir_vali_overschr,'Processed_HydraNL','Without_model_uncertainty',f'{stat_name}.csv'), sep=';', header=[0])
        dist_vali_exc['Hydra-NL']['values'] /= 100 # cm to m
        dist_vali_exc['Hydra-NL met modelonzekerheid'] = pd.read_csv(os.path.join(dir_vali_overschr,'Processed_HydraNL','With_model_uncertainty',f'{stat_name}_with_model_uncertainty.csv'), sep=';', header=[0])
        dist_vali_exc['Hydra-NL met modelonzekerheid']['values'] /= 100 # cm to m
        file_vali_exeed = os.path.join(dir_vali_overschr,'Tables','Exceedance_lines',f'Exceedance_lines_{stat_name}.csv')
        if os.path.exists(file_vali_exeed):
            dist_vali_exc['validation'] = pd.read_csv(file_vali_exeed,sep=';')
            dist_vali_exc['validation']['values'] /= 100
        # file_vali_dec = os.path.join(dir_vali_overschr,'Tables','Deceedance_lines',f'Deceedance_lines_{stat_name}.csv')
        # if os.path.exists(file_vali_dec):
        #     dist_vali_dec['validation'] = pd.read_csv(file_vali_dec,sep=';')
        #     dist_vali_dec['validation']['values'] /= 100

    #set station rules
    station_rule_type = 'break'
    station_break_value = data_pd_measext.index.min()

    # 1. Exceedance
    print('Exceedance') #TODO: hatyan.get_weibull.der_pfunc() throws "RuntimeWarning: invalid value encountered in double_scalars"
    dist_exc = kw.compute_overschrijding(data_pd_HW, rule_type=station_rule_type, rule_value=station_break_value)
    dist_exc.update(dist_vali_exc)

    # blend again
    dist_exc['Gecombineerd'] = blend_distributions(dist_exc['Trendanalyse'].copy().reset_index(drop=True),
                                                   dist_exc['Weibull'].copy(),
                                                   dist_exc['Hydra-NL'].copy(), 
                                                   )

    df_interp = kw.interpolate_interested_Tfreqs(dist_exc['Gecombineerd'], Tfreqs=Tfreqs_interested)

    fig, ax = kw.plot_distributions(dist_exc, name=current_station, color_map='default')
    ax.set_ylim(0,5.5)

    # # 2. Deceedance
    # print('Deceedance')
    # dist_dec = kw.compute_overschrijding(data_pd_LW, rule_type=station_rule_type, rule_value=station_break_value, inverse=True)
    # dist_dec.update(dist_vali_dec)
    # df_interp = kw.interpolate_interested_Tfreqs(dist_dec['Gecombineerd'], Tfreqs=Tfreqs_interested)

    # fig, ax = kw.plot_distributions(dist_dec, name=current_station, color_map='default')

Gives:

Deltares-research / kenmerkendewaarden

Refactor and reproduce overschrijdingsfrequenties method #26