Open RunnerupWang opened 2 months ago
```python
#!/usr/bin/env python3
""" @author: runnerup Wang """
import pandas as pd
def bin_monotonic(table, feature, direction):
    """Merge adjacent bins so the bad rate is monotonic across bins.

    Parameters
    ----------
    table : pd.DataFrame
        Bin statistics containing at least the columns ``feature``,
        ``'total'`` (sample count) and ``'bad'`` (bad-sample count).
        NOTE: modified in place — a ``feature + '_Merge'`` column is added.
    feature : str
        Name of the binned feature column in ``table``.
    direction : float
        Spearman correlation coefficient used to choose the monotonic
        direction: > 0 enforces an increasing bad rate, <= 0 a
        decreasing one.

    Returns
    -------
    bin_merge_dict : dict
        Mapping from original bin position to merged bin position.
    table_merge : pd.DataFrame
        Statistics for the merged bins: 'bad', 'total' and 'badrate',
        indexed by the merged bin label.
    """
    init_table = table[[feature] + ['total', 'bad']].copy()
    # For each starting bin i, compute the cumulative bad rate obtained
    # by merging bin i with every suffix of the following bins.
    for i in range(len(init_table)):
        subset = init_table.iloc[i:].copy()
        # Cumulative sample counts and bad-sample counts.
        subset['cum_total'] = subset['total'].cumsum()
        subset['cum_bad'] = subset['bad'].cumsum()
        # Cumulative bad rates.
        subset['cum_badrate'] = subset['cum_bad'] / subset['cum_total']
        # Put back into the initial table as a new column; rows 0..i-1
        # get None (pandas stores them as NaN, so idxmin/idxmax skip them).
        init_table['bad_rate_{0}'.format(i)] = [None] * i + list(subset['cum_badrate'])
    # Columns 3: are the bad_rate_i columns.  Pick, per starting bin, the
    # merge end-point whose cumulative rate is extreme in the desired
    # direction: max for a decreasing trend (direction <= 0), min for an
    # increasing one.
    if direction <= 0:
        min_max_indices = pd.DataFrame(init_table.iloc[:, 3:].idxmax()).reset_index(drop=True)
    else:
        min_max_indices = pd.DataFrame(init_table.iloc[:, 3:].idxmin()).reset_index(drop=True)
    # cummax keeps merge targets from moving backwards, so the bin
    # assignment is a non-decreasing step function over the original bins.
    min_max_indices[0] = min_max_indices[0].cummax()
    bin_merge_dict = min_max_indices[0].to_dict()
    table[feature + '_Merge'] = table[feature].map(bin_merge_dict)
    table_merge = table.groupby(feature + '_Merge').agg(
        bad=('bad', 'sum'),
        total=('total', 'sum'),
    )
    table_merge['badrate'] = table_merge['bad'] / table_merge['total']
    return bin_merge_dict, table_merge
table = pd.DataFrame({'A':list(range(11)) , 'total':[2437,20720,16813,12679,5647,8232,5445,5276,5432,3514,4681], 'bad':[41,442,366,265,106,152,106,76,76,43,44]}) ex1_dict,ex1_table = bin_monotonic(table,'A',-0.05)
# --- Example 2: plug the monotonic merge into a toad binning workflow. ---
import pandas as pd
import numpy as np
import toad

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

data = pd.read_csv('/test_data.csv')
print('Shape:', data.shape)
data.head(10)
train = data[:300]
OOT = data[300:500]

# NOTE(review): `train_selected` and `to_drop` are referenced below but are
# never defined in this snippet — presumably the output of a feature-selection
# step (e.g. toad.selection.select); confirm before running.
c = toad.transform.Combiner()
c.fit(train_selected.drop(to_drop, axis=1), y='target', method='quantile')
bin_ori = c.export()

# Round split points to 2 decimals and drop duplicates the rounding creates.
bin_adj = bin_ori
for k, v in bin_ori.items():
    v = [round(i, 2) for i in v]
    v = list(dict.fromkeys(v))
    bin_adj[k] = v
c.update(bin_adj)

from toad.plot import bin_plot
col = 'A'
bin_plot(c.transform(train_selected[[col, 'target']], labels=True), x=col, target='target')

from toad.stats import IV, feature_bin_stats
from scipy.stats import spearmanr

# Spearman correlation between the binned feature and the target decides
# the monotonic direction passed to bin_monotonic.
df_temp = c.transform(train_selected[[col, 'target']], labels=False)
corr = spearmanr(df_temp[col], df_temp['target'])[0]
table = feature_bin_stats(df_temp, col, 'target')
ex2_dict, ex2_table = bin_monotonic(table, 'A', corr)

# Keep only the split points that survive the merge and push them back
# into the Combiner, then re-plot to inspect the monotonic result.
pos_list = list(set(ex2_dict.values()))
split_list = bin_adj[col]
split_list_merge = [split_list[i] for i in pos_list if i < len(split_list)]
rule = {col: split_list_merge}
c.update(rule)
bin_plot(c.transform(train_selected[[col, 'target']], labels=True), x=col, target='target')
```
The existing binning methods, such as chi-square, decision-tree, and quantile binning, cannot guarantee monotonicity for continuous features. For a scorecard in commercial use we usually require interpretability, so a monotonic bad rate across bins is needed. I suggest adding a monotonicity option on top of the existing binning methods, especially the quantile binning method. Looking forward to your reply — thanks a lot.