Open RunnerupWang opened 2 months ago
```python
#!/usr/bin/env python3
""" @author: runnerup Wang """
import pandas as pd
def bin_monotonic(table, feature, direction):
    """Merge adjacent bins so the bad rate is monotonic across bins.

    Parameters
    ----------
    table : pd.DataFrame
        Bin statistics containing at least the columns ``feature``,
        ``'total'`` (sample count) and ``'bad'`` (bad-sample count).
        NOTE: modified in place — a ``feature + '_Merge'`` column is added.
    feature : str
        Name of the binned feature column in ``table``.
    direction : float
        Spearman correlation coefficient used to choose the monotonic
        direction: > 0 enforces an increasing bad rate, <= 0 a
        decreasing one.

    Returns
    -------
    bin_merge_dict : dict
        Mapping from original bin position to merged bin position.
    table_merge : pd.DataFrame
        Statistics for the merged bins: 'bad', 'total' and 'badrate',
        indexed by the merged bin label.
    """
    init_table = table[[feature] + ['total', 'bad']].copy()
    # For each starting bin i, compute the cumulative bad rate obtained
    # by merging bin i with every suffix of the following bins.
    for i in range(len(init_table)):
        subset = init_table.iloc[i:].copy()
        # Cumulative sample counts and bad-sample counts.
        subset['cum_total'] = subset['total'].cumsum()
        subset['cum_bad'] = subset['bad'].cumsum()
        # Cumulative bad rates.
        subset['cum_badrate'] = subset['cum_bad'] / subset['cum_total']
        # Put back into the initial table as a new column; rows 0..i-1
        # get None (pandas stores them as NaN, so idxmin/idxmax skip them).
        init_table['bad_rate_{0}'.format(i)] = [None] * i + list(subset['cum_badrate'])
    # Columns 3: are the bad_rate_i columns.  Pick, per starting bin, the
    # merge end-point whose cumulative rate is extreme in the desired
    # direction: max for a decreasing trend (direction <= 0), min for an
    # increasing one.
    if direction <= 0:
        min_max_indices = pd.DataFrame(init_table.iloc[:, 3:].idxmax()).reset_index(drop=True)
    else:
        min_max_indices = pd.DataFrame(init_table.iloc[:, 3:].idxmin()).reset_index(drop=True)
    # cummax keeps merge targets from moving backwards, so the bin
    # assignment is a non-decreasing step function over the original bins.
    min_max_indices[0] = min_max_indices[0].cummax()
    bin_merge_dict = min_max_indices[0].to_dict()
    table[feature + '_Merge'] = table[feature].map(bin_merge_dict)
    table_merge = table.groupby(feature + '_Merge').agg(
        bad=('bad', 'sum'),
        total=('total', 'sum'),
    )
    table_merge['badrate'] = table_merge['bad'] / table_merge['total']
    return bin_merge_dict, table_merge
table = pd.DataFrame({'A':list(range(11)) , 'total':[2437,20720,16813,12679,5647,8232,5445,5276,5432,3514,4681], 'bad':[41,442,366,265,106,152,106,76,76,43,44]}) ex1_dict,ex1_table = bin_monotonic(table,'A',-0.05)
# --- Example 2: plug the monotonic merge into a toad binning workflow. ---
import pandas as pd
import numpy as np
import toad

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

data = pd.read_csv('/test_data.csv')
print('Shape:', data.shape)
data.head(10)
train = data[:300]
OOT = data[300:500]

# NOTE(review): `train_selected` and `to_drop` are referenced below but are
# never defined in this snippet — presumably the output of a feature-selection
# step (e.g. toad.selection.select); confirm before running.
c = toad.transform.Combiner()
c.fit(train_selected.drop(to_drop, axis=1), y='target', method='quantile')
bin_ori = c.export()

# Round split points to 2 decimals and drop duplicates the rounding creates.
bin_adj = bin_ori
for k, v in bin_ori.items():
    v = [round(i, 2) for i in v]
    v = list(dict.fromkeys(v))
    bin_adj[k] = v
c.update(bin_adj)

from toad.plot import bin_plot
col = 'A'
bin_plot(c.transform(train_selected[[col, 'target']], labels=True), x=col, target='target')

from toad.stats import IV, feature_bin_stats
from scipy.stats import spearmanr

# Spearman correlation between the binned feature and the target decides
# the monotonic direction passed to bin_monotonic.
df_temp = c.transform(train_selected[[col, 'target']], labels=False)
corr = spearmanr(df_temp[col], df_temp['target'])[0]
table = feature_bin_stats(df_temp, col, 'target')
ex2_dict, ex2_table = bin_monotonic(table, 'A', corr)

# Keep only the split points that survive the merge and push them back
# into the Combiner, then re-plot to inspect the monotonic result.
pos_list = list(set(ex2_dict.values()))
split_list = bin_adj[col]
split_list_merge = [split_list[i] for i in pos_list if i < len(split_list)]
rule = {col: split_list_merge}
c.update(rule)
bin_plot(c.transform(train_selected[[col, 'target']], labels=True), x=col, target='target')
```
The existing binning methods, such as chi-square, decision-tree, and quantile binning, cannot guarantee monotonicity for continuous features. For a scorecard in commercial use we usually require interpretability, so a monotonic bad rate across bins is needed. I suggest adding a monotonicity option on top of the existing binning methods, especially the quantile binning method. Looking forward to your reply — thanks a lot.