There is one BNdata.discretize method and one function discretize defined in other_functions.py, and yet a third defined in Helper_functions.py. They output different results, but the only one that's referenced in the script is the one from the Helper_functions.py script.
Suggested solution
Would it be possible to remove (or rename) the BNdata.discretize method so as to not confuse future users?
Code
(Code pasted here for easy access.)
BNdata.discretize
def discretize(self, binRangesDict, plot=False):
    """Assign every value in ``self.data`` to the index of its bin.

    Binning rules (applied per column, bins taken in list order):

    * the first bin is closed on both sides: ``low <= value <= high``;
    * every later bin is half-open: ``low < value <= high``, so a value
      lying on an edge shared by two adjacent bins is labelled and
      counted exactly once, in the lower bin;
    * values below the first bin's lower edge go to the first bin and
      values above the last bin's upper edge go to the last bin.

    Values that match no bin (for example NaN, or a value falling in a
    gap between bins) stay NaN in the result.

    Parameters
    ----------
    binRangesDict : dict
        Maps each column name of ``self.data`` to a list of
        ``[low, high]`` pairs. Every column that has rows must have an
        entry; a missing entry fails with ``TypeError`` on the first row.
        The dictionary is not modified.
    plot : bool
        Accepted for interface compatibility; not used here.

    Returns
    -------
    list of dict
        One record per row of ``self.data`` (``to_dict(orient='records')``).
        The same list is stored on ``self.binnedData``.
    """
    binnedDf = pd.DataFrame().reindex_like(self.data)

    # One single-element counter per bin, e.g. {'x': [[0], [0], [0]]}.
    binCountsDict = {
        key: [[0] for _ in ranges] for key, ranges in binRangesDict.items()
    }

    for colPos, varName in enumerate(list(self.data)):
        # Bin ranges that belong to this column.
        discreteRanges = binRangesDict.get(varName)
        for rowPos, value in enumerate(self.data[varName]):
            lastBin = len(discreteRanges) - 1
            for i, binRange in enumerate(discreteRanges):
                low, high = binRange[0], binRange[1]
                if i == 0:
                    inside = low <= value <= high
                else:
                    # Exclude the lower edge so a shared edge is not
                    # counted in two bins.
                    inside = low < value <= high
                belowMin = i == 0 and low > value
                aboveMax = i == lastBin and high < value
                if inside or belowMin or aboveMax:
                    # Positional scalar write: chained ``iloc[row][col] = x``
                    # may modify a temporary copy and lose the value.
                    binnedDf.iat[rowPos, colPos] = i
                    binCountsDict[varName][i][0] += 1

    binnedData = binnedDf.to_dict(orient='records')  # a list of dictionaries
    self.binnedData = binnedData
    # Single-argument print calls give the same output on Python 2 and 3.
    print('%s %s' % ('train binCountdict ', binCountsDict))
    print('%s %s' % ('binned_trainingData ', binnedData))
    return binnedData
other_functions.discretize
def discretize(data, vars_to_discretize, n_bins):
    '''
    Discretize selected columns of ``data``.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame``
        The input is copied; the caller's object is not modified.
    vars_to_discretize : dict
        Maps a column label to its discretization method:
        ``'Equal'`` (equal-width intervals via ``pd.cut``),
        ``'Freq'`` (equal-frequency intervals via ``pd.qcut``) or
        ``'Bins'`` (caller-supplied bin margins via ``np.digitize``).
    n_bins : dict
        Maps a column label to the number of bins (``'Equal'``/``'Freq'``)
        or to the list of bin margins (``'Bins'``).

    Returns
    -------
    (DataFrame, dict)
        The data with the selected columns replaced by bin indices, and
        the bin definition used for each discretized column.

    Raises
    ------
    ValueError
        If a method name is not one of the three above, or if ``'Freq'``
        cannot produce valid bins even after reducing the bin count to 1.
    '''
    data_subset = pd.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        method = vars_to_discretize[i]
        column = data_subset[i]

        if method == 'Equal':
            # discretize by splitting into equal intervals
            out, binning = pd.cut(column, bins=n_bins[i], labels=False,
                                  retbins=True)
        elif method == 'Freq':
            # discretize by frequency; when the requested number of
            # quantiles yields invalid (e.g. duplicate) edges, retry with
            # one bin fewer
            nb = n_bins[i]
            while True:
                try:
                    out, binning = pd.qcut(column, q=nb, labels=False,
                                           retbins=True)
                    break
                except ValueError:
                    nb -= 1
                    if nb < 1:
                        # Without this guard the retry loop never ends.
                        raise ValueError(
                            'Cannot discretize %r by frequency with any '
                            'number of bins between 1 and %r'
                            % (i, n_bins[i]))
        elif method == 'Bins':
            # discretize based on provided bin margins
            out = np.digitize(column, n_bins[i], right=True) - 1
            binning = n_bins[i]
        else:
            # Previously an unknown method silently wiped the column.
            raise ValueError(
                'Unknown discretization method %r for %r; expected '
                "'Equal', 'Freq' or 'Bins'" % (method, i))

        data_subset[i] = out
        # Replace NA values with a special index (1 + max) if that has not
        # already happened automatically, as it does in np.digitize.
        missing = data_subset[i].isnull()
        if missing.any():
            data_subset.loc[missing, i] = data_subset[i].max() + 1
        bins[i] = binning
    return data_subset, bins
Helper_functions.discretize
def discretize(dataframe, binRangesDict, plot=False):
    """Assign the values of selected columns to the index of their bin.

    Only the columns named in ``binRangesDict`` are binned; every other
    column of ``dataframe`` is left as NaN in the result.

    Binning rules (applied per column, bins taken in list order):

    * the first bin is closed on both sides: ``low <= value <= high``;
    * every later bin is half-open: ``low < value <= high``;
    * values below the first bin's lower edge go to the first bin and
      values above the last bin's upper edge go to the last bin.

    Values that match no bin (for example NaN) stay NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to bin. It is not modified.
    binRangesDict : dict
        Maps a column name to a list of ``[low, high]`` pairs. It is not
        modified.
    plot : bool
        Accepted for interface compatibility; not used here.

    Returns
    -------
    (list of dict, pandas.DataFrame, dict)
        The binned data as row records, the binned data as a frame shaped
        like ``dataframe``, and the per-bin counts in the form
        ``{column: [[count], [count], ...]}``.
    """
    binnedDf = pd.DataFrame().reindex_like(dataframe)

    # One single-element counter per bin, e.g. {'x': [[0], [0], [0]]}.
    binCountsDict = {
        key: [[0] for _ in ranges] for key, ranges in binRangesDict.items()
    }

    for varName in binRangesDict.keys():
        # Bin ranges that belong to this column.
        discreteRanges = binRangesDict.get(varName)
        lastBin = len(discreteRanges) - 1
        colPos = binnedDf.columns.get_loc(varName)
        for rowPos, value in enumerate(dataframe[varName]):
            for i, binRange in enumerate(discreteRanges):
                low, high = binRange[0], binRange[1]
                if i == 0:
                    # The first bin also includes its lower edge.
                    inside = low <= value <= high
                else:
                    inside = low < value <= high
                # Catch values outside the overall range.
                belowMin = i == 0 and low > value
                aboveMax = i == lastBin and high < value
                if inside or belowMin or aboveMax:
                    # Positional scalar write: chained ``iloc[row][col] = x``
                    # may modify a temporary copy and lose the value.
                    binnedDf.iat[rowPos, colPos] = i
                    binCountsDict[varName][i][0] += 1

    binnedData = binnedDf.to_dict(orient='records')  # a list of dictionaries
    return binnedData, binnedDf, binCountsDict
There is one
BNdata.discretize
method and one function discretize
defined in other_functions.py
, and yet a third defined in Helper_functions.py
. They output different results, but the only one that's referenced in the script is the one from the Helper_functions.py
script.
Suggested solution
Would it be possible to remove (or rename) the
BNdata.discretize
method so as to not confuse future users?
Code
(Code pasted here for easy access.)
BNdata.discretize
other_functions.discretize
Helper_functions.discretize