Sort out three `discretize` functions/methods

There are three `discretize` implementations: the `BNdata.discretize` method, a `discretize` function defined in other_functions.py, and a third `discretize` function defined in Helper_functions.py. They produce different results, but the only one that is actually referenced in the script is the one from Helper_functions.py.

Code

(Code pasted here for easy access.)

`BNdata.discretize`

def discretize(self, binRangesDict, plot=False):
    """Bin every column of ``self.data`` into integer bin indices.

    Binning rules (the same rules as ``Helper_functions.discretize``):

    * the first range of a variable is closed on both sides,
      ``lower <= value <= upper``;
    * every later range excludes its lower edge, ``lower < value <= upper``,
      so a value lying on an edge shared by two adjacent ranges is assigned
      to the lower bin and counted exactly once;
    * values below the first range's lower bound go into the first bin and
      values above the last range's upper bound go into the last bin;
    * values matching no range (NaN, or a gap between ranges) stay NaN.

    Columns of ``self.data`` that have no entry in ``binRangesDict`` are
    left unbinned (all NaN).

    Parameters:
        binRangesDict: dict mapping a column name to a list of
            ``[lower, upper]`` ranges. It is not modified.
        plot: unused; kept so existing callers keep working.

    Returns:
        A list with one dict per row of ``self.data`` (column name ->
        bin index), as produced by ``DataFrame.to_dict(orient='records')``.
        The same list is stored on ``self.binnedData``.
    """
    # NaN-filled frame with the same index/columns as the input data.
    binnedDf = pd.DataFrame().reindex_like(self.data)

    # One single-element counter list per bin: {var: [[0], [0], ...]}.
    binCountsDict = {
        name: [[0] for _ in ranges] for name, ranges in binRangesDict.items()
    }

    for varName in list(self.data):
        discreteRanges = binRangesDict.get(varName)
        if discreteRanges is None:
            # No ranges supplied for this column: nothing to bin.
            continue

        columnPos = binnedDf.columns.get_loc(varName)
        lastBin = len(discreteRanges) - 1
        counts = binCountsDict[varName]

        for rowPos, value in enumerate(self.data[varName]):
            for i, binRange in enumerate(discreteRanges):
                lower, upper = binRange[0], binRange[1]

                if i == 0:
                    inside = lower <= value <= upper
                else:
                    # Exclude the lower edge so shared edges are not
                    # matched (and counted) by two neighbouring bins.
                    inside = lower < value <= upper

                belowMin = i == 0 and value < lower
                aboveMax = i == lastBin and value > upper

                if inside or belowMin or aboveMax:
                    # Single positional write; chained
                    # ``iloc[row][col] = ...`` may hit a temporary copy.
                    binnedDf.iloc[rowPos, columnPos] = i
                    counts[i][0] += 1

    binnedData = binnedDf.to_dict(orient='records')  # a list of dictionaries
    self.binnedData = binnedData

    # Single pre-formatted argument: identical output under the Python 2
    # print statement and the Python 3 print function.
    print('train binCountdict  %s' % (binCountsDict,))
    print('binned_trainingData  %s' % (binnedData,))
    return binnedData

`other_functions.discretize`

def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary containing dicretization type for selected variables, and
    a dictionary containing the number of bins for selected variables.
    Returns data after selected variables have been discretized,
    together with binning definition for each variable.
    '''

    data_subset = pd.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = pd.cut(data_subset.ix[:, i], bins=n_bins[i], labels=False, retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = pd.qcut(data_subset.ix[:, i], q=nb, labels=False, retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.ix[:, i] = out

        # replace NA variables with and special index (1+max) -
        # if it has not been done so automatically an in np.digitize
        data_subset.ix[:, i][data_subset.ix[:, i].isnull()] = data_subset.ix[:, i].max() + 1
        bins[i] = binning

    return data_subset, bins

`Helper_functions.discretize`

def discretize(dataframe, binRangesDict, plot=False):
    """Bin the columns of ``dataframe`` listed in ``binRangesDict``.

    Binning rules, applied per value against every range of its variable:

    * the first range is closed on both sides, ``lower <= value <= upper``;
    * every later range excludes its lower edge, ``lower < value <= upper``;
    * values below the first range's lower bound go into the first bin and
      values above the last range's upper bound go into the last bin;
    * values matching no range (NaN, or a gap between ranges) stay NaN.

    Columns of ``dataframe`` without an entry in ``binRangesDict`` are left
    as NaN in the result.

    Parameters:
        dataframe: pandas DataFrame holding the values to bin.
        binRangesDict: dict mapping a column name to a list of
            ``[lower, upper]`` ranges. It is not modified.
        plot: unused; kept so existing callers keep working.

    Returns:
        ``(binnedData, binnedDf, binCountsDict)``: the binned rows as a list
        of dicts (``DataFrame.to_dict(orient='records')``), the binned
        DataFrame, and a dict mapping each variable to one single-element
        list per bin holding the number of values placed in that bin.

    Raises:
        KeyError: if a variable in ``binRangesDict`` is not a column of
            ``dataframe``.
    """
    # NaN-filled frame with the same index/columns as the input.
    binnedDf = pd.DataFrame().reindex_like(dataframe)

    # One fresh single-element counter list per bin: {var: [[0], [0], ...]}.
    binCountsDict = {
        name: [[0] for _ in ranges] for name, ranges in binRangesDict.items()
    }

    for varName, discreteRanges in binRangesDict.items():
        columnPos = binnedDf.columns.get_loc(varName)
        lastBin = len(discreteRanges) - 1
        counts = binCountsDict[varName]

        for rowPos, value in enumerate(dataframe[varName]):
            for i, binRange in enumerate(discreteRanges):
                lower, upper = binRange[0], binRange[1]

                if i == 0:
                    # first bin includes its lower edge
                    inside = lower <= value <= upper
                else:
                    inside = lower < value <= upper

                # catch values outside of the overall range
                belowMin = i == 0 and value < lower
                aboveMax = i == lastBin and value > upper

                if inside or belowMin or aboveMax:
                    # Single positional write; chained
                    # ``iloc[row][col] = ...`` may hit a temporary copy.
                    binnedDf.iloc[rowPos, columnPos] = i
                    counts[i][0] += 1

    binnedData = binnedDf.to_dict(orient='records')  # a list of dictionaries

    return binnedData, binnedDf, binCountsDict

zackxconti / bnmetamodel_gh

Sort out three `discretize` functions/methods #54

Suggested solution

Code

`BNdata.discretize`

`other_functions.discretize`

`Helper_functions.discretize`