zackxconti / bnmetamodel_gh

Repo for bnmetamodel lib version for Lab Mouse Grasshopper plug-in.
1 stars 2 forks source link

Sort out three `discretize` functions/methods #54

Open kallewesterling opened 1 year ago

kallewesterling commented 1 year ago

There are three `discretize` implementations: the `BNdata.discretize` method, a `discretize` function defined in other_functions.py, and a third `discretize` function defined in Helper_functions.py. They output different results, but the only one that's referenced in the script is the one from Helper_functions.py.

Suggested solution

Would it be possible to remove (or rename) the BNdata.discretize method so as to not confuse future users?

Code

(Code pasted here for easy access.)

BNdata.discretize

def discretize(self, binRangesDict, plot=False):
    """Bin every column of ``self.data`` into the supplied bin ranges.

    Args:
        self: object whose ``data`` attribute is indexed by column name and
            passed to ``reindex_like`` (presumably a pandas DataFrame --
            confirm against the class definition, which is not shown here).
        binRangesDict: maps a column name to a list of ``[low, high]`` bin
            ranges. It is not modified.
        plot: unused; kept so existing callers keep working.

    Returns:
        A list with one dict per row mapping each column name to the index
        of the bin its value fell into. The same list is stored on
        ``self.binnedData``.

    Binning rules:
        * a value inside ``[low, high]`` (both ends inclusive) belongs to
          that bin; when several bins match (e.g. a value on an edge shared
          by two bins) the last matching bin wins;
        * a value below the first bin's ``low`` goes to the first bin;
        * a value above the last bin's ``high`` goes to the last bin;
        * a value matching no rule (e.g. NaN, or a gap between bins) stays
          NaN.

    A column that has rows but no entry in ``binRangesDict`` raises
    ``TypeError`` because its ranges are looked up with ``dict.get``.
    """
    binnedDf = pd.DataFrame().reindex_like(self.data)

    # Same nested layout as binRangesDict, with every bin replaced by a
    # one-element [count] list.
    binCountsDict = copy.deepcopy(binRangesDict)
    for key in binCountsDict:
        for counter in binCountsDict[key]:
            del counter[:]
            counter.append(0)

    for varName in list(self.data):
        # load discretized ranges belonging to varName in order to bin in
        discreteRanges = binRangesDict.get(varName)
        column = binnedDf.columns.get_loc(varName)

        for index, item1 in enumerate(self.data[varName]):
            lastBin = len(discreteRanges) - 1
            chosen = None

            for i, binRange in enumerate(discreteRanges):
                inside = binRange[0] <= item1 <= binRange[1]
                below = i == 0 and binRange[0] > item1
                above = i == lastBin and binRange[1] < item1
                if inside or below or above:
                    # keep scanning: the last matching bin is the one stored
                    chosen = i

            if chosen is not None:
                # Write with a single positional setter. The chained form
                # ``binnedDf.iloc[index][varName] = i`` may assign to a
                # temporary copy and leave binnedDf untouched.
                binnedDf.iat[index, column] = chosen
                # Count each value once, in the bin it was actually stored
                # in, so the counts add up to the number of binned rows.
                binCountsDict[varName][chosen][0] += 1

    binnedData = binnedDf.to_dict(orient='records')  # a list of dictionaries
    self.binnedData = binnedData

    # Single-argument print calls produce the same text on Python 2 and 3.
    print('train binCountdict  %s' % (binCountsDict,))
    print('binned_trainingData  %s' % (binnedData,))
    return binnedData

other_functions.discretize

def discretize(data, vars_to_discretize, n_bins):
    '''
    Accepts data, a dictionary containing dicretization type for selected variables, and
    a dictionary containing the number of bins for selected variables.
    Returns data after selected variables have been discretized,
    together with binning definition for each variable.
    '''

    data_subset = pd.DataFrame(data).copy()
    bins = {}
    for i in vars_to_discretize:
        out = None
        binning = None

        # discretize by splitting into equal intervals
        if vars_to_discretize[i] == 'Equal':
            out, binning = pd.cut(data_subset.ix[:, i], bins=n_bins[i], labels=False, retbins=True)

        # discretize by frequency
        elif vars_to_discretize[i] == 'Freq':
            nb = n_bins[i]
            while True:
                try:
                    out, binning = pd.qcut(data_subset.ix[:, i], q=nb, labels=False, retbins=True)
                    break
                except:
                    nb -= 1

        # discretize based on provided bin margins
        elif vars_to_discretize[i] == 'Bins':
            out = np.digitize(data_subset.ix[:, i], n_bins[i], right=True) - 1
            binning = n_bins[i]

        data_subset.ix[:, i] = out

        # replace NA variables with and special index (1+max) -
        # if it has not been done so automatically an in np.digitize
        data_subset.ix[:, i][data_subset.ix[:, i].isnull()] = data_subset.ix[:, i].max() + 1
        bins[i] = binning

    return data_subset, bins

Helper_functions.discretize

def discretize(dataframe, binRangesDict, plot=False):
    """Bin the columns of ``dataframe`` named in ``binRangesDict``.

    Args:
        dataframe: pandas DataFrame holding the values to bin.
        binRangesDict: maps a column name to a list of ``[low, high]`` bin
            ranges. It is not modified.
        plot: unused; kept so existing callers keep working.

    Returns:
        A tuple ``(binnedData, binnedDf, binCountsDict)``:

        * ``binnedDf`` -- frame shaped like ``dataframe`` holding the bin
          index of every binned value. Columns missing from
          ``binRangesDict`` and values matching no rule stay NaN.
        * ``binnedData`` -- ``binnedDf`` as a list of per-row dicts.
        * ``binCountsDict`` -- same layout as ``binRangesDict`` with each
          bin replaced by a one-element ``[count]`` list.

    Binning rules:
        * the first bin takes values in ``[low, high]`` (both ends
          inclusive);
        * every later bin takes values in ``(low, high]``, so a value on a
          shared edge belongs to the lower bin;
        * a value below the first bin's ``low`` goes to the first bin;
        * a value above the last bin's ``high`` goes to the last bin.
    """
    binnedDf = pd.DataFrame().reindex_like(dataframe)

    # Same nested layout as binRangesDict, with every bin replaced by a
    # one-element [count] list.
    binCountsDict = copy.deepcopy(binRangesDict)
    for key in binCountsDict:
        for counter in binCountsDict[key]:
            del counter[:]
            counter.append(0)

    for varName in binRangesDict.keys():
        # load discretized ranges belonging to varName in order to bin in
        discreteRanges = binRangesDict.get(varName)
        column = binnedDf.columns.get_loc(varName)
        lastBin = len(discreteRanges) - 1

        for index, item1 in enumerate(dataframe[varName]):
            for i, binRange in enumerate(discreteRanges):
                low, high = binRange[0], binRange[1]

                if i == 0:
                    # the first bin also includes its lower edge
                    inside = low <= item1 <= high
                else:
                    inside = low < item1 <= high

                # catch values outside of range (smaller than min)
                below = i == 0 and low > item1
                # catch values outside of range (larger than max)
                above = i == lastBin and high < item1

                for matched in (inside, below, above):
                    if matched:
                        # Write with a single positional setter. The chained
                        # form ``binnedDf.iloc[index][varName] = i`` may
                        # assign to a temporary copy and leave binnedDf
                        # untouched.
                        binnedDf.iat[index, column] = i
                        binCountsDict[varName][i][0] += 1

    binnedData = binnedDf.to_dict(orient='records')  # a list of dictionaries

    return binnedData, binnedDf, binCountsDict
kallewesterling commented 1 year ago

@zackxconti I think I'd like your input on this quandary when you have a chance.