ypotdevin / dpgbdt


Feature: Rearrange gradient-based data filtering and data-to-tree allocation #9

Closed ypotdevin closed 3 years ago

ypotdevin commented 3 years ago

Filter -> Allocate -> Put the filtered data back into the pool -> repeat. But keep in mind that clipping might still be required for the last couple of trees, because the data pool might not be large enough to provide sufficient data points for each tree. See the sketch below.
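A rough sketch of one possible reading of that loop (interpreting »put filtered data back into pool« as keeping the rejected samples available for later trees). All names here (Sample, rows_per_tree, the toy gradients) are illustrative and not taken from dpgbdt; the actual tree training is omitted:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <utility>
#include <vector>

struct Sample {
    int index;
    double gradient;
};

int main() {
    const double l2_threshold = 1.0;      // hypothetical clipping bound
    const std::size_t rows_per_tree = 4;  // hypothetical per-tree allocation
    const int num_trees = 3;

    // Hypothetical data pool of (index, gradient) pairs.
    std::vector<Sample> pool;
    for (int i = 0; i < 10; ++i) {
        pool.push_back({i, 2.0 * std::sin(i)});
    }

    std::mt19937 rng(42);

    for (int t = 0; t < num_trees; ++t) {
        // 1. Filter: split the pool by the gradient criterion.
        std::vector<Sample> passing, rejected;
        for (const auto &s : pool) {
            if (s.gradient >= -l2_threshold && s.gradient <= l2_threshold) {
                passing.push_back(s);
            } else {
                rejected.push_back(s);
            }
        }
        std::shuffle(passing.begin(), passing.end(), rng);

        // 2. Allocate: give rows_per_tree of the passing samples to this tree.
        std::vector<Sample> tree_data;
        for (std::size_t i = 0; i < passing.size() && tree_data.size() < rows_per_tree; ++i) {
            tree_data.push_back(passing[i]);
        }

        // Fallback for the last couple of trees: if the pool ran short, fill
        // up with rejected samples whose gradients are clipped to the bound.
        std::shuffle(rejected.begin(), rejected.end(), rng);
        for (std::size_t i = 0; i < rejected.size() && tree_data.size() < rows_per_tree; ++i) {
            Sample s = rejected[i];
            s.gradient = std::clamp(s.gradient, -l2_threshold, l2_threshold);
            tree_data.push_back(s);
        }

        // (Train the tree on tree_data here.)

        // 3. Put the filtered(-out) data back into the pool: only the samples
        //    actually consumed by this tree are removed, everything else stays.
        std::vector<Sample> next_pool;
        for (const auto &s : pool) {
            bool used = std::any_of(tree_data.begin(), tree_data.end(),
                                    [&](const Sample &u) { return u.index == s.index; });
            if (!used) {
                next_pool.push_back(s);
            }
        }
        pool = std::move(next_pool);
        // 4. Repeat for the next tree.
    }
    return 0;
}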

ypotdevin commented 3 years ago

This C++ code section might be relevant:

dp_ensemble.cpp

void DPEnsemble::train(DataSet *dataset)
{
    …
    // gradient-based data filtering
    if(params->gradient_filtering) {
        std::vector<int> reject_indices, remaining_indices;
        for (int i=0; i<dataset->length; i++) {
            double curr_grad = dataset->gradients[i];
            if (curr_grad < -params->l2_threshold or curr_grad > params->l2_threshold) {
                reject_indices.push_back(i);
            } else {
                remaining_indices.push_back(i);
            }
        }
        LOG_INFO("GDF: {1} of {2} rows fulfill gradient criterion",
            remaining_indices.size(), dataset->length);

        if ((size_t) number_of_rows <= remaining_indices.size()) {
            // we have enough samples that were not filtered out
            if (!VERIFICATION_MODE) {
                std::random_shuffle(remaining_indices.begin(), remaining_indices.end());
            }
            for(int i=0; i<number_of_rows; i++){
                tree_indices.push_back(remaining_indices[i]);
            }
        } else {
            // we don't have enough -> take all samples that were not filtered out
            // and fill up with randomly chosen and clipped filtered ones
            for(auto filtered : remaining_indices){
                tree_indices.push_back(filtered);
            }
            LOG_INFO("GDF: filling up with {1} rows (clipping those gradients)",
                number_of_rows - tree_indices.size());
            if (!VERIFICATION_MODE) {
                std::random_shuffle(reject_indices.begin(), reject_indices.end());
            }
            int reject_index = 0;
            for(int i=tree_indices.size(); i<number_of_rows; i++){
                int curr_index = reject_indices[reject_index++];
                dataset->gradients[curr_index] = clamp(dataset->gradients[curr_index],
                    -params->l2_threshold, params->l2_threshold);
                tree_indices.push_back(curr_index);
            }
        }
    } else {
        // no GDF, just randomly select <number_of_rows> rows.
        // Note, this causes the leaves to be clipped after building the tree.
        tree_indices = vector<int>(dataset->length);
        std::iota(std::begin(tree_indices), std::end(tree_indices), 0);
        if (!VERIFICATION_MODE) {
            std::random_shuffle(tree_indices.begin(), tree_indices.end());
        }
        tree_indices = std::vector<int>(tree_indices.begin(), tree_indices.begin() + number_of_rows);
    }
    …

QUESTION: What does »// Note, this causes the leaves to be clipped after building the tree.« mean? When/where does clipping take effect?
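For orientation, a minimal sketch of what post-hoc leaf clipping could look like, assuming the leaves are clamped to ±l2_threshold (the same bound the GDF branch applies to gradients); whether and where dpgbdt actually does this, presumably somewhere in the tree-building code, is exactly what this question asks:

#include <algorithm>
#include <iostream>
#include <vector>

struct Leaf {
    double value;  // prediction stored in the leaf
};

// Clamp every leaf prediction after the tree structure is fixed,
// instead of clamping per-sample gradients before training as GDF does.
void clip_leaves(std::vector<Leaf> &leaves, double l2_threshold) {
    for (auto &leaf : leaves) {
        leaf.value = std::clamp(leaf.value, -l2_threshold, l2_threshold);
    }
}

int main() {
    std::vector<Leaf> leaves{{-2.5}, {0.3}, {1.7}};
    clip_leaves(leaves, 1.0);
    for (const auto &leaf : leaves) {
        std::cout << leaf.value << '\n';  // prints -1, 0.3, 1
    }
    return 0;
}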