Closed ypotdevin closed 3 years ago
This C++ code section might be relevant:
void DPEnsemble::train(DataSet *dataset)
{
…
// Select the row indices (tree_indices) that the next tree is built from.
//
// With gradient-based data filtering (GDF) enabled, rows whose gradient lies
// inside [-l2_threshold, +l2_threshold] are preferred; rows outside that range
// are only used to fill up a shortfall, and their gradients are then clamped
// in place. With GDF disabled, rows are picked without looking at gradients.
//
// NOTE(review): tree_indices and number_of_rows are declared outside this
// excerpt. The GDF branch only appends to tree_indices while the non-GDF
// branch reassigns it, so the GDF branch presumably relies on tree_indices
// being empty on entry -- confirm against the elided code.
// NOTE(review): std::random_shuffle was deprecated in C++14 and removed in
// C++17; std::shuffle with an explicit random engine is the replacement.
if(params->gradient_filtering) {
// Partition all row indices by the gradient criterion.
std::vector<int> reject_indices, remaining_indices;
for (int i=0; i<dataset->length; i++) {
double curr_grad = dataset->gradients[i];
// A row is rejected when its gradient lies strictly outside
// [-l2_threshold, +l2_threshold]; values on the boundary are kept.
if (curr_grad < -params->l2_threshold or curr_grad > params->l2_threshold) {
reject_indices.push_back(i);
} else {
remaining_indices.push_back(i);
}
}
// NOTE(review): the placeholders are written {1}/{2}. If LOG_INFO forwards
// to an fmt-style formatter, positional indices are 0-based -- confirm that
// both arguments are actually printed.
LOG_INFO("GDF: {1} of {2} rows fulfill gradient criterion",
remaining_indices.size(), dataset->length);
if ((size_t) number_of_rows <= remaining_indices.size()) {
// we have enough samples that were not filtered out:
// take number_of_rows of them. Outside VERIFICATION_MODE the choice is
// random; in VERIFICATION_MODE it is the lowest-index rows, since the
// shuffle is skipped.
if (!VERIFICATION_MODE) {
std::random_shuffle(remaining_indices.begin(), remaining_indices.end());
}
for(int i=0; i<number_of_rows; i++){
tree_indices.push_back(remaining_indices[i]);
}
} else {
// we don't have enough -> take all samples that were not filtered out
// and fill up with randomly chosen and clipped filtered ones
// (despite its name, `filtered` iterates over the rows that PASSED the
// gradient criterion, i.e. remaining_indices).
for(auto filtered : remaining_indices){
tree_indices.push_back(filtered);
}
// The subtraction mixes number_of_rows with an unsigned size(), so the
// logged value is unsigned.
LOG_INFO("GDF: filling up with {1} rows (clipping those gradients)",
number_of_rows - tree_indices.size());
if (!VERIFICATION_MODE) {
std::random_shuffle(reject_indices.begin(), reject_indices.end());
}
// Consume rejected rows in (shuffled) order until tree_indices holds
// number_of_rows entries.
// NOTE(review): reject_indices is read without a bounds check. If more
// rows are requested than rejected rows are available (e.g.
// number_of_rows > dataset->length), this indexes past the end --
// confirm that the caller guarantees number_of_rows <= dataset->length.
int reject_index = 0;
for(int i=tree_indices.size(); i<number_of_rows; i++){
int curr_index = reject_indices[reject_index++];
// This is where clipping happens on the GDF path: the gradient is
// limited to [-l2_threshold, +l2_threshold] and written back into the
// dataset, so the change is visible to later users of
// dataset->gradients. (clamp is presumably std::clamp or an equivalent
// project helper -- not shown here.)
dataset->gradients[curr_index] = clamp(dataset->gradients[curr_index],
-params->l2_threshold, params->l2_threshold);
tree_indices.push_back(curr_index);
}
}
} else {
// no GDF, just randomly select <number_of_rows> rows.
// Note, this causes the leaves to be clipped after building the tree.
// NOTE(review): no gradient is clipped in this branch; the clipping the
// note above refers to is not part of this excerpt and presumably happens
// on the leaf values after tree construction -- confirm where.
// Build the index list 0 .. dataset->length-1, shuffle it (skipped in
// VERIFICATION_MODE), and keep the first number_of_rows entries.
tree_indices = vector<int>(dataset->length);
std::iota(std::begin(tree_indices), std::end(tree_indices), 0);
if (!VERIFICATION_MODE) {
std::random_shuffle(tree_indices.begin(), tree_indices.end());
}
// NOTE(review): begin() + number_of_rows is not range-checked; it is only
// valid while number_of_rows <= dataset->length.
tree_indices = std::vector<int>(tree_indices.begin(), tree_indices.begin() + number_of_rows);
}
QUESTION: What does »// Note, this causes the leaves to be clipped after building the tree.« mean? When/where does clipping take effect?
Filter -> Allocate -> Put filtered data back into pool -> repeat. But keep in mind that clipping might be required for the last couple of trees, because the data pool might not be large enough to provide sufficient data points for each tree.