# Draw a candidate class index for the current partition, weighted by the
# class probabilities currently held for that partition.
curr_class = self._rng.choice(
list(range(self._num_unique_classes)), p=current_probabilities
)
# Redraw class label if there are no samples left to be allocated from
# that class
if class_sizes[curr_class] == 0:
# Class got exhausted, set its probability to 0 in every row of class_priors
class_priors[:, curr_class] = 0
# Renormalize such that the probabilities in each row sum to 1
# NOTE(review): if a row has no probability mass left after zeroing the
# column, its row sum is 0 and the division below yields NaN for that row;
# the issue text reports choice() then failing with
# "probabilities contain NaN". Confirm whether a guard is needed here.
row_sums = class_priors.sum(axis=1, keepdims=True)
class_priors = class_priors / row_sums
# Adjust the current_probabilities (it won't sum up to 1 otherwise)
current_probabilities = class_priors[current_partition_id]
# Retry the draw with the updated probabilities
continue
# The drawn class still has samples: consume one from its remaining count
class_sizes[curr_class] -= 1
# Store sample index at the empty array cell
index = partition_id_to_left_to_allocate[current_partition_id]
# The freshly decremented count is used as the position read from
# idx_list[curr_class]
client_indices[current_partition_id][index] = idx_list[curr_class][
class_sizes[curr_class]
]
break
Steps/Code to Reproduce
Load UNSW-NB15 dataset in a Pandas DataFrame, then
File ~/git/netanomaly-fl/.venv/lib/python3.10/site-packages/flwr_datasets/partitioner/inner_dirichlet_partitioner.py:234, in InnerDirichletPartitioner._determine_partition_id_to_indices_if_needed(self)
231 current_probabilities = class_priors[current_partition_id]
232 while True:
233 # curr_class = np.argmax(np.random.uniform() <= curr_prior)
--> 234 curr_class = self._rng.choice(
235 list(range(self._num_unique_classes)), p=current_probabilities
236 )
237 # Redraw class label if there are no samples left to be allocated from
238 # that class
239 if class_sizes[curr_class] == 0:
240 # Class got exhausted, set probabilities to 0
File numpy/random/_generator.pyx:824, in numpy.random._generator.Generator.choice()
Hi @admaio, thanks for opening this issue. Could it be that your dataset doesn't have enough instances of every class to meet the alpha=0.2 requirement? Does it work if you raise the alpha value?
Describe the bug
When the `row_sums` variable contains all zeros, the renormalized `class_priors` becomes an array of NaNs (0/0), and the subsequent `self._rng.choice(...)` call raises `ValueError: probabilities contain NaN`.
This is the likely problematic part of the InnerDirichletPartitioner file
while True:
curr_class = np.argmax(np.random.uniform() <= curr_prior)
Steps/Code to Reproduce
Load UNSW-NB15 dataset in a Pandas DataFrame, then
# Build the dataset object from the pandas DataFrame loaded beforehand.
dataset = Dataset.from_pandas(data)

# Request two partitions, each sized to half of the dataset.
half_size = int(len(dataset) / 2)
innerdir_partitioner = InnerDirichletPartitioner(
    partition_sizes=[half_size, half_size],
    partition_by="label",
    alpha=0.2,
    shuffle=True,
    seed=3,
)
innerdir_partitioner.dataset = dataset

# Loading the first partition is the call that fails.
partition = innerdir_partitioner.load_partition(partition_id=0)
Expected Results
A set of partitions.
Actual Results
It crashes with the following error:
File ~/git/netanomaly-fl/.venv/lib/python3.10/site-packages/flwr_datasets/partitioner/inner_dirichlet_partitioner.py:118, in InnerDirichletPartitioner.load_partition(self, partition_id)
    116 self._determine_num_unique_classes_if_needed()
    117 self._alpha = self._initialize_alpha_if_needed(self._initial_alpha)
--> 118 self._determine_partition_id_to_indices_if_needed()
    119 return self.dataset.select(self._partition_id_to_indices[partition_id])
File ~/git/netanomaly-fl/.venv/lib/python3.10/site-packages/flwr_datasets/partitioner/inner_dirichlet_partitioner.py:234, in InnerDirichletPartitioner._determine_partition_id_to_indices_if_needed(self)
    231 current_probabilities = class_priors[current_partition_id]
    232 while True:
    233 # curr_class = np.argmax(np.random.uniform() <= curr_prior)
--> 234 curr_class = self._rng.choice(
    235 list(range(self._num_unique_classes)), p=current_probabilities
    236 )
    237 # Redraw class label if there are no samples left to be allocated from
    238 # that class
    239 if class_sizes[curr_class] == 0:
    240 # Class got exhausted, set probabilities to 0
File numpy/random/_generator.pyx:824, in numpy.random._generator.Generator.choice()