Closed vitalwarley closed 2 years ago
In configs/ms1mv3_r100.py we have
config.val_targets = ['lfw', 'cfp_fp', "agedb_30"]
This is the flow
class CallBackVerification(object):
    """Callback that evaluates a backbone on face-verification sets."""

    def __init__(self, val_targets, rec_prefix, summary_writer=None, image_size=(112, 112)):
        """Set up the per-target bookkeeping and load the validation sets.

        Args:
            val_targets: Names of the validation sets (file stems of the
                ``.bin`` files).
            rec_prefix: Directory passed to ``init_dataset`` as ``data_dir``.
            summary_writer: Optional writer stored on the instance for
                logging scalars.
            image_size: Two-element size forwarded to ``init_dataset``.
        """
        self.rank: int = distributed.get_rank()
        self.highest_acc: float = 0.0
        # One best-accuracy slot per requested validation target.
        self.highest_acc_list: List[float] = [0.0] * len(val_targets)
        self.ver_list: List[object] = []
        self.ver_name_list: List[str] = []
        # Only rank 0 loads the validation sets. Compare by value: ``is 0``
        # tests object identity and raises a SyntaxWarning on Python >= 3.8.
        if self.rank == 0:
            self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size)  # <-- here
        self.summary_writer = summary_writer
where `val_targets` is `config.val_targets`.
`init_dataset` calls `load_bin`:
def init_dataset(self, val_targets, data_dir, image_size):
    """Load every validation ``.bin`` file that exists on disk.

    For each name in ``val_targets`` the file ``<data_dir>/<name>.bin`` is
    looked up. Targets whose file does not exist are skipped silently.
    Each loaded set is appended to ``self.ver_list`` and its name to
    ``self.ver_name_list``, so both lists share the same indices.

    Args:
        val_targets: Iterable of validation-set names.
        data_dir: Directory containing the ``.bin`` files.
        image_size: Size forwarded unchanged to ``verification.load_bin``.
    """
    for name in val_targets:
        path = os.path.join(data_dir, name + ".bin")
        if not os.path.exists(path):
            continue
        data_set = verification.load_bin(path, image_size)
        self.ver_list.append(data_set)
        self.ver_name_list.append(name)
where `load_bin` basically loads the `.bin` files into tensors:
@torch.no_grad()
def load_bin(path, image_size):
    """Load a pickled verification ``.bin`` file into image tensors.

    Args:
        path: Path to a pickle file holding the pair ``(bins, issame_list)``.
        image_size: Two-element sequence used as the last two dimensions of
            the output tensors.

    Returns:
        ``(data_list, issame_list)``. ``data_list`` holds two tensors of
        shape ``(2 * len(issame_list), 3, image_size[0], image_size[1])``:
        index 0 is filled with the decoded images, index 1 with the same
        images flipped along the last axis.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load .bin files that come from a trusted source.
    try:
        with open(path, 'rb') as f:
            bins, issame_list = pickle.load(f)  # default decoding
    except UnicodeDecodeError:
        # Retry reading 8-bit string instances as bytes objects.
        with open(path, 'rb') as f:
            bins, issame_list = pickle.load(f, encoding='bytes')
    # Two images per entry of issame_list.
    num_images = len(issame_list) * 2
    data_list = [
        torch.empty((num_images, 3, image_size[0], image_size[1]))
        for _ in range(2)
    ]
    for idx in range(num_images):
        img = mx.image.imdecode(bins[idx])
        if img.shape[1] != image_size[0]:
            img = mx.image.resize_short(img, image_size[0])
        # Move the last axis first (presumably HWC -> CHW; confirm against
        # the mxnet imdecode output layout).
        img = nd.transpose(img, axes=(2, 0, 1))
        for flip in [0, 1]:
            if flip == 1:
                # img is reassigned, so slot 1 gets the flipped version of
                # what was just stored in slot 0.
                img = mx.ndarray.flip(data=img, axis=2)
            data_list[flip][idx][:] = torch.from_numpy(img.asnumpy())
        if idx % 1000 == 0:
            print('loading bin', idx)
    print(data_list[0].shape)
    return data_list, issame_list
def __call__(self, num_update, backbone: torch.nn.Module):
    """Run the verification tests on rank 0 after the first update.

    The backbone is switched to eval mode for the duration of
    ``self.ver_test`` and put back into train mode afterwards. Nothing
    happens on other ranks or while ``num_update`` is not positive.

    Args:
        num_update: Current update counter, forwarded to ``ver_test`` as
            the global step.
        backbone: Model to evaluate.
    """
    # Compare by value: ``is 0`` tests object identity and raises a
    # SyntaxWarning on Python >= 3.8.
    if self.rank == 0 and num_update > 0:
        backbone.eval()
        self.ver_test(backbone, num_update)
        backbone.train()
where `ver_test` is:
def ver_test(self, backbone: torch.nn.Module, global_step: int):
    """Evaluate ``backbone`` on every loaded validation set and log it.

    For each set in ``self.ver_list`` this calls ``verification.test``,
    logs the embedding norm and the flip accuracy, writes the accuracy to
    ``self.summary_writer`` and updates the per-set best accuracy in
    ``self.highest_acc_list``.

    Args:
        backbone: Model passed to ``verification.test``.
        global_step: Step number used in log messages and scalar summaries.
    """
    # Collected per-set accuracies; not returned by this method.
    results = []
    for i, data_set in enumerate(self.ver_list):
        name = self.ver_name_list[i]
        # Only acc2/std2 (flip accuracy) and xnorm are used below.
        acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(
            data_set, backbone, 10, 10)
        logging.info('[%s][%d]XNorm: %f' % (name, global_step, xnorm))
        logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (name, global_step, acc2, std2))

        # Bare annotation: a type hint for tooling, no runtime effect.
        self.summary_writer: SummaryWriter
        self.summary_writer.add_scalar(tag=name, scalar_value=acc2, global_step=global_step, )

        if acc2 > self.highest_acc_list[i]:
            self.highest_acc_list[i] = acc2
        logging.info(
            '[%s][%d]Accuracy-Highest: %1.5f' % (name, global_step, self.highest_acc_list[i]))
        results.append(acc2)
This is what we see in the training logs from insightface. `verification.test` is called for each dataset.
In `verification.test`, we don't have a classification task, but instead face verification:
for i in range(len(data_list)):
data = data_list[i]
embeddings = None
ba = 0
while ba < data.shape[0]:
bb = min(ba + batch_size, data.shape[0])
count = bb - ba
_data = data[bb - batch_size: bb]
time0 = datetime.datetime.now()
img = ((_data / 255) - 0.5) / 0.5
net_out: torch.Tensor = backbone(img)
_embeddings = net_out.detach().cpu().numpy()
time_now = datetime.datetime.now()
diff = time_now - time0
time_consumed += diff.total_seconds()
if embeddings is None:
embeddings = np.zeros((data.shape[0], _embeddings.shape[1]))
embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :]
ba = bb
embeddings_list.append(embeddings)
For each dataset (`self.ver_name_list[i]`) and for each of the [no_flip, flip] image sets: normalize the input (which can be batched; they used bs=10) and pass it into the backbone. In the end, we have an embeddings list composed of the array of no_flip image embeddings and the array of flip image embeddings.
Then the norm of each embedding is computed (`np.linalg.norm(_em)`, where `_em` seems to be a 1x512 vector). This norm is aggregated and averaged outside the loop (`_xnorm /= _xnorm_count`). That is, `_xnorm` is the sum of the norms of all embeddings for the given dataset, and `_xnorm_count` is the total number of norms computed.
Finally, the no_flip and flip embeddings are summed and normalized (`sklearn.preprocessing.normalize`):
embeddings = embeddings_list[0] + embeddings_list[1]
embeddings = sklearn.preprocessing.normalize(embeddings)
At last, this happens
_, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds)
acc2, std2 = np.mean(accuracy), np.std(accuracy)
return acc1, std1, acc2, std2, _xnorm, embeddings_list # acc1 = std1 = 0.0, but aren't used...
In `evaluate`, they compute the Accuracy, FAR (False Acceptance Rate), and VAL (?, I don't know what it is), but only Accuracy is used (see the previous comment, step 5.4).
6.1. Accuracy is computed as follows:
thresholds = np.arange(0, 4, 0.01)
embeddings1 = embeddings[0::2]
embeddings2 = embeddings[1::2]
tpr, fpr, accuracy = calculate_roc(thresholds,
embeddings1,
embeddings2,
np.asarray(actual_issame),
nrof_folds=nrof_folds,
pca=pca)
wherein, inside `calculate_roc`, an LFold evaluation takes place:
nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
nrof_thresholds = len(thresholds)
k_fold = LFold(n_splits=nrof_folds, shuffle=False)
tprs = np.zeros((nrof_folds, nrof_thresholds))
fprs = np.zeros((nrof_folds, nrof_thresholds))
accuracy = np.zeros((nrof_folds))
indices = np.arange(nrof_pairs)
if pca == 0: # pca == 0, then the code below is executed
diff = np.subtract(embeddings1, embeddings2)
dist = np.sum(np.square(diff), 1)
and
for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
...
# Find the best threshold for the fold
acc_train = np.zeros((nrof_thresholds))
for threshold_idx, threshold in enumerate(thresholds):
_, _, acc_train[threshold_idx] = calculate_accuracy(
threshold, dist[train_set], actual_issame[train_set])
best_threshold_index = np.argmax(acc_train)
for threshold_idx, threshold in enumerate(thresholds):
tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy(
threshold, dist[test_set],
actual_issame[test_set])
_, _, accuracy[fold_idx] = calculate_accuracy(
thresholds[best_threshold_index], dist[test_set],
actual_issame[test_set])
tpr = np.mean(tprs, 0)
fpr = np.mean(fprs, 0)
return tpr, fpr, accuracy
`thresholds` range from 0 to 4 because of gradient clipping (#10). `best_threshold_index` is the index of the highest accuracy between folds. At last, remember that we have at the end of `test`:
embeddings = embeddings_list[0].copy()
embeddings = sklearn.preprocessing.normalize(embeddings)
acc1 = 0.0
std1 = 0.0
embeddings = embeddings_list[0] + embeddings_list[1]
embeddings = sklearn.preprocessing.normalize(embeddings)
print(embeddings.shape)
print('infer time', time_consumed)
_, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds)
acc2, std2 = np.mean(accuracy), np.std(accuracy)
return acc1, std1, acc2, std2, _xnorm, embeddings_list
This `accuracy` is of shape `(nfolds,)`.
Since these validation sets are for face verification, each sample must be a pair of images. Below is how they did
def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0):
# Calculate evaluation metrics
thresholds = np.arange(0, 4, 0.01)
embeddings1 = embeddings[0::2]
embeddings2 = embeddings[1::2]
...
where `embeddings1` and `embeddings2` are the embeddings for each pair. I think this can be improved by loading the data differently from how they did. That is, instead of populating one array with all images, I will populate two arrays, one for each image of the pair.
In DDP all external validation datasets are copied to each process, so the memory goes brrr and the training becomes too slow. To solve it, I moved cfp_fp and agedb_30 from validation to test.
The `.bin` files are in MS1M_v3. How they are read and used is here and here.