Closed cdliang11 closed 4 months ago
Good job! BTW, have you ever compared the embeddings extracted from the same audio by the wespeaker CLI and by the 3D-Speaker inference code?
> Good job! BTW, have you ever compared the embeddings extracted from the same audio by the wespeaker CLI and by the 3D-Speaker inference code?
The outputs of both are the same:
wespeaker-cli-eres2net:
wespeaker --task embedding --audio_file ../1000003_0f90da0d.wav --output_file ab.txt --eres2net
tensor([ 1.0719e+02, -1.0707e+02, -9.8385e+01, 1.5679e+02, 5.4318e+01,
-1.1169e+02, -1.2111e+02, 3.1424e-01, 7.0458e+01, 8.3888e+01,
5.1099e+00, 3.4183e+01, 1.1260e+02, 6.9974e+01, 6.1991e+00,
-3.4353e+01, 5.5698e+01, -6.6445e+01, -8.8395e+01, 6.2623e+01,
-6.8864e+01, -1.2985e+00, 5.3567e+01, 2.5563e+01, -2.0699e+02,
9.1679e+01, -6.3016e-01, 2.4147e+02, 4.2233e+01, -1.7257e+01,
1.6558e+02, 1.0798e+02, 1.5756e+02, -8.8241e+00, -5.4778e+01,
2.0347e+02, 1.4896e+02, 4.6032e+01, -6.0131e+01, 9.9032e+01,
7.3979e+01, -7.9786e+00, 2.3977e+02, 4.6933e+01, -3.5754e+01,
1.1774e+02, 2.9605e+01, -7.2195e+01, 2.2944e+01, -1.0759e+02,
-1.8584e-01, 8.5124e+00, -1.0233e+02, -2.0494e+02, -5.4285e+01,
-2.4525e+01, -9.2597e-01, -5.0375e+01, 1.8645e+02, 4.8976e+01,
2.4745e+01, -6.8617e+01, -1.1854e+02, -1.5248e+02, -1.2271e+02,
6.0507e+00, -3.9714e+01, -4.4913e+01, 9.8285e+01, -1.8834e+02,
1.3510e+02, 2.0361e+02, 3.5103e+02, 1.1984e+02, -9.6498e+01,
1.7820e+02, 2.3259e+02, 5.8027e+01, 2.2121e+02, -2.7586e+01,
-3.5794e+01, -1.8638e+02, 1.2937e+02, 1.8957e+02, 5.7769e+01,
-1.1921e+02, -1.1996e+02, -6.6017e+01, -9.6590e+01, 9.4987e+01,
1.0255e+01, -1.3291e+02, -5.8390e+01, -3.2232e+01, 8.8306e+01,
2.2555e+02, 1.3622e+02, 1.4743e+02, 1.3432e+02, -1.7218e+02,
1.7665e+02, -1.0665e+02, 8.9623e+01, 1.0677e+01, -2.4234e+01,
2.9356e+01, 7.2101e+00, -1.4630e+01, -1.4412e+02, -4.6604e+01,
1.4735e+01, -9.4617e+01, 3.1055e+01, -4.9040e+01, -1.9873e+01,
-3.0411e+01, 9.5236e+01, -1.2689e+02, 1.0485e+00, -6.4292e+01,
7.5631e+00, -7.3659e+01, -3.9583e+01, -9.1936e+01, -2.2508e+02,
9.5458e+01, 1.9162e+01, 8.8482e+01, 4.7615e+01, 8.4550e+01,
-3.1497e+01, -1.7538e+01, 6.8801e+01, -5.8801e+01, 9.0267e+01,
-1.4821e+01, 1.1426e+02, 9.5482e+00, 1.0220e+02, -1.2529e+01,
1.6309e+02, 5.3701e+01, -6.3783e+01, 3.1426e+01, 1.3175e+01,
1.9326e+02, -4.6129e+01, -5.3303e+01, -1.2275e+02, -1.7840e+02,
3.1950e+01, 7.6781e+01, -8.5490e+01, -5.9349e+01, 1.0050e+02,
-1.4480e+01, -1.8693e+02, 1.1561e+02, -8.3823e+00, -3.5588e+01,
-4.3535e+01, -9.2998e+01, -6.4526e+01, 1.0525e+02, -1.7919e+02,
2.7628e+01, 4.6942e+01, 5.0252e+01, 1.1318e+02, 2.0096e+01,
1.8756e+01, -6.4313e+00, 1.6498e+02, 2.4491e+02, 7.3965e+01,
6.0643e+01, 9.2237e+01, -4.0097e+01, 3.6914e+01, -8.0463e+01,
-5.4249e+01, 1.7363e+02, 1.0635e+02, -1.6970e+01, -5.3604e+00,
-5.5160e+01, 1.0349e+02, 4.6393e+01, 2.6735e+02, -7.6920e+01,
-5.0864e+01, 1.0435e+02])
3d-speaker-eres2net:
python speakerlab/bin/infer_sv.py --model_id damo/speech_eres2net_sv_zh-cn_16k-common --wavs ../1000003_0f90da0d.wav
[[ 1.07185570e+02 -1.07065971e+02 -9.83850021e+01 1.56792068e+02
5.43182678e+01 -1.11685722e+02 -1.21107147e+02 3.14234644e-01
7.04579697e+01 8.38880310e+01 5.10990715e+00 3.41827049e+01
1.12597008e+02 6.99740143e+01 6.19912004e+00 -3.43529053e+01
5.56977997e+01 -6.64450684e+01 -8.83945847e+01 6.26228409e+01
-6.88637924e+01 -1.29852068e+00 5.35669670e+01 2.55627708e+01
-2.06987167e+02 9.16793060e+01 -6.30156279e-01 2.41465576e+02
4.22328262e+01 -1.72566624e+01 1.65576233e+02 1.07984520e+02
1.57559677e+02 -8.82411480e+00 -5.47780380e+01 2.03471222e+02
1.48955048e+02 4.60318527e+01 -6.01313744e+01 9.90318298e+01
7.39785385e+01 -7.97857380e+00 2.39768631e+02 4.69331627e+01
-3.57535095e+01 1.17741989e+02 2.96053848e+01 -7.21953201e+01
2.29440022e+01 -1.07586067e+02 -1.85831785e-01 8.51243877e+00
-1.02329231e+02 -2.04937988e+02 -5.42851334e+01 -2.45249443e+01
-9.25973237e-01 -5.03754272e+01 1.86448975e+02 4.89762955e+01
2.47452126e+01 -6.86167679e+01 -1.18543961e+02 -1.52475616e+02
-1.22706100e+02 6.05067778e+00 -3.97143860e+01 -4.49127655e+01
9.82851944e+01 -1.88340973e+02 1.35097443e+02 2.03611725e+02
3.51030365e+02 1.19843636e+02 -9.64978485e+01 1.78199295e+02
2.32588028e+02 5.80274467e+01 2.21208496e+02 -2.75861969e+01
-3.57936249e+01 -1.86382172e+02 1.29372208e+02 1.89571442e+02
5.77693710e+01 -1.19212891e+02 -1.19962173e+02 -6.60174255e+01
-9.65898743e+01 9.49873276e+01 1.02552996e+01 -1.32910324e+02
-5.83902588e+01 -3.22321739e+01 8.83063736e+01 2.25545563e+02
1.36215103e+02 1.47429153e+02 1.34316086e+02 -1.72181870e+02
1.76649948e+02 -1.06649170e+02 8.96228561e+01 1.06770563e+01
-2.42341442e+01 2.93561916e+01 7.21006012e+00 -1.46295309e+01
-1.44118622e+02 -4.66039047e+01 1.47351255e+01 -9.46167679e+01
3.10549259e+01 -4.90395889e+01 -1.98732834e+01 -3.04108105e+01
9.52356033e+01 -1.26886902e+02 1.04850996e+00 -6.42918854e+01
7.56308317e+00 -7.36591492e+01 -3.95830841e+01 -9.19360580e+01
-2.25080139e+02 9.54575348e+01 1.91621723e+01 8.84821472e+01
4.76152725e+01 8.45503998e+01 -3.14971561e+01 -1.75380783e+01
6.88007965e+01 -5.88005295e+01 9.02667236e+01 -1.48207588e+01
1.14256233e+02 9.54815102e+00 1.02197952e+02 -1.25286541e+01
1.63087372e+02 5.37007637e+01 -6.37832489e+01 3.14255753e+01
1.31752663e+01 1.93255798e+02 -4.61294022e+01 -5.33031578e+01
-1.22745026e+02 -1.78399048e+02 3.19499016e+01 7.67812576e+01
-8.54898834e+01 -5.93485184e+01 1.00497589e+02 -1.44797382e+01
-1.86931091e+02 1.15612434e+02 -8.38229370e+00 -3.55880775e+01
-4.35352020e+01 -9.29980316e+01 -6.45255356e+01 1.05250122e+02
-1.79191391e+02 2.76280918e+01 4.69419022e+01 5.02516289e+01
1.13177498e+02 2.00961781e+01 1.87558441e+01 -6.43125820e+00
1.64980484e+02 2.44907242e+02 7.39653702e+01 6.06430168e+01
9.22372971e+01 -4.00973434e+01 3.69143677e+01 -8.04630508e+01
-5.42494392e+01 1.73629440e+02 1.06354744e+02 -1.69697266e+01
-5.36038399e+00 -5.51601143e+01 1.03492821e+02 4.63934517e+01
2.67350281e+02 -7.69200592e+01 -5.08637085e+01 1.04347771e+02]]
wespeaker-cli-campplus:
wespeaker --task embedding --audio_file ../1000003_0f90da0d.wav --output_file ab.txt --campplus
tensor([-0.4085, 0.5056, 0.5164, 0.1142, 0.2405, -1.1340, -1.0937, -2.8552,
0.0684, 0.1350, 0.7184, 0.4421, 2.2673, -1.0266, -0.5091, -0.1179,
1.3293, -1.6358, 1.4727, -0.0701, -1.5125, 2.8412, -0.8806, 1.1905,
0.3311, -0.2748, -0.5990, -0.5975, 1.4531, 1.1028, -1.5573, -0.1814,
-0.3212, 0.5015, -0.2099, -1.3896, -1.4281, 0.8482, 1.4982, 0.5027,
0.4770, 0.4092, -0.0815, 0.6415, -1.3073, 1.5903, -0.5586, 1.7382,
-0.7740, 0.5523, -0.5069, 1.0383, 0.9142, 0.4275, 0.2864, 0.5127,
-0.1386, -1.4316, 1.7102, -1.0759, 0.8505, -2.1463, 0.9324, -0.6005,
-0.3228, -1.0593, -2.8073, -0.7224, 0.9778, -0.2866, 0.7632, -1.2476,
1.2914, -1.0816, 0.6496, 1.2780, -0.0391, 2.3277, -0.0749, -1.0742,
0.1455, -0.8960, -1.5815, -0.4504, -0.6127, 0.4211, 0.1415, -0.1980,
0.6075, -0.1637, 1.0157, -0.6854, 1.2261, -0.3732, -1.7594, -0.3167,
0.0160, 1.1844, 1.4816, -0.6894, 0.3778, -0.6784, 0.5630, 0.5130,
1.2627, -0.3130, -0.7394, 0.6165, -0.4759, 0.6413, 1.8477, 1.1122,
-0.4327, 0.2264, -1.0737, -0.5576, 0.4124, 0.1883, 0.4427, -0.7585,
0.2359, -0.5652, 0.0268, 0.7559, 0.7293, -0.5219, 0.9527, -0.8250,
-0.6706, -0.2563, -0.1923, -0.5686, 1.0763, -0.9233, -1.4972, 0.6783,
-0.5416, 0.3079, 0.5045, -0.9913, -0.8806, -0.0645, 0.1034, -1.0668,
0.8099, -1.7790, -0.6506, -0.5829, -0.8696, -0.1689, 0.4327, 1.0911,
0.1510, -0.8487, -0.4323, 1.0686, -0.7400, 1.5168, -0.4868, 0.1017,
0.2029, 0.1093, -0.4116, 0.6395, -0.1783, 0.5028, 0.6391, -1.8752,
0.0061, -1.4073, 0.7114, 0.5963, 0.2454, 0.7059, 1.4148, -0.1019,
1.2026, 0.9365, 0.6566, 0.4939, 0.8144, 0.1156, 1.1940, -1.5440,
1.5463, -0.4443, -0.7009, 0.6040, 1.0413, 0.4659, -1.1119, 0.8619])
3d-speaker-campplus:
python speakerlab/bin/infer_sv.py --model_id damo/speech_campplus_sv_zh-cn_16k-common --wavs ../1000003_0f90da0d.wav
[[-0.4085187 0.50557995 0.51636136 0.1142143 0.2404558 -1.1339788
-1.0937457 -2.8552299 0.06838006 0.13499458 0.71838266 0.44210368
2.2673156 -1.0265858 -0.50908566 -0.11793047 1.3293266 -1.6357919
1.4727495 -0.07005835 -1.51254 2.8411684 -0.8805871 1.1905364
0.3311209 -0.2748037 -0.5990227 -0.5974755 1.4530792 1.1028372
-1.5573118 -0.18140262 -0.32122046 0.5014688 -0.20990026 -1.3896389
-1.4280965 0.84823036 1.4982125 0.5026547 0.4770487 0.4092163
-0.08150318 0.64153427 -1.3073081 1.5903411 -0.55855393 1.7382021
-0.7740348 0.5523105 -0.50692123 1.0383366 0.9142423 0.4274579
0.28643748 0.5127196 -0.13859004 -1.4316454 1.71023 -1.0759138
0.850508 -2.1463156 0.9324371 -0.6005027 -0.32279783 -1.0592827
-2.8073277 -0.72237927 0.9778304 -0.28658646 0.76317513 -1.2475644
1.29144 -1.0815903 0.6496 1.2780393 -0.03912735 2.327661
-0.07493195 -1.0741758 0.14549455 -0.89600277 -1.5814674 -0.4504004
-0.61274517 0.42109856 0.14153782 -0.19801521 0.6075355 -0.16368878
1.0157286 -0.6853902 1.2261169 -0.37317002 -1.7594169 -0.31674182
0.01597187 1.1843703 1.4816221 -0.6894153 0.3778354 -0.6784406
0.562997 0.5129908 1.262701 -0.31300706 -0.7393719 0.6164677
-0.47587308 0.6412958 1.8476795 1.1121718 -0.43268675 0.22637391
-1.0736793 -0.55764985 0.4124422 0.18827613 0.4426846 -0.7585005
0.23592305 -0.5652432 0.02683749 0.7558515 0.72929215 -0.5218993
0.95271045 -0.8250192 -0.67063344 -0.25632948 -0.19230154 -0.56856936
1.0763328 -0.9233105 -1.4971652 0.67828107 -0.54156303 0.3079476
0.50454146 -0.9912834 -0.88064206 -0.06452227 0.10343862 -1.0667944
0.80994976 -1.7790337 -0.6505701 -0.5829258 -0.86960715 -0.1688903
0.4326726 1.0910878 0.15102836 -0.84867907 -0.43227464 1.0685582
-0.7399875 1.5167844 -0.48675457 0.10172446 0.20288429 0.1092979
-0.4115819 0.6395146 -0.1783466 0.50276726 0.63910335 -1.8752003
0.00609797 -1.4072868 0.7114489 0.596313 0.24544685 0.70587957
1.4148142 -0.10190775 1.2026131 0.9365389 0.65656376 0.49392414
0.8144008 0.11556987 1.1940337 -1.5439909 1.5463395 -0.44434774
-0.70085263 0.60402256 1.0413061 0.46588537 -1.1118577 0.8619109 ]]
Use the following code to convert the damo/eres2net model to the wespeaker format:
import torch
def convert_model(path, output_path, adapter_layers=None):
    """Rename the keys of an ERes2Net checkpoint and save the result.

    For every layer prefix in ``adapter_layers`` the entries are renamed as
    follows (the tensors themselves are not modified):

    * ``convs.0`` / ``bns.0``  ->  ``conv2_1`` / ``bn2_1``
    * ``convs.1`` / ``bns.1``  ->  ``convs.0`` / ``bns.0``
    * ``convs.2`` / ``bns.2``  ->  ``convs.1`` / ``bns.1``

    All other entries of the checkpoint are written out unchanged.

    Args:
        path: Path of the checkpoint to read with ``torch.load``.
        output_path: Path the renamed checkpoint is written to with
            ``torch.save``.
        adapter_layers: Optional iterable of layer prefixes to rewrite.
            Defaults to ``layer3.0`` .. ``layer3.5`` and ``layer4.0`` ..
            ``layer4.2``.

    Raises:
        KeyError: If the checkpoint lacks one of the expected keys. Nothing
            is written to ``output_path`` in that case.
    """
    if adapter_layers is None:
        adapter_layers = (
            "layer3.0", "layer3.1", "layer3.2", "layer3.3", "layer3.4", "layer3.5",
            "layer4.0", "layer4.1", "layer4.2",
        )

    # (source index, target conv name, target bn name).
    # The order matters: index 0 must be moved away before index 1 is shifted
    # into its place, and likewise index 1 before index 2.
    renames = (
        (0, "conv2_1", "bn2_1"),
        (1, "convs.0", "bns.0"),
        (2, "convs.1", "bns.1"),
    )
    bn_fields = ("weight", "bias", "running_mean", "running_var", "num_batches_tracked")

    # NOTE: torch.load unpickles the file; only use it on trusted checkpoints.
    states = torch.load(path, map_location='cpu')
    for layer in adapter_layers:
        for src_idx, conv_name, bn_name in renames:
            states[f"{layer}.{conv_name}.weight"] = states.pop(f"{layer}.convs.{src_idx}.weight")
            for field in bn_fields:
                states[f"{layer}.{bn_name}.{field}"] = states.pop(f"{layer}.bns.{src_idx}.{field}")
    torch.save(states, output_path)
if __name__ == "__main__":
    # Convert the locally stored checkpoint; these paths are specific to the
    # author's machine and must be adapted before running.
    convert_model(
        "/Users/user01/code/wespeaker-cli/pre_model/eres2net_commom/avg_model.pt",
        "/Users/user01/code/wespeaker-cli/pre_model/eres2net_commom/avg_model_convert.pt",
    )
Usage: