[cli] support campplus_200k and eres2net_200k models of damo

cdliang11 commented 4 months ago

Usage:

wespeaker --eres2net --task embedding --audio_file ../1000003_0f90da0d.wav --output_file test.txt
wespeaker --campplus --task embedding --audio_file ../1000003_0f90da0d.wav --output_file test.txt

JiJiJiang commented 4 months ago

Good job! BTW, have you ever compared the extracted embedding of the same audio by using wespeaker cli and 3d-speaker inference codes?

cdliang11 commented 4 months ago

> Good job! BTW, have you ever compared the extracted embedding of the same audio by using wespeaker cli and 3d-speaker inference codes?

The outputs of both are the same:

eres2net

wespeaker-cli-eres2net: 
wespeaker --task embedding --audio_file ../1000003_0f90da0d.wav --output_file ab.txt  --eres2net
tensor([ 1.0719e+02, -1.0707e+02, -9.8385e+01,  1.5679e+02,  5.4318e+01,
    -1.1169e+02, -1.2111e+02,  3.1424e-01,  7.0458e+01,  8.3888e+01,
     5.1099e+00,  3.4183e+01,  1.1260e+02,  6.9974e+01,  6.1991e+00,
    -3.4353e+01,  5.5698e+01, -6.6445e+01, -8.8395e+01,  6.2623e+01,
    -6.8864e+01, -1.2985e+00,  5.3567e+01,  2.5563e+01, -2.0699e+02,
     9.1679e+01, -6.3016e-01,  2.4147e+02,  4.2233e+01, -1.7257e+01,
     1.6558e+02,  1.0798e+02,  1.5756e+02, -8.8241e+00, -5.4778e+01,
     2.0347e+02,  1.4896e+02,  4.6032e+01, -6.0131e+01,  9.9032e+01,
     7.3979e+01, -7.9786e+00,  2.3977e+02,  4.6933e+01, -3.5754e+01,
     1.1774e+02,  2.9605e+01, -7.2195e+01,  2.2944e+01, -1.0759e+02,
    -1.8584e-01,  8.5124e+00, -1.0233e+02, -2.0494e+02, -5.4285e+01,
    -2.4525e+01, -9.2597e-01, -5.0375e+01,  1.8645e+02,  4.8976e+01,
     2.4745e+01, -6.8617e+01, -1.1854e+02, -1.5248e+02, -1.2271e+02,
     6.0507e+00, -3.9714e+01, -4.4913e+01,  9.8285e+01, -1.8834e+02,
     1.3510e+02,  2.0361e+02,  3.5103e+02,  1.1984e+02, -9.6498e+01,
     1.7820e+02,  2.3259e+02,  5.8027e+01,  2.2121e+02, -2.7586e+01,
    -3.5794e+01, -1.8638e+02,  1.2937e+02,  1.8957e+02,  5.7769e+01,
    -1.1921e+02, -1.1996e+02, -6.6017e+01, -9.6590e+01,  9.4987e+01,
     1.0255e+01, -1.3291e+02, -5.8390e+01, -3.2232e+01,  8.8306e+01,
     2.2555e+02,  1.3622e+02,  1.4743e+02,  1.3432e+02, -1.7218e+02,
     1.7665e+02, -1.0665e+02,  8.9623e+01,  1.0677e+01, -2.4234e+01,
     2.9356e+01,  7.2101e+00, -1.4630e+01, -1.4412e+02, -4.6604e+01,
     1.4735e+01, -9.4617e+01,  3.1055e+01, -4.9040e+01, -1.9873e+01,
    -3.0411e+01,  9.5236e+01, -1.2689e+02,  1.0485e+00, -6.4292e+01,
     7.5631e+00, -7.3659e+01, -3.9583e+01, -9.1936e+01, -2.2508e+02,
     9.5458e+01,  1.9162e+01,  8.8482e+01,  4.7615e+01,  8.4550e+01,
    -3.1497e+01, -1.7538e+01,  6.8801e+01, -5.8801e+01,  9.0267e+01,
    -1.4821e+01,  1.1426e+02,  9.5482e+00,  1.0220e+02, -1.2529e+01,
     1.6309e+02,  5.3701e+01, -6.3783e+01,  3.1426e+01,  1.3175e+01,
     1.9326e+02, -4.6129e+01, -5.3303e+01, -1.2275e+02, -1.7840e+02,
     3.1950e+01,  7.6781e+01, -8.5490e+01, -5.9349e+01,  1.0050e+02,
    -1.4480e+01, -1.8693e+02,  1.1561e+02, -8.3823e+00, -3.5588e+01,
    -4.3535e+01, -9.2998e+01, -6.4526e+01,  1.0525e+02, -1.7919e+02,
     2.7628e+01,  4.6942e+01,  5.0252e+01,  1.1318e+02,  2.0096e+01,
     1.8756e+01, -6.4313e+00,  1.6498e+02,  2.4491e+02,  7.3965e+01,
     6.0643e+01,  9.2237e+01, -4.0097e+01,  3.6914e+01, -8.0463e+01,
    -5.4249e+01,  1.7363e+02,  1.0635e+02, -1.6970e+01, -5.3604e+00,
    -5.5160e+01,  1.0349e+02,  4.6393e+01,  2.6735e+02, -7.6920e+01,
    -5.0864e+01,  1.0435e+02])

3d-speaker-eres2net:
python speakerlab/bin/infer_sv.py --model_id damo/speech_eres2net_sv_zh-cn_16k-common --wavs ../1000003_0f90da0d.wav
[[ 1.07185570e+02 -1.07065971e+02 -9.83850021e+01  1.56792068e+02
5.43182678e+01 -1.11685722e+02 -1.21107147e+02  3.14234644e-01
7.04579697e+01  8.38880310e+01  5.10990715e+00  3.41827049e+01
1.12597008e+02  6.99740143e+01  6.19912004e+00 -3.43529053e+01
5.56977997e+01 -6.64450684e+01 -8.83945847e+01  6.26228409e+01
-6.88637924e+01 -1.29852068e+00  5.35669670e+01  2.55627708e+01
-2.06987167e+02  9.16793060e+01 -6.30156279e-01  2.41465576e+02
4.22328262e+01 -1.72566624e+01  1.65576233e+02  1.07984520e+02
1.57559677e+02 -8.82411480e+00 -5.47780380e+01  2.03471222e+02
1.48955048e+02  4.60318527e+01 -6.01313744e+01  9.90318298e+01
7.39785385e+01 -7.97857380e+00  2.39768631e+02  4.69331627e+01
-3.57535095e+01  1.17741989e+02  2.96053848e+01 -7.21953201e+01
2.29440022e+01 -1.07586067e+02 -1.85831785e-01  8.51243877e+00
-1.02329231e+02 -2.04937988e+02 -5.42851334e+01 -2.45249443e+01
-9.25973237e-01 -5.03754272e+01  1.86448975e+02  4.89762955e+01
2.47452126e+01 -6.86167679e+01 -1.18543961e+02 -1.52475616e+02
-1.22706100e+02  6.05067778e+00 -3.97143860e+01 -4.49127655e+01
9.82851944e+01 -1.88340973e+02  1.35097443e+02  2.03611725e+02
3.51030365e+02  1.19843636e+02 -9.64978485e+01  1.78199295e+02
2.32588028e+02  5.80274467e+01  2.21208496e+02 -2.75861969e+01
-3.57936249e+01 -1.86382172e+02  1.29372208e+02  1.89571442e+02
5.77693710e+01 -1.19212891e+02 -1.19962173e+02 -6.60174255e+01
-9.65898743e+01  9.49873276e+01  1.02552996e+01 -1.32910324e+02
-5.83902588e+01 -3.22321739e+01  8.83063736e+01  2.25545563e+02
1.36215103e+02  1.47429153e+02  1.34316086e+02 -1.72181870e+02
1.76649948e+02 -1.06649170e+02  8.96228561e+01  1.06770563e+01
-2.42341442e+01  2.93561916e+01  7.21006012e+00 -1.46295309e+01
-1.44118622e+02 -4.66039047e+01  1.47351255e+01 -9.46167679e+01
3.10549259e+01 -4.90395889e+01 -1.98732834e+01 -3.04108105e+01
9.52356033e+01 -1.26886902e+02  1.04850996e+00 -6.42918854e+01
7.56308317e+00 -7.36591492e+01 -3.95830841e+01 -9.19360580e+01
-2.25080139e+02  9.54575348e+01  1.91621723e+01  8.84821472e+01
4.76152725e+01  8.45503998e+01 -3.14971561e+01 -1.75380783e+01
6.88007965e+01 -5.88005295e+01  9.02667236e+01 -1.48207588e+01
1.14256233e+02  9.54815102e+00  1.02197952e+02 -1.25286541e+01
1.63087372e+02  5.37007637e+01 -6.37832489e+01  3.14255753e+01
1.31752663e+01  1.93255798e+02 -4.61294022e+01 -5.33031578e+01
-1.22745026e+02 -1.78399048e+02  3.19499016e+01  7.67812576e+01
-8.54898834e+01 -5.93485184e+01  1.00497589e+02 -1.44797382e+01
-1.86931091e+02  1.15612434e+02 -8.38229370e+00 -3.55880775e+01
-4.35352020e+01 -9.29980316e+01 -6.45255356e+01  1.05250122e+02
-1.79191391e+02  2.76280918e+01  4.69419022e+01  5.02516289e+01
1.13177498e+02  2.00961781e+01  1.87558441e+01 -6.43125820e+00
1.64980484e+02  2.44907242e+02  7.39653702e+01  6.06430168e+01
9.22372971e+01 -4.00973434e+01  3.69143677e+01 -8.04630508e+01
-5.42494392e+01  1.73629440e+02  1.06354744e+02 -1.69697266e+01
-5.36038399e+00 -5.51601143e+01  1.03492821e+02  4.63934517e+01
2.67350281e+02 -7.69200592e+01 -5.08637085e+01  1.04347771e+02]]

campplus

wespeaker-cli-campplus
wespeaker --task embedding --audio_file ../1000003_0f90da0d.wav --output_file ab.txt  --campplus
tensor([-0.4085,  0.5056,  0.5164,  0.1142,  0.2405, -1.1340, -1.0937, -2.8552,
     0.0684,  0.1350,  0.7184,  0.4421,  2.2673, -1.0266, -0.5091, -0.1179,
     1.3293, -1.6358,  1.4727, -0.0701, -1.5125,  2.8412, -0.8806,  1.1905,
     0.3311, -0.2748, -0.5990, -0.5975,  1.4531,  1.1028, -1.5573, -0.1814,
    -0.3212,  0.5015, -0.2099, -1.3896, -1.4281,  0.8482,  1.4982,  0.5027,
     0.4770,  0.4092, -0.0815,  0.6415, -1.3073,  1.5903, -0.5586,  1.7382,
    -0.7740,  0.5523, -0.5069,  1.0383,  0.9142,  0.4275,  0.2864,  0.5127,
    -0.1386, -1.4316,  1.7102, -1.0759,  0.8505, -2.1463,  0.9324, -0.6005,
    -0.3228, -1.0593, -2.8073, -0.7224,  0.9778, -0.2866,  0.7632, -1.2476,
     1.2914, -1.0816,  0.6496,  1.2780, -0.0391,  2.3277, -0.0749, -1.0742,
     0.1455, -0.8960, -1.5815, -0.4504, -0.6127,  0.4211,  0.1415, -0.1980,
     0.6075, -0.1637,  1.0157, -0.6854,  1.2261, -0.3732, -1.7594, -0.3167,
     0.0160,  1.1844,  1.4816, -0.6894,  0.3778, -0.6784,  0.5630,  0.5130,
     1.2627, -0.3130, -0.7394,  0.6165, -0.4759,  0.6413,  1.8477,  1.1122,
    -0.4327,  0.2264, -1.0737, -0.5576,  0.4124,  0.1883,  0.4427, -0.7585,
     0.2359, -0.5652,  0.0268,  0.7559,  0.7293, -0.5219,  0.9527, -0.8250,
    -0.6706, -0.2563, -0.1923, -0.5686,  1.0763, -0.9233, -1.4972,  0.6783,
    -0.5416,  0.3079,  0.5045, -0.9913, -0.8806, -0.0645,  0.1034, -1.0668,
     0.8099, -1.7790, -0.6506, -0.5829, -0.8696, -0.1689,  0.4327,  1.0911,
     0.1510, -0.8487, -0.4323,  1.0686, -0.7400,  1.5168, -0.4868,  0.1017,
     0.2029,  0.1093, -0.4116,  0.6395, -0.1783,  0.5028,  0.6391, -1.8752,
     0.0061, -1.4073,  0.7114,  0.5963,  0.2454,  0.7059,  1.4148, -0.1019,
     1.2026,  0.9365,  0.6566,  0.4939,  0.8144,  0.1156,  1.1940, -1.5440,
     1.5463, -0.4443, -0.7009,  0.6040,  1.0413,  0.4659, -1.1119,  0.8619])

3d-speaker-campplus
python speakerlab/bin/infer_sv.py --model_id damo/speech_campplus_sv_zh-cn_16k-common --wavs ../1000003_0f90da0d.wav
[[-0.4085187   0.50557995  0.51636136  0.1142143   0.2404558  -1.1339788
  -1.0937457  -2.8552299   0.06838006  0.13499458  0.71838266  0.44210368
   2.2673156  -1.0265858  -0.50908566 -0.11793047  1.3293266  -1.6357919
   1.4727495  -0.07005835 -1.51254     2.8411684  -0.8805871   1.1905364
   0.3311209  -0.2748037  -0.5990227  -0.5974755   1.4530792   1.1028372
  -1.5573118  -0.18140262 -0.32122046  0.5014688  -0.20990026 -1.3896389
  -1.4280965   0.84823036  1.4982125   0.5026547   0.4770487   0.4092163
  -0.08150318  0.64153427 -1.3073081   1.5903411  -0.55855393  1.7382021
  -0.7740348   0.5523105  -0.50692123  1.0383366   0.9142423   0.4274579
   0.28643748  0.5127196  -0.13859004 -1.4316454   1.71023    -1.0759138
   0.850508   -2.1463156   0.9324371  -0.6005027  -0.32279783 -1.0592827
  -2.8073277  -0.72237927  0.9778304  -0.28658646  0.76317513 -1.2475644
   1.29144    -1.0815903   0.6496      1.2780393  -0.03912735  2.327661
  -0.07493195 -1.0741758   0.14549455 -0.89600277 -1.5814674  -0.4504004
  -0.61274517  0.42109856  0.14153782 -0.19801521  0.6075355  -0.16368878
   1.0157286  -0.6853902   1.2261169  -0.37317002 -1.7594169  -0.31674182
   0.01597187  1.1843703   1.4816221  -0.6894153   0.3778354  -0.6784406
   0.562997    0.5129908   1.262701   -0.31300706 -0.7393719   0.6164677
  -0.47587308  0.6412958   1.8476795   1.1121718  -0.43268675  0.22637391
  -1.0736793  -0.55764985  0.4124422   0.18827613  0.4426846  -0.7585005
   0.23592305 -0.5652432   0.02683749  0.7558515   0.72929215 -0.5218993
   0.95271045 -0.8250192  -0.67063344 -0.25632948 -0.19230154 -0.56856936
   1.0763328  -0.9233105  -1.4971652   0.67828107 -0.54156303  0.3079476
   0.50454146 -0.9912834  -0.88064206 -0.06452227  0.10343862 -1.0667944
   0.80994976 -1.7790337  -0.6505701  -0.5829258  -0.86960715 -0.1688903
   0.4326726   1.0910878   0.15102836 -0.84867907 -0.43227464  1.0685582
  -0.7399875   1.5167844  -0.48675457  0.10172446  0.20288429  0.1092979
  -0.4115819   0.6395146  -0.1783466   0.50276726  0.63910335 -1.8752003
   0.00609797 -1.4072868   0.7114489   0.596313    0.24544685  0.70587957
   1.4148142  -0.10190775  1.2026131   0.9365389   0.65656376  0.49392414
   0.8144008   0.11556987  1.1940337  -1.5439909   1.5463395  -0.44434774
  -0.70085263  0.60402256  1.0413061   0.46588537 -1.1118577   0.8619109 ]]

cdliang11 commented 4 months ago

Use the following code to convert the damo/eres2net model to the wespeaker format:

import torch

def convert_model(path, output_path):
    """Rename parameter keys in a saved state dict and write the result.

    The checkpoint at ``path`` is loaded onto the CPU. For each of the
    blocks ``layer3.0`` .. ``layer3.5`` and ``layer4.0`` .. ``layer4.2``
    the conv/batch-norm entries are renamed as follows:

        convs.0 / bns.0  ->  conv2_1 / bn2_1
        convs.1 / bns.1  ->  convs.0 / bns.0
        convs.2 / bns.2  ->  convs.1 / bns.1

    All other entries are left as they are. The updated mapping is then
    saved to ``output_path``.

    Args:
        path: Location of the checkpoint to read.
        output_path: Location the converted checkpoint is written to.

    Raises:
        KeyError: If one of the expected source keys is absent.
    """
    checkpoint = torch.load(path, map_location='cpu')

    # Blocks whose parameters get renamed: six in layer3, three in layer4.
    block_names = [f"layer3.{i}" for i in range(6)]
    block_names += [f"layer4.{i}" for i in range(3)]

    # Per-batch-norm entries that accompany every conv weight.
    bn_fields = ("weight", "bias", "running_mean", "running_var",
                 "num_batches_tracked")

    # (old conv, old bn, new conv, new bn). The order is significant:
    # index 0 has to be moved away before index 1 is shifted into its
    # place, and likewise index 1 before index 2.
    rename_plan = (
        ("convs.0", "bns.0", "conv2_1", "bn2_1"),
        ("convs.1", "bns.1", "convs.0", "bns.0"),
        ("convs.2", "bns.2", "convs.1", "bns.1"),
    )

    for block in block_names:
        for old_conv, old_bn, new_conv, new_bn in rename_plan:
            conv_weight = checkpoint.pop(f"{block}.{old_conv}.weight")
            checkpoint[f"{block}.{new_conv}.weight"] = conv_weight
            for field in bn_fields:
                bn_value = checkpoint.pop(f"{block}.{old_bn}.{field}")
                checkpoint[f"{block}.{new_bn}.{field}"] = bn_value

    torch.save(checkpoint, output_path)

if __name__ == "__main__":
    # Script entry point: convert one checkpoint, reading the first path
    # and writing the converted copy to the second.
    source_ckpt = "/Users/user01/code/wespeaker-cli/pre_model/eres2net_commom/avg_model.pt"
    converted_ckpt = "/Users/user01/code/wespeaker-cli/pre_model/eres2net_commom/avg_model_convert.pt"
    convert_model(source_ckpt, converted_ckpt)

wenet-e2e / wespeaker

[cli] support campplus_200k and eres2net_200k models of damo #281