MTG / essentia

C++ library for audio and music analysis, description and synthesis, including Python bindings
http://essentia.upf.edu
GNU Affero General Public License v3.0
2.83k stars 530 forks source link

implement spectrum-to-cent scale #481

Closed georgid closed 7 years ago

georgid commented 8 years ago

Triangular filterbank to compute cent-scale from frequency bands: python code to compute it:

import essentia
import essentia.standard
from essentia.standard import *

import matplotlib.pyplot as plt
import numpy as np

import scipy.io

frequencyBands_default = [164.000000000000, 164.949851489592, 165.905204307551, 166.866090316347, 167.832541562989, 168.804590280096, 169.782268886970, 170.765609990681, 171.754646387148, 172.749411062240, 173.749937192872, 174.756258148110, 175.768407490290, 176.786418976130, 177.810326557862, 178.840164384360, 179.875966802283, 180.917768357215, 181.965603794823, 183.019508062011, 184.079516308091, 185.145663885948, 186.217986353225, 187.296519473508, 188.381299217515, 189.472361764301, 190.569743502461, 191.673481031345, 192.783611162277, 193.900170919785, 195.023197542835, 196.152728486073, 197.288801421072, 198.431454237594, 199.580725044846, 200.736652172759, 201.899274173258, 203.068629821557, 204.244758117443, 205.427698286584, 206.617489781833, 207.814172284546, 209.017785705904, 210.228370188245, 211.445966106401, 212.670614069050, 213.902354920061, 215.141229739866, 216.387279846823, 217.640546798598, 218.901072393550, 220.168898672125, 221.444067918256, 222.726622660776, 224.016605674837, 225.314059983334, 226.619028858341, 227.931555822554, 229.251684650743, 230.579459371211, 231.914924267266, 233.258123878692, 234.609103003237, 235.967906698112, 237.334580281484, 238.709169333995, 240.091719700282, 241.482277490501, 242.880889081869, 244.287601120209, 245.702460521507, 247.125514473474, 248.556810437125, 249.996396148355, 251.444319619539, 252.900629141124, 254.365373283248, 255.838600897355, 257.320361117827, 258.810703363620, 260.309677339913, 261.817333039770, 263.333720745798, 264.858891031835, 266.392894764629, 267.935783105536, 269.487607512230, 271.048419740412, 272.618271845546, 274.197216184585, 275.785305417723, 277.382592510153, 278.989130733827, 280.604973669238, 282.230175207205, 283.864789550670, 285.508871216507, 287.162475037340, 288.825656163371, 290.498470064218, 292.180972530771, 293.873219677043, 295.575267942051, 297.287174091692, 299.008995220640, 300.740788754248, 302.482612450462, 304.234524401752, 305.996583037046, 307.768847123678, 309.551375769351, 311.344228424105, 313.147464882302, 314.961145284621, 316.785330120061, 318.620080227959, 320.465456800022, 322.321521382364, 324.188335877562, 326.065962546717, 327.954464011535, 329.853903256410, 331.764343630531, 333.685848849987, 335.618482999900, 337.562310536558, 339.517396289565, 341.483805464003, 343.461603642610, 345.450856787960, 347.451631244671, 349.463993741615, 351.488011394140, 353.523751706313, 355.571282573168, 357.630672282973, 359.701989519508, 361.785303364350, 363.880683299185, 365.988199208120, 368.107921380015, 370.239920510828, 372.384267705971, 374.541034482684, 376.710292772419, 378.892114923238, 381.086573702228, 383.293742297926, 385.513694322760, 387.746503815507, 389.992245243757, 392.250993506403, 394.522823936134, 396.807812301949, 399.106034811684, 401.417568114555, 403.742489303712, 406.080875918813, 408.432805948607, 410.798357833536, 413.177610468353, 415.570643204750, 417.977535854010, 420.398368689661, 422.833222450159, 425.282178341581, 427.745318040327, 430.222723695854, 432.714477933407, 435.220663856778, 437.741365051080, 440.276665585528, 442.826650016253, 445.391403389112, 447.971011242529, 450.565559610349, 453.175135024706, 455.799824518908, 458.439715630340, 461.094896403385, 463.765455392360, 466.451481664467, 469.153064802767, 471.870294909165, 474.603262607418, 477.352059046154, 480.116775901914, 482.897505382208, 485.694340228595, 488.507373719767, 491.336699674671, 494.182412455628, 497.044606971488, 499.923378680788, 502.818823594944, 505.731038281446, 508.660119867083, 511.606166041178, 514.569275058851, 517.549545744294, 520.547077494067, 523.561970280410, 526.594324654583, 529.644241750215, 532.711823286678, 535.797171572482, 538.900389508684, 542.021580592320, 545.160848919859, 548.318299190673, 551.494036710530, 554.688167395106, 557.900797773515, 561.132034991865, 564.381986816832, 567.650761639251, 570.938468477731, 574.245216982297, 577.571117438037, 580.916280768791, 584.280818540841, 587.664842966639, 591.068466908544, 594.491803882591, 597.934968062273, 601.398074282351, 604.881238042684, 608.384575512079, 611.908203532169, 615.452239621305, 619.016801978480, 622.602009487270, 626.207981719795, 629.834838940714, 633.482702111229, 637.151692893125, 640.841933652822, 644.553547465461, 648.286658119008, 652.041390118378, 655.817868689593, 659.616219783956, 663.436570082252, 667.279046998973, 671.143778686566, 675.030894039709, 678.940522699611, 682.872795058333, 686.827842263136, 690.805796220859, 694.806789602316, 698.830955846719, 702.878429166131, 706.949344549944, 711.043837769374, 715.162045381997, 719.304104736298, 723.470153976256, 727.660332045948, 731.874778694183, 736.113634479165, 740.377040773182, 744.665139767315, 748.978074476186, 753.315988742727, 757.679027242973, 762.067335490894, 766.481059843242, 770.920347504436, 775.385346531470, 779.876205838851, 784.393075203564, 788.936105270073, 793.505447555337, 798.101254453871, 802.723679242821, 807.372876087085, 812.049000044446, 816.752207070748, 821.482654025098, 826.240498675094, 831.025899702090, 835.839016706488, 840.680010213057, 845.549041676293, 850.446273485799, 855.371868971702, 860.325992410102, 865.308809028547, 870.320485011550, 875.361187506125, 880.431084627365, 885.530345464048, 890.659140084277, 895.817639541151, 901.006015878472, 906.224442136479, 911.473092357623, 916.752141592370, 922.061765905039, 927.402142379675, 932.773449125955, 938.175865285124, 943.609571035977, 949.074747600864, 954.571577251732, 960.100243316210, 965.660930183717, 971.253823311615, 976.879109231396, 982.536975554898, 988.227610980566, 993.951205299746, 999.707949403010, 1005.49803528653, 1011.32165605847, 1017.17900594544, 1023.07028029897, 1028.99567560201, 1034.95538947551, 1040.94962068499, 1046.97856914717, 1053.04243593665, 1059.14142329261, 1065.27573462555, 1071.44557452408, 1077.65114876176, 1083.89266430390, 1090.17032931455, 1096.48435316338, 1102.83494643268, 1109.22232092441, 1115.64668966720, 1122.10826692353, 1128.60726819683, 1135.14391023866, 1141.71841105598, 1148.33098991839, 1154.98186736546, 1161.67126521404, 1168.39940656575, 1175.16651581431, 1181.97281865312, 1188.81854208272, 1195.70391441840, 1202.62916529779, 1209.59452568855, 1216.60022789603, 1223.64650557106, 1230.73359371771, 1237.86172870114, 1245.03114825550, 1252.24209149183, 1259.49479890606, 1266.78951238699, 1274.12647522443, 1281.50593211724, 1288.92812918152, 1296.39331395883, 1303.90173542444, 1311.45364399563, 1319.04929154001, 1326.68893138400, 1334.37281832120, 1342.10120862091, 1349.87436003671, 1357.69253181502, 1365.55598470375, 1373.46498096101, 1381.41978436386, 1389.42066021709, 1397.46787536207, 1405.56169818566, 1413.70239862916, 1421.89024819732, 1430.12551996735, 1438.40848859811, 1446.73943033918, 1455.11862304015, 1463.54634615983, 1472.02288077560, 1480.54850959278, 1489.12351695405, 1497.74818884896, 1506.42281292342, 1515.14767848933, 1523.92307653424, 1532.74929973100, 1541.62664244758, 1550.55540075687, 1559.53587244652, 1568.56835702893, 1577.65315575119, 1586.79057160515, 1595.98090933752, 1605.22447546004, 1614.52157825970, 1623.87252780902, 1633.27763597637, 1642.73721643643, 1652.25158468057, 1661.82105802746, 1671.44595563357, 1681.12659850387, 1690.86330950254, 1700.65641336367, 1710.50623670219, 1720.41310802468, 1730.37735774035, 1740.39931817209, 1750.47932356751, 1760.61771011013, 1770.81481593056, 1781.07098111777, 1791.38654773047, 1801.76185980849, 1812.19726338428, 1822.69310649439, 1833.24973919117, 1843.86751355437, 1854.54678370290, 1865.28790580667, 1876.09123809842, 1886.95714088572, 1897.88597656295, 1908.87810962340, 1919.93390667141, 1931.05373643464, 1942.23796977631, 1953.48697970761, 1964.80114140013, 1976.18083219836, 1987.62643163230, 1999.13832143009, 2010.71688553076, 2022.36251009703, 2034.07558352819, 2045.85649647304, 2057.70564184295, 2069.62341482494, 2081.61021289485, 2093.66643583062, 2105.79248572563, 2117.98876700208, 2130.25568642449, 2142.59365311328, 2155.00307855839, 2167.48437663305, 2180.03796360751, 2192.66425816299, 2205.36368140561, 2218.13665688045, 2230.98361058567, 2243.90497098670, 2256.90116903057, 2269.97263816023, 2283.11981432906, 2296.34313601538, 2309.64304423707, 2323.01998256630, 2336.47439714430, 2350.00673669625, 2363.61745254626, 2377.30699863239, 2391.07583152183, 2404.92441042608, 2418.85319721631, 2432.86265643875, 2446.95325533015, 2461.12546383342, 2475.37975461326, 2489.71660307195, 2504.13648736517, 2518.63988841800, 2533.22728994092, 2547.89917844595, 2562.65604326287, 2577.49837655558, 2592.42667333845, 2607.44143149289, 2622.54315178390, 2637.73233787683, 2653.00949635412, 2668.37513673225, 2683.82977147868, 2699.37391602898, 2715.00808880401, 2730.73281122720, 2746.54860774196, 2762.45600582915, 2778.45553602469, 2794.54773193723, 2810.73313026598, 2827.01227081859, 2843.38569652914, 2859.85395347629, 2876.41759090143, 2893.07716122707, 2909.83322007521, 2926.68632628589, 2943.63704193583, 2960.68593235717, 2977.83356615633, 2995.08051523297, 3012.42735479908, 3029.87466339811, 3047.42302292436, 3065.07301864229, 3082.82523920610, 3100.68027667933, 3118.63872655464, 3136.70118777364, 3154.86826274688, 3173.14055737392, 3191.51868106358, 3210.00324675424, 3228.59487093426, 3247.29417366258, 3266.10177858938, 3285.01831297689, 3304.04440772028, 3323.18069736876, 3342.42782014667, 3361.78641797483, 3381.25713649192, 3400.84062507600, 3420.53753686619, 3440.34852878446, 3460.27426155751, 3480.31539973883, 3500.47261173084, 3520.74656980722, 3541.13795013531, 3561.64743279864, 3582.27570181963, 3603.02344518244, 3623.89135485586, 3644.88012681639, 3665.99046107151, 3687.22306168296, 3708.57863679025, 3730.05789863429, 3751.66156358112, 3773.39035214583, 3795.24498901654, 3817.22620307862, 3839.33472743900, 3861.57129945058, 3883.93666073684, 3906.43155721660, 3929.05673912885, 3951.81296105782, 3974.70098195812, 3997.72156518003, 4020.87547849502, 4044.16349412131, 4067.58638874962, 4091.14494356911, 4114.83994429340, 4138.67218118680, 4162.64244909065, 4186.75154744984, 4211.00028033944, 4235.38945649158, 4259.91988932237, 4284.59239695904, 4309.40780226723, 4334.36693287846, 4359.47062121767, 4384.71970453105, 4410.11502491392, 4435.65742933883, 4461.34776968380, 4487.18690276075, 4513.17569034406, 4539.31499919932, 4565.60570111221, 4592.04867291764, 4618.64479652890, 4645.39495896718, 4672.30005239106, 4699.36097412631, 4726.57862669584, 4753.95391784974, 4781.48776059559, 4809.18107322891, 4837.03477936380, 4865.04980796371, 4893.22709337244, 4921.56757534530, 4950.07219908047, 4978.74191525049, 5007.57768003397, 5036.58045514755, 5065.75120787785, 5095.09091111385, 5124.60054337925, 5154.28108886517, 5184.13353746292, 5214.15888479707, 5244.35813225861, 5274.73228703835, 5305.28236216055, 5336.00937651666, 5366.91435489933, 5397.99832803660, 5429.26233262622, 5460.70741137030, 5492.33461301002, 5524.14499236064, 5556.13961034670, 5588.31953403735, 5620.68583668198, 5653.23959774600, 5685.98190294683, 5718.91384429015, 5752.03652010628, 5785.35103508682, 5818.85850032150, 5852.56003333523, 5886.45675812540, 5920.54980519931, 5954.84031161193, 5989.32942100379, 6024.01828363913, 6058.90805644426, 6093.99990304616, 6129.29499381128, 6164.79450588456, 6200.49962322871, 6236.41153666368, 6272.53144390640, 6308.86054961070, 6345.40006540748, 6382.15120994518, 6419.11520893033, 6456.29329516852, 6493.68670860544, 6531.29669636831, 6569.12451280739, 6607.17141953789, 6645.43868548201, 6683.92758691125, 6722.63940748899, 6761.57543831332, 6800.73697796008, 6840.12533252614, 6879.74181567306, 6919.58774867075, 6959.66446044170, 6999.97328760516, 7040.51557452183, 7081.29267333861, 7122.30594403376, 7163.55675446221, 7205.04648040124, 7246.77650559628, 7288.74822180711, 7330.96302885431, 7373.42233466589, 7416.12755532425, 7459.08011511342, 7502.28144656662, 7545.73299051391, 7589.43619613038, 7633.39252098437, 7677.60343108616, 7722.07040093684, 7766.79491357745, 7811.77846063851, 7857.02254238971, 7902.52866778998, 7948.29835453778, 7994.33312912178, 8040.63452687170, 8087.20409200957, 8134.04337770120, 8181.15394610798, 8228.53736843901, 8276.19522500347, 8324.12910526335, 8372.34060788644, 8420.83134079967, 8469.60292124270, 8518.65697582192, 8567.99514056463, 8617.61906097364, 8667.53039208217, 8717.73079850900, 8768.22195451401, 8819.00554405404, 8870.08326083901, 8921.45680838846, 8973.12790008830, 9025.09825924802, 9077.36961915813, 9129.94372314795, 9182.82232464380, 9236.00718722745, 9289.50008469492, 9343.30280111571, 9397.41713089220, 9451.84487881958, 9506.58786014602, 9561.64790063317, 9617.02683661711, 9672.72651506957, 9728.74879365952, 9785.09554081512, 9841.76863578609, 9898.76996870632, 9956.10144065693, 10013.7649637297, 10071.7624610908, 10130.0958670449, 10188.7671270999, 10247.7781980315, 10307.1310479486, 10366.8276563590, 10426.8700142354, 10487.2601240818, 10548]
fs_default = 44100
frameSize_default = 4410
hopSize_default = 882
zeroPadding_default = 32768 - frameSize_default

loader = essentia.standard.MonoLoader(filename = 'piano.wav', sampleRate = fs_default)
audio = loader()

w = Windowing(type = 'hamming', size = frameSize_default, zeroPadding = zeroPadding_default)
spectrum = Spectrum()
triangularBands = TriangularBands(frequencyBands = frequencyBands_default)

spectro = []
filtbands = []
for frame in FrameGenerator(audio, frameSize = frameSize_default, hopSize = hopSize_default):
    spec = spectrum(w(frame))
    spectro.append(spec)
    tribands = triangularBands(spec)
    filtbands.append(tribands)

# Plot one frame
plt.figure
plt.plot(filtbands[0])

plt.figure
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
# Plot spectrogram computed by 'spectgram' function
data, freqs, bins, im = ax1.specgram(audio, NFFT=frameSize_default, Fs=fs_default, noverlap=(frameSize_default - hopSize_default))
ax1.axis('tight')

# Plot spectrogram computed by Essentia
spectro = np.asarray(spectro)
spectro = spectro.T
ax2.pcolormesh(np.arange(len(spectro[0])), np.arange(len(spectro)), spectro)
ax2.axis('tight')

filtbands = np.asarray(filtbands)
filtbands = filtbands.T

# Plot filtered output
ax3.pcolormesh(np.arange(len(filtbands[0])), np.arange(len(filtbands)), filtbands)
ax3.axis('tight')

plt.show()

# Save to Matlab .mat format
scipy.io.savemat('filtbands.mat', mdict={'filtbands': filtbands})

NOTE: Each bin has 10 cents as in the implementation in [(https://github.com/MTG/essentia/blob/528c57494ae3322ffed4442f44082ef0608c0ee8/src/algorithms/tonal/pitchsaliencefunction.cpp)] However in the pitch salience, a frequency's magnitude content is distributed to 1) its corresponding bin + 2) the bins within one semitone up and one semitone below e.g. in the range (h_bin-_binsInSemitone ; h_bin + binsInSemitone) with weigths _nearestBinsWeights Unlike that this implementation maps a couple of freq bins with weights given by the triangular bands to one freq bin. Note that spectrum has big zero-padding to get a very big window for good resolution.

ported from the implementation from MATLAB of the fluctogram feature 2.1 from the paper https://www.researchgate.net/publication/262259403_On_the_reduction_of_false_positives_in_singing_voice_detection

georgid commented 8 years ago

Function to create_filter

[x, fs] = audioread('piano.wav');

no_filters = 720;   % 
lo_fq = 164;   % E3
hi_fq = 10548;   % E9
R = [lo_fq hi_fq ];
spectrum_size = 1025;

hz2scale = @( hz )(120*log2(hz/440));
scale2hz = @( scale )(power(2, ((scale)/120))*440);

[H, freq, C] = trifbank(no_filters, spectrum_size, R, fs, hz2scale, scale2hz);

% Plot all filters
figure
for i = 1 : no_filters
    plot(H(i,:))
    hold on
end

% center bins
center_f = C(2:end-1);

where trifbank is from htk_mfcc :

`function [ H, f, c ] = trifbank( M, K, R, fs, h2w, w2h ) % TRIFBANK Triangular filterbank. % % [H,F,C]=TRIFBANK(M,K,R,FS,H2W,W2H) returns matrix of M triangular filters % (one per row), each K coefficients long along with a K coefficient long % frequency vector F and M+2 coefficient long cutoff frequency vector C. % The triangular filters are between limits given in R (Hz) and are % uniformly spaced on a warped scale defined by forward (H2W) and backward % (W2H) warping functions. % % Inputs % M is the number of filters, i.e., number of rows of H % % K is the length of frequency response of each filter % i.e., number of columns of H % % R is a two element vector that specifies frequency limits (Hz), % i.e., R = [ low_frequency high_frequency ]; % % FS is the sampling frequency (Hz) % % H2W is a Hertz scale to warped scale function handle % % W2H is a wared scale to Hertz scale function handle % % Outputs % H is a M by K triangular filterbank matrix (one filter per row) % % F is a frequency vector (Hz) of 1xK dimension % % C is a vector of filter cutoff frequencies (Hz), % note that C(2:end) also represents filter center frequencies, % and the dimension of C is 1x(M+2) % % Example % fs = 16000; % sampling frequency (Hz) % nfft = 2^12; % fft size (number of frequency bins) % K = nfft/2+1; % length of each filter % M = 23; % number of filters % % hz2mel = @(hz)(1127_log(1+hz/700)); % Hertz to mel warping function % mel2hz = @(mel)(700_exp(mel/1127)-700); % mel to Hertz warping function % % % Design mel filterbank of M filters each K coefficients long, % % filters are uniformly spaced on the mel scale between 0 and Fs/2 Hz % [ H1, freq ] = trifbank( M, K, [0 fs/2], fs, hz2mel, mel2hz ); % % % Design mel filterbank of M filters each K coefficients long, % % filters are uniformly spaced on the mel scale between 300 and 3750 Hz % [ H2, freq ] = trifbank( M, K, [300 3750], fs, hz2mel, mel2hz ); % % % Design mel filterbank of 18 filters each K coefficients long, % % filters are uniformly spaced on the Hertz scale between 4 and 6 kHz % [ H3, freq ] = trifbank( 18, K, [4 6]*1E3, fs, @(h)(h), @(h)(h) ); % % hfig = figure('Position', [25 100 800 600], 'PaperPositionMode', ... % 'auto', 'Visible', 'on', 'color', 'w'); hold on; % subplot( 3,1,1 ); % plot( freq, H1 ); % xlabel( 'Frequency (Hz)' ); ylabel( 'Weight' ); set( gca, 'box', 'off' ); %
% subplot( 3,1,2 ); % plot( freq, H2 ); % xlabel( 'Frequency (Hz)' ); ylabel( 'Weight' ); set( gca, 'box', 'off' ); %
% subplot( 3,1,3 ); % plot( freq, H3 ); % xlabel( 'Frequency (Hz)' ); ylabel( 'Weight' ); set( gca, 'box', 'off' ); % % Reference % [1] Huang, X., Acero, A., Hon, H., 2001. Spoken Language Processing: % A guide to theory, algorithm, and system development. % Prentice Hall, Upper Saddle River, NJ, USA (pp. 314-315).

% Author Kamil Wojcicki, UTD, June 2011

if( nargin~= 6 ), help trifbank; return; end; % very lite input validation

f_min = 0;          % filter coefficients start at this frequency (Hz)
f_low = R(1);       % lower cutoff frequency (Hz) for the filterbank 
f_high = R(2);      % upper cutoff frequency (Hz) for the filterbank 
f_max = 0.5*fs;     % filter coefficients end at this frequency (Hz)
f = linspace( f_min, f_max, K ); % frequency range (Hz), size 1xK
fw = h2w( f );

% filter cutoff frequencies (Hz) for all filters, size 1x(M+2)
c = w2h( h2w(f_low)+[0:M+1]*((h2w(f_high)-h2w(f_low))/(M+1)) );
cw = h2w( c );

H = zeros( M, K );                  % zero otherwise
for m = 1:M 

    % implements Eq. (6.140) on page 314 of [1] 
    % k = f>=c(m)&f<=c(m+1); % up-slope
    % H(m,k) = 2*(f(k)-c(m)) / ((c(m+2)-c(m))*(c(m+1)-c(m)));
    % k = f>=c(m+1)&f<=c(m+2); % down-slope
    % H(m,k) = 2*(c(m+2)-f(k)) / ((c(m+2)-c(m))*(c(m+2)-c(m+1)));

    % implements Eq. (6.141) on page 315 of [1]
    k = f>=c(m)&f<=c(m+1); % up-slope
    H(m,k) = (f(k)-c(m))/(c(m+1)-c(m));
    k = f>=c(m+1)&f<=c(m+2); % down-slope
    H(m,k) = (c(m+2)-f(k))/(c(m+2)-c(m+1));

end

% H = H./repmat(max(H,[],2),1,K); % normalize to unit height (inherently done) % H = H./repmat(trapz(f,H,2),1,K); % normalize to unit area

% EOF `

dbogdanov commented 7 years ago

Implement new standard algorithm SpectrumToCent which computes triangular bands and has

Re-use TriangularBands algorithm inside compute() and also re-use the code for TriangularBands as a template.

dbogdanov commented 7 years ago

After reviewing, it seems that it is better (and simpler) to provide number of cent bins instead of maximum frequency as a parameter. The reasons are:

@pabloEntropia, @georgid

palonso commented 7 years ago

I have created my propose for the algorithm, #495 . I tried to stick to your indications, however I also included an second output with the central frequencies of each band as I think it is useful and may help to the user to save time.