Closed nateraw closed 1 year ago
For those lurking...
def get_timesteps_arr(y, sr, duration=2, smooth=0.0, fps=30, margin=1.0):
    """Build a monotonically increasing array of interpolation timesteps in [0, 1],
    driven by the percussive energy of an audio signal.

    Args:
        y: audio time series (collapsed to mono internally).
        sr: sample rate of ``y``.
        duration: clip length in seconds; with ``fps`` it fixes the output length.
        smooth: blend factor in [0, 1] between the audio-driven curve (0.0)
            and a purely linear ramp (1.0).
        fps: video frames per second to generate timesteps for.
        margin: margin forwarded to ``librosa.decompose.hpss`` controlling how
            aggressively percussive content is separated.

    Returns:
        1-D numpy array of length ``int(duration * fps)`` rising from ~0 to 1.0.
    """
    y = librosa.to_mono(y)

    # librosa.stft defaults: win_length = n_fft, hop_length = win_length // 4.
    # Spelled out explicitly so the downstream frame count is predictable.
    D = librosa.stft(y, n_fft=2048, hop_length=2048 // 4, win_length=2048)

    # Keep only the percussive component; the harmonic part is unused.
    _, D_percussive = librosa.decompose.hpss(D, margin=margin)
    y_percussive = librosa.istft(D_percussive, length=len(y))

    # Per-frame percussive energy, min-max normalized to [0, 1].
    spec_raw = librosa.feature.melspectrogram(y=y_percussive, sr=sr)
    spec_max = np.amax(spec_raw, axis=0)
    spec_range = np.ptp(spec_max)
    n_frames = int(duration * fps)
    if spec_range == 0:
        # Degenerate input (e.g. silence): no percussive signal to follow;
        # fall back to a plain linear ramp instead of dividing by zero.
        return np.linspace(0.0, 1.0, n_frames)
    spec_norm = (spec_max - np.min(spec_max)) / spec_range

    # Cumulative energy, normalized so the curve ends at exactly 1.0.
    x_norm = np.linspace(0, spec_norm.shape[-1], spec_norm.shape[-1])
    y_norm = np.cumsum(spec_norm)
    y_norm /= y_norm[-1]

    # Interpolate (NOT np.resize) onto the desired number of frames so the
    # shape of the curve is preserved when rescaling to n_frames samples.
    x_resize = np.linspace(0, y_norm.shape[-1], n_frames)
    T = np.interp(x_resize, x_norm, y_norm)

    # Blend with a linear ramp: smooth=0 is fully audio-driven, smooth=1 linear.
    return T * (1 - smooth) + np.linspace(0.0, 1.0, T.shape[0]) * smooth
# Pull a 4-second sample beginning 2 seconds into the audio clip;
# at 25 fps that works out to 100 interpolation frames.
offset = 2
duration = 4
fps = 25
y, sr = librosa.load(audio_filepath, offset=offset, duration=duration)
T = get_timesteps_arr(y, sr, duration, fps=fps)
When calculating `T` for music videos, we're currently doing a `np.resize` to get to the size of `num_frames` we want to generate. This actually screws up the scaling. We need to interpolate instead to accurately rescale the array while maintaining its shape. Did some tests locally with this and it works great!! Much better than the current implementation.