Closed nateraw closed 1 year ago
For those lurking...
def get_timesteps_arr(y, sr, duration=2, smooth=0.0, fps=30, margin=1.0):
    """Build a monotonically increasing array of interpolation timesteps in [0, 1],
    driven by the percussive energy of an audio signal.

    Args:
        y: audio time series (collapsed to mono internally).
        sr: sample rate of ``y``.
        duration: clip length in seconds; with ``fps`` it fixes the output length.
        smooth: blend factor in [0, 1] between the audio-driven curve (0.0)
            and a purely linear ramp (1.0).
        fps: video frames per second to generate timesteps for.
        margin: margin forwarded to ``librosa.decompose.hpss`` controlling how
            aggressively percussive content is separated.

    Returns:
        1-D numpy array of length ``int(duration * fps)`` rising from ~0 to 1.0.
    """
    y = librosa.to_mono(y)

    # librosa.stft defaults: win_length = n_fft, hop_length = win_length // 4.
    # Spelled out explicitly so the downstream frame count is predictable.
    D = librosa.stft(y, n_fft=2048, hop_length=2048 // 4, win_length=2048)

    # Keep only the percussive component; the harmonic part is unused.
    _, D_percussive = librosa.decompose.hpss(D, margin=margin)
    y_percussive = librosa.istft(D_percussive, length=len(y))

    # Per-frame percussive energy, min-max normalized to [0, 1].
    spec_raw = librosa.feature.melspectrogram(y=y_percussive, sr=sr)
    spec_max = np.amax(spec_raw, axis=0)
    spec_range = np.ptp(spec_max)
    n_frames = int(duration * fps)
    if spec_range == 0:
        # Degenerate input (e.g. silence): no percussive signal to follow;
        # fall back to a plain linear ramp instead of dividing by zero.
        return np.linspace(0.0, 1.0, n_frames)
    spec_norm = (spec_max - np.min(spec_max)) / spec_range

    # Cumulative energy, normalized so the curve ends at exactly 1.0.
    x_norm = np.linspace(0, spec_norm.shape[-1], spec_norm.shape[-1])
    y_norm = np.cumsum(spec_norm)
    y_norm /= y_norm[-1]

    # Interpolate (NOT np.resize) onto the desired number of frames so the
    # shape of the curve is preserved when rescaling to n_frames samples.
    x_resize = np.linspace(0, y_norm.shape[-1], n_frames)
    T = np.interp(x_resize, x_norm, y_norm)

    # Blend with a linear ramp: smooth=0 is fully audio-driven, smooth=1 linear.
    return T * (1 - smooth) + np.linspace(0.0, 1.0, T.shape[0]) * smooth
# Pull a 4-second sample beginning 2 seconds into the audio clip;
# at 25 fps that works out to 100 interpolation frames.
offset = 2
duration = 4
fps = 25
y, sr = librosa.load(audio_filepath, offset=offset, duration=duration)
T = get_timesteps_arr(y, sr, duration, fps=fps)
When calculating `T` for music videos, we're currently doing a `np.resize` to get to the size of `num_frames` we want to generate. This actually screws up the scaling. We need to interpolate instead to accurately rescale the array while maintaining its shape. Did some tests locally with this and it works great!! Much better than the current implementation.