asteroid-team / torch-audiomentations

Fast audio data augmentation in PyTorch. Inspired by audiomentations. Useful for deep learning.
MIT License

There's an off-by-one error in AddBackgroundNoise that we need to fix before v0.5.0 can be released #62

Closed: iver56 closed this issue 3 years ago

iver56 commented 3 years ago

There's an off-by-one error in AddBackgroundNoise that we need to fix before v0.5.0 can be released:

============================================================= FAILURES ==============================================================
____________________________ TestAddBackgroundNoise.test_background_noise_guaranteed_with_batched_tensor ____________________________

self = <tests.test_background_noise.TestAddBackgroundNoise testMethod=test_background_noise_guaranteed_with_batched_tensor>

    def test_background_noise_guaranteed_with_batched_tensor(self):
        mixed_inputs = self.bg_noise_transform_guaranteed(
>           self.input_audios, self.sample_rate
        )

tests\test_background_noise.py:85:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
..\..\anaconda3\envs\torch-audiomentations-gpu\lib\site-packages\torch\nn\modules\module.py:722: in _call_impl
    result = self.forward(*input, **kwargs)
torch_audiomentations\core\transforms_interface.py:164: in forward
    self.randomize_parameters(selected_samples, sample_rate)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = AddBackgroundNoise()
selected_samples = tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0062, 0.0042]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0...0077, 0.0062, 0.0042]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0062, 0.0042]]],
       dtype=torch.float64)
sample_rate = 16000

    def randomize_parameters(
        self, selected_samples: torch.Tensor, sample_rate: int = None
    ):
        """

        :params selected_samples: (batch_size, num_channels, num_samples)
        """

        batch_size, _, num_samples = selected_samples.shape

        # (batch_size, num_samples) RMS-normalized background noise
        audio = self.audio if hasattr(self, "audio") else Audio(sample_rate, mono=True)
        self.transform_parameters["background"] = torch.stack(
>           [self.random_background(audio, num_samples) for _ in range(batch_size)]
        )
E       RuntimeError: stack expects each tensor to be equal size, but got [1, 140544] at entry 0 and [1, 140545] at entry 1

torch_audiomentations\augmentations\background_noise.py:113: RuntimeError
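
For context, this failure reproduces standalone: if one of the per-example backgrounds returned by `random_background` comes back a single sample too long, `torch.stack` raises exactly this error. A minimal sketch (the tensors below are placeholders for real background audio, and the crop is an illustrative guard, not necessarily the right fix):

```python
import torch

num_samples = 140544

# Stand-ins for per-example backgrounds; the second one is a single
# sample too long, as in the failure above.
backgrounds = [torch.zeros(1, num_samples), torch.zeros(1, num_samples + 1)]

try:
    torch.stack(backgrounds)
except RuntimeError as e:
    print(e)  # stack expects each tensor to be equal size ...

# Illustrative guard: crop every background to exactly num_samples
# before stacking, so a one-sample overshoot cannot crash the stack.
stacked = torch.stack([b[..., :num_samples] for b in backgrounds])
print(stacked.shape)  # torch.Size([2, 1, 140544])
```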
____________________________ TestAddBackgroundNoise.test_background_noise_guaranteed_with_single_tensor _____________________________

self = <tests.test_background_noise.TestAddBackgroundNoise testMethod=test_background_noise_guaranteed_with_single_tensor>

    def test_background_noise_guaranteed_with_single_tensor(self):
        mixed_input = self.bg_noise_transform_guaranteed(
>           self.input_audio, self.sample_rate
        )

tests\test_background_noise.py:77:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
..\..\anaconda3\envs\torch-audiomentations-gpu\lib\site-packages\torch\nn\modules\module.py:722: in _call_impl
    result = self.forward(*input, **kwargs)
torch_audiomentations\core\transforms_interface.py:168: in forward
    ] = self.apply_transform(selected_samples, sample_rate)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = AddBackgroundNoise()
selected_samples = tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0062, 0.0042]]],
       dtype=torch.float64), sample_rate = 16000

    def apply_transform(self, selected_samples: torch.Tensor, sample_rate: int = None):

        batch_size, num_channels, num_samples = selected_samples.shape

        # (batch_size, num_samples)
        background = self.transform_parameters["background"].to(selected_samples.device)

        # (batch_size, num_channels)
        background_rms = calculate_rms(selected_samples) / (
            10 ** (self.transform_parameters["snr_in_db"].unsqueeze(dim=-1) / 20)
        )

        return selected_samples + background_rms.unsqueeze(-1) * background.view(
>           batch_size, 1, num_samples
        ).expand(-1, num_channels, -1)
E       RuntimeError: shape '[1, 1, 140544]' is invalid for input of size 140545

torch_audiomentations\augmentations\background_noise.py:134: RuntimeError
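
Here the same one-sample overshoot surfaces at mix time instead: a stored background of 140545 samples cannot be viewed as `(1, 1, 140544)`. A standalone repro, with a trim shown purely for demonstration (fixing the generation side is the real goal):

```python
import torch

batch_size, num_channels, num_samples = 1, 1, 140544
background = torch.zeros(batch_size, num_samples + 1)  # one sample too long

try:
    background.view(batch_size, 1, num_samples)
except RuntimeError as e:
    print(e)  # shape '[1, 1, 140544]' is invalid for input of size 140545

# Illustrative trim to the expected length before reshaping:
mixed_view = background[..., :num_samples].view(batch_size, 1, num_samples)
print(mixed_view.expand(-1, num_channels, -1).shape)  # torch.Size([1, 1, 140544])
```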
_______________________________________ TestAddBackgroundNoise.test_varying_snr_within_batch ________________________________________

self = <tests.test_background_noise.TestAddBackgroundNoise testMethod=test_varying_snr_within_batch>

    def test_varying_snr_within_batch(self):
        min_snr_in_db = 3
        max_snr_in_db = 30
        augment = AddBackgroundNoise(
            self.bg_path, min_snr_in_db=3, max_snr_in_db=30, p=1.0
        )
>       augmented_audios = augment(self.input_audios, self.sample_rate)

tests\test_background_noise.py:105:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
..\..\anaconda3\envs\torch-audiomentations-gpu\lib\site-packages\torch\nn\modules\module.py:722: in _call_impl
    result = self.forward(*input, **kwargs)
torch_audiomentations\core\transforms_interface.py:164: in forward
    self.randomize_parameters(selected_samples, sample_rate)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = AddBackgroundNoise()
selected_samples = tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0062, 0.0042]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0...0077, 0.0062, 0.0042]],

        [[0.0000, 0.0000, 0.0000,  ..., 0.0077, 0.0062, 0.0042]]],
       dtype=torch.float64)
sample_rate = 16000

    def randomize_parameters(
        self, selected_samples: torch.Tensor, sample_rate: int = None
    ):
        """

        :params selected_samples: (batch_size, num_channels, num_samples)
        """

        batch_size, _, num_samples = selected_samples.shape

        # (batch_size, num_samples) RMS-normalized background noise
        audio = self.audio if hasattr(self, "audio") else Audio(sample_rate, mono=True)
        self.transform_parameters["background"] = torch.stack(
>           [self.random_background(audio, num_samples) for _ in range(batch_size)]
        )
E       RuntimeError: stack expects each tensor to be equal size, but got [1, 140544] at entry 0 and [1, 140545] at entry 2

torch_audiomentations\augmentations\background_noise.py:113: RuntimeError
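
Aside from the crash, the SNR math this test exercises is straightforward: each batch entry gets its own `snr_in_db` drawn from `[min_snr_in_db, max_snr_in_db] = [3, 30]`, and the RMS-normalized background is scaled to `signal_rms / 10 ** (snr_in_db / 20)`. A hedged, self-contained sketch of the same scaling, mirroring the mixing line in `apply_transform` above (white noise stands in for real background audio, and `calculate_rms` is inlined):

```python
import torch

sample_rate = 16000
batch_size, num_channels, num_samples = 2, 1, sample_rate

signal = torch.randn(batch_size, num_channels, num_samples, dtype=torch.float64)

# (batch_size, num_samples) RMS-normalized background noise, as produced
# by randomize_parameters
background = torch.randn(batch_size, num_samples, dtype=torch.float64)
background = background / background.pow(2).mean(dim=-1, keepdim=True).sqrt()

# One SNR per batch entry, drawn from [3, 30] dB in this test
snr_in_db = torch.tensor([3.0, 30.0], dtype=torch.float64)

# (batch_size, num_channels), equivalent to calculate_rms(signal)
signal_rms = signal.pow(2).mean(dim=-1).sqrt()
background_rms = signal_rms / (10 ** (snr_in_db.unsqueeze(-1) / 20))

# Same mixing expression as apply_transform in the traceback above
mixed = signal + background_rms.unsqueeze(-1) * background.view(
    batch_size, 1, num_samples
).expand(-1, num_channels, -1)
print(mixed.shape)  # torch.Size([2, 1, 16000])
```

With an SNR of 20 dB, for example, the background is scaled to one tenth of the signal's RMS, since `10 ** (20 / 20) == 10`.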
____________________________________________ test_transform_is_differentiable[augment0] _____________________________________________

augment = AddBackgroundNoise()

    @pytest.mark.parametrize(
        "augment",
        [
            # Differentiable transforms:
            AddBackgroundNoise(BG_NOISE_PATH, 20, p=1.0),
            ApplyImpulseResponse(IR_PATH, p=1.0),
            Compose(
                transforms=[
                    Gain(min_gain_in_db=-15.0, max_gain_in_db=5.0, p=1.0),
                    PolarityInversion(p=1.0),
                ]
            ),
            Gain(min_gain_in_db=-6.000001, max_gain_in_db=-6, p=1.0),
            PolarityInversion(p=1.0),
            Shift(p=1.0),
            # Non-differentiable transforms:
            # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation:
            # [torch.DoubleTensor [1, 5]], which is output 0 of IndexBackward, is at version 1; expected version 0 instead.
            # Hint: enable anomaly detection to find the operation that failed to compute its gradient,
            # with torch.autograd.set_detect_anomaly(True).
            pytest.param(
                PeakNormalization(p=1.0), marks=pytest.mark.skip("Not differentiable")
            ),
        ],
    )
    def test_transform_is_differentiable(augment):
        sample_rate = 16000
        # Note: using float64 dtype to be compatible with AddBackgroundNoise fixtures
        samples = torch.tensor(
            [[1.0, 0.5, -0.25, -0.125, 0.0]], dtype=torch.float64
        ).unsqueeze(1)
        samples_cpy = deepcopy(samples)

        # We are going to convert the input tensor to a nn.Parameter so that we can
        # track the gradients with respect to it. We'll "optimize" the input signal
        # to be closer to that after the augmentation to test differentiability
        # of the transform. If the signal got changed in any way, and the test
        # didn't crash, it means it works.
        samples = torch.nn.Parameter(samples)
        optim = SGD([samples], lr=1.0)
        for i in range(10):
            optim.zero_grad()
>           transformed = augment(samples=samples, sample_rate=sample_rate)

tests\test_differentiable.py:64:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
..\..\anaconda3\envs\torch-audiomentations-gpu\lib\site-packages\torch\nn\modules\module.py:722: in _call_impl
    result = self.forward(*input, **kwargs)
torch_audiomentations\core\transforms_interface.py:168: in forward
    ] = self.apply_transform(selected_samples, sample_rate)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = AddBackgroundNoise()
selected_samples = tensor([[[ 1.0000,  0.5000, -0.2500, -0.1250,  0.0000]]], dtype=torch.float64,
       grad_fn=<IndexBackward>)
sample_rate = 16000

    def apply_transform(self, selected_samples: torch.Tensor, sample_rate: int = None):

        batch_size, num_channels, num_samples = selected_samples.shape

        # (batch_size, num_samples)
        background = self.transform_parameters["background"].to(selected_samples.device)

        # (batch_size, num_channels)
        background_rms = calculate_rms(selected_samples) / (
            10 ** (self.transform_parameters["snr_in_db"].unsqueeze(dim=-1) / 20)
        )

        return selected_samples + background_rms.unsqueeze(-1) * background.view(
>           batch_size, 1, num_samples
        ).expand(-1, num_channels, -1)
E       RuntimeError: shape '[1, 1, 5]' is invalid for input of size 6

torch_audiomentations\augmentations\background_noise.py:134: RuntimeError
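
Worth noting: even with a 5-sample input, the stored background comes out at 6 samples, again exactly one extra, which points at a genuine off-by-one rather than rounding drift proportional to length. Once fixed, a regression check along the lines of the existing tests could assert shape preservation across lengths. A hedged sketch (the path below is a placeholder, and the positional arguments mirror the `AddBackgroundNoise(BG_NOISE_PATH, 20, p=1.0)` call in the parametrized test above):

```python
import torch
from torch_audiomentations import AddBackgroundNoise

# Placeholder: point this at a folder of background noise files,
# as the test fixtures above do
bg_noise_path = "tests/fixtures/bg_noise"

augment = AddBackgroundNoise(bg_noise_path, 20, p=1.0)
for num_samples in (5, 140544, 140545):
    samples = torch.rand(3, 1, num_samples, dtype=torch.float64)
    mixed = augment(samples=samples, sample_rate=16000)
    # The fix should guarantee the output shape matches the input
    # shape for any num_samples, tiny or large
    assert mixed.shape == samples.shape
```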

Originally posted by @iver56 in https://github.com/asteroid-team/torch-audiomentations/issues/61#issuecomment-739934066