import math import unittest import torch import torchaudio import torchaudio.functional as F import pytest from . import common_utils from .functional_impl import Lfilter def random_float_tensor(seed, size, a=22695477, c=1, m=2 ** 32): """ Generates random tensors given a seed and size https://en.wikipedia.org/wiki/Linear_congruential_generator X_{n + 1} = (a * X_n + c) % m Using Borland C/C++ values The tensor will have values between [0,1) Inputs: seed (int): an int size (Tuple[int]): the size of the output tensor a (int): the multiplier constant to the generator c (int): the additive constant to the generator m (int): the modulus constant to the generator """ num_elements = 1 for s in size: num_elements *= s arr = [(a * seed + c) % m] for i in range(num_elements - 1): arr.append((a * arr[i] + c) % m) return torch.tensor(arr).float().view(size) / m class TestLFilterFloat32(Lfilter, common_utils.PytorchTestCase): dtype = torch.float32 device = torch.device('cpu') class TestLFilterFloat64(Lfilter, common_utils.PytorchTestCase): dtype = torch.float64 device = torch.device('cpu') class TestComputeDeltas(common_utils.TorchaudioTestCase): """Test suite for correctness of compute_deltas""" def test_one_channel(self): specgram = torch.tensor([[[1.0, 2.0, 3.0, 4.0]]]) expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5]]]) computed = F.compute_deltas(specgram, win_length=3) torch.testing.assert_allclose(computed, expected) def test_two_channels(self): specgram = torch.tensor([[[1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 3.0, 4.0]]]) expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5], [0.5, 1.0, 1.0, 0.5]]]) computed = F.compute_deltas(specgram, win_length=3) torch.testing.assert_allclose(computed, expected) def _compare_estimate(sound, estimate, atol=1e-6, rtol=1e-8): # trim sound for case when constructed signal is shorter than original sound = sound[..., :estimate.size(-1)] torch.testing.assert_allclose(estimate, sound, atol=atol, rtol=rtol) def _test_istft_is_inverse_of_stft(kwargs): # generates a random sound signal for each tril and then does the stft/istft # operation to check whether we can reconstruct signal for data_size in [(2, 20), (3, 15), (4, 10)]: for i in range(100): sound = random_float_tensor(i, data_size) stft = torch.stft(sound, **kwargs) estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) _compare_estimate(sound, estimate) class TestIstft(common_utils.TorchaudioTestCase): """Test suite for correctness of istft with various input""" number_of_trials = 100 def test_istft_is_inverse_of_stft1(self): # hann_window, centered, normalized, onesided kwargs1 = { 'n_fft': 12, 'hop_length': 4, 'win_length': 12, 'window': torch.hann_window(12), 'center': True, 'pad_mode': 'reflect', 'normalized': True, 'onesided': True, } _test_istft_is_inverse_of_stft(kwargs1) def test_istft_is_inverse_of_stft2(self): # hann_window, centered, not normalized, not onesided kwargs2 = { 'n_fft': 12, 'hop_length': 2, 'win_length': 8, 'window': torch.hann_window(8), 'center': True, 'pad_mode': 'reflect', 'normalized': False, 'onesided': False, } _test_istft_is_inverse_of_stft(kwargs2) def test_istft_is_inverse_of_stft3(self): # hamming_window, centered, normalized, not onesided kwargs3 = { 'n_fft': 15, 'hop_length': 3, 'win_length': 11, 'window': torch.hamming_window(11), 'center': True, 'pad_mode': 'constant', 'normalized': True, 'onesided': False, } _test_istft_is_inverse_of_stft(kwargs3) def test_istft_is_inverse_of_stft4(self): # hamming_window, not centered, not normalized, onesided # window same size as n_fft kwargs4 = { 'n_fft': 5, 'hop_length': 2, 'win_length': 5, 'window': torch.hamming_window(5), 'center': False, 'pad_mode': 'constant', 'normalized': False, 'onesided': True, } _test_istft_is_inverse_of_stft(kwargs4) def test_istft_is_inverse_of_stft5(self): # hamming_window, not centered, not normalized, not onesided # window same size as n_fft kwargs5 = { 'n_fft': 3, 'hop_length': 2, 'win_length': 3, 'window': torch.hamming_window(3), 'center': False, 'pad_mode': 'reflect', 'normalized': False, 'onesided': False, } _test_istft_is_inverse_of_stft(kwargs5) def test_istft_of_ones(self): # stft = torch.stft(torch.ones(4), 4) stft = torch.tensor([ [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]], [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]], [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]] ]) estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) _compare_estimate(torch.ones(4), estimate) def test_istft_of_zeros(self): # stft = torch.stft(torch.zeros(4), 4) stft = torch.zeros((3, 5, 2)) estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) _compare_estimate(torch.zeros(4), estimate) def test_istft_requires_overlap_windows(self): # the window is size 1 but it hops 20 so there is a gap which throw an error stft = torch.zeros((3, 5, 2)) self.assertRaises(RuntimeError, torchaudio.functional.istft, stft, n_fft=4, hop_length=20, win_length=1, window=torch.ones(1)) def test_istft_requires_nola(self): stft = torch.zeros((3, 5, 2)) kwargs_ok = { 'n_fft': 4, 'win_length': 4, 'window': torch.ones(4), } kwargs_not_ok = { 'n_fft': 4, 'win_length': 4, 'window': torch.zeros(4), } # A window of ones meets NOLA but a window of zeros does not. This should # throw an error. torchaudio.functional.istft(stft, **kwargs_ok) self.assertRaises(RuntimeError, torchaudio.functional.istft, stft, **kwargs_not_ok) def test_istft_requires_non_empty(self): self.assertRaises(RuntimeError, torchaudio.functional.istft, torch.zeros((3, 0, 2)), 2) self.assertRaises(RuntimeError, torchaudio.functional.istft, torch.zeros((0, 3, 2)), 2) def _test_istft_of_sine(self, amplitude, L, n): # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype()) sound = amplitude * torch.sin(2 * math.pi / L * x * n) # stft = torch.stft(sound, L, hop_length=L, win_length=L, # window=torch.ones(L), center=False, normalized=False) stft = torch.zeros((L // 2 + 1, 2, 2)) stft_largest_val = (amplitude * L) / 2.0 if n < stft.size(0): stft[n, :, 1] = -stft_largest_val if 0 <= L - n < stft.size(0): # symmetric about L // 2 stft[L - n, :, 1] = stft_largest_val estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L, window=torch.ones(L), center=False, normalized=False) # There is a larger error due to the scaling of amplitude _compare_estimate(sound, estimate, atol=1e-3) def test_istft_of_sine(self): self._test_istft_of_sine(amplitude=123, L=5, n=1) self._test_istft_of_sine(amplitude=150, L=5, n=2) self._test_istft_of_sine(amplitude=111, L=5, n=3) self._test_istft_of_sine(amplitude=160, L=7, n=4) self._test_istft_of_sine(amplitude=145, L=8, n=5) self._test_istft_of_sine(amplitude=80, L=9, n=6) self._test_istft_of_sine(amplitude=99, L=10, n=7) def _test_linearity_of_istft(self, data_size, kwargs, atol=1e-6, rtol=1e-8): for i in range(self.number_of_trials): tensor1 = random_float_tensor(i, data_size) tensor2 = random_float_tensor(i * 2, data_size) a, b = torch.rand(2) istft1 = torchaudio.functional.istft(tensor1, **kwargs) istft2 = torchaudio.functional.istft(tensor2, **kwargs) istft = a * istft1 + b * istft2 estimate = torchaudio.functional.istft(a * tensor1 + b * tensor2, **kwargs) _compare_estimate(istft, estimate, atol, rtol) def test_linearity_of_istft1(self): # hann_window, centered, normalized, onesided kwargs1 = { 'n_fft': 12, 'window': torch.hann_window(12), 'center': True, 'pad_mode': 'reflect', 'normalized': True, 'onesided': True, } data_size = (2, 7, 7, 2) self._test_linearity_of_istft(data_size, kwargs1) def test_linearity_of_istft2(self): # hann_window, centered, not normalized, not onesided kwargs2 = { 'n_fft': 12, 'window': torch.hann_window(12), 'center': True, 'pad_mode': 'reflect', 'normalized': False, 'onesided': False, } data_size = (2, 12, 7, 2) self._test_linearity_of_istft(data_size, kwargs2) def test_linearity_of_istft3(self): # hamming_window, centered, normalized, not onesided kwargs3 = { 'n_fft': 12, 'window': torch.hamming_window(12), 'center': True, 'pad_mode': 'constant', 'normalized': True, 'onesided': False, } data_size = (2, 12, 7, 2) self._test_linearity_of_istft(data_size, kwargs3) def test_linearity_of_istft4(self): # hamming_window, not centered, not normalized, onesided kwargs4 = { 'n_fft': 12, 'window': torch.hamming_window(12), 'center': False, 'pad_mode': 'constant', 'normalized': False, 'onesided': True, } data_size = (2, 7, 3, 2) self._test_linearity_of_istft(data_size, kwargs4, atol=1e-5, rtol=1e-8) class TestDetectPitchFrequency(common_utils.TorchaudioTestCase): backend = 'default' def test_pitch(self): test_filepath_100 = common_utils.get_asset_path("100Hz_44100Hz_16bit_05sec.wav") test_filepath_440 = common_utils.get_asset_path("440Hz_44100Hz_16bit_05sec.wav") # Files from https://www.mediacollege.com/audio/tone/download/ tests = [ (test_filepath_100, 100), (test_filepath_440, 440), ] for filename, freq_ref in tests: waveform, sample_rate = torchaudio.load(filename) freq = torchaudio.functional.detect_pitch_frequency(waveform, sample_rate) threshold = 1 s = ((freq - freq_ref).abs() > threshold).sum() self.assertFalse(s) class TestDB_to_amplitude(common_utils.TorchaudioTestCase): def test_DB_to_amplitude(self): # Make some noise x = torch.rand(1000) spectrogram = torchaudio.transforms.Spectrogram() spec = spectrogram(x) amin = 1e-10 ref = 1.0 db_multiplier = math.log10(max(amin, ref)) # Waveform amplitude -> DB -> amplitude multiplier = 20. power = 0.5 db = F.amplitude_to_DB(torch.abs(x), multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) torch.testing.assert_allclose(x2, torch.abs(x), atol=5e-5, rtol=1e-5) # Spectrogram amplitude -> DB -> amplitude db = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) torch.testing.assert_allclose(x2, spec, atol=5e-5, rtol=1e-5) # Waveform power -> DB -> power multiplier = 10. power = 1. db = F.amplitude_to_DB(x, multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) torch.testing.assert_allclose(x2, torch.abs(x), atol=5e-5, rtol=1e-5) # Spectrogram power -> DB -> power db = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) torch.testing.assert_allclose(x2, spec, atol=5e-5, rtol=1e-5) @pytest.mark.parametrize('complex_tensor', [ torch.randn(1, 2, 1025, 400, 2), torch.randn(1025, 400, 2) ]) @pytest.mark.parametrize('power', [1, 2, 0.7]) def test_complex_norm(complex_tensor, power): expected_norm_tensor = complex_tensor.pow(2).sum(-1).pow(power / 2) norm_tensor = F.complex_norm(complex_tensor, power) torch.testing.assert_allclose(norm_tensor, expected_norm_tensor, atol=1e-5, rtol=1e-5) @pytest.mark.parametrize('specgram', [ torch.randn(2, 1025, 400), torch.randn(1, 201, 100) ]) @pytest.mark.parametrize('mask_param', [100]) @pytest.mark.parametrize('mask_value', [0., 30.]) @pytest.mark.parametrize('axis', [1, 2]) def test_mask_along_axis(specgram, mask_param, mask_value, axis): mask_specgram = F.mask_along_axis(specgram, mask_param, mask_value, axis) other_axis = 1 if axis == 2 else 2 masked_columns = (mask_specgram == mask_value).sum(other_axis) num_masked_columns = (masked_columns == mask_specgram.size(other_axis)).sum() num_masked_columns //= mask_specgram.size(0) assert mask_specgram.size() == specgram.size() assert num_masked_columns < mask_param @pytest.mark.parametrize('mask_param', [100]) @pytest.mark.parametrize('mask_value', [0., 30.]) @pytest.mark.parametrize('axis', [2, 3]) def test_mask_along_axis_iid(mask_param, mask_value, axis): torch.random.manual_seed(42) specgrams = torch.randn(4, 2, 1025, 400) mask_specgrams = F.mask_along_axis_iid(specgrams, mask_param, mask_value, axis) other_axis = 2 if axis == 3 else 3 masked_columns = (mask_specgrams == mask_value).sum(other_axis) num_masked_columns = (masked_columns == mask_specgrams.size(other_axis)).sum(-1) assert mask_specgrams.size() == specgrams.size() assert (num_masked_columns < mask_param).sum() == num_masked_columns.numel() if __name__ == '__main__': unittest.main()