# Python source code of test_corrupt

# Licensed under the GPLv3 - see LICENSE
import io

import pytest
import numpy as np
from astropy import units as u
from astropy.time import Time

from ... import vdif
from ...data import SAMPLE_VDIF as SAMPLE_FILE


class TestCorruptSampleCopy:
    """Read back a tripled copy of the sample VDIF file after damaging it.

    ``setup_class`` writes the sample data three times into an in-memory
    VDIF stream and keeps both the raw bytes and the decoded data.  The
    tests drop whole frames or byte ranges from those bytes and check that
    the affected frames are read back as zeros while the rest is intact.

    The literals 20000 and 8 used when reshaping are the number of samples
    per frame and the number of threads per frame set of that file.
    """

    @classmethod
    def setup_class(cls):
        """Build the tripled sample file and store its bytes and metadata."""
        # Make a triply-long sample file - this since otherwise
        # things already fail at the determination of thread_ids.
        with vdif.open(SAMPLE_FILE, 'rs') as fs, \
                io.BytesIO() as s, \
                vdif.open(s, 'ws', header0=fs.header0, nthread=8) as fw:

            data = fs.read()
            for _ in range(3):
                fw.write(data)

            cls.data = np.concatenate([data, data, data])

            s.seek(0)
            cls.sample_bytes = s.read()
            cls.frame_nbytes = fw.header0.frame_nbytes
            cls.start_time = fw.start_time
            cls.stop_time = fw.tell('time')

    def _data_with_frames_zeroed(self, bad):
        """Return a copy of the reference data with frames ``bad`` zeroed.

        Parameters
        ----------
        bad : int, slice, or list of int
            Index into the frames (in file order) that should read as 0.

        Returns
        -------
        expected : `~numpy.ndarray`
            Array with the same ``(sample, thread)`` layout as ``self.data``.
        """
        # Get data in frame order to zero expected bad frames.
        expected = (self.data.copy().reshape(-1, 20000, 8)
                    .transpose(0, 2, 1).reshape(-1, 20000))
        expected[bad] = 0.
        # Back into regular order
        return (expected.reshape(-1, 8, 20000)
                .transpose(0, 2, 1).reshape(-1, 8))

    def test_sample_bytes(self, tmpdir):
        """The uncorrupted bytes should read back as the reference data."""
        test_file = str(tmpdir.join('test.vdif'))
        with open(test_file, 'wb') as fh:
            fh.write(self.sample_bytes)
        with vdif.open(test_file, 'rs') as fs:
            data = fs.read()
        assert np.all(data == self.data)

    # Have 6 framesets, so 48 frames.
    @pytest.mark.parametrize('missing', (
        36, slice(46, 48), [30, 45], slice(8, 16), 0, slice(4, 12)))
    def test_missing_frames(self, missing, tmpdir):
        """Purely missing frames should just be marked invalid."""
        # Even at the very start; gh-359
        # One row per frame, so that ``missing`` can index whole frames.
        sample = (np.frombuffer(self.sample_bytes, 'u1')
                  .reshape(-1, self.frame_nbytes))
        use = np.ones(len(sample), bool)
        use[missing] = False
        reduced = sample[use]
        corrupt_file = str(tmpdir.join('missing_frames.vdif'))
        with open(corrupt_file, 'wb') as s:
            s.write(reduced.tobytes())

        with vdif.open(corrupt_file, 'rb') as fr:
            assert 'number_of_frames' not in fr.info.warnings
            # A frame-set warning is expected only if the remaining
            # frames no longer make up a whole number of frame sets.
            if np.count_nonzero(use) % 8 == 0:
                assert 'number_of_framesets' not in fr.info.warnings
            else:
                assert 'number_of_framesets' in fr.info.warnings

        with vdif.open(corrupt_file, 'rs') as fh:
            with pytest.warns(UserWarning,
                              match='problem loading frame'):
                data = fh.read()

        expected = self._data_with_frames_zeroed(missing)

        assert np.all(expected == data)

    def expected_bad_frames(self, missing):
        """Return the range of frames expected to be bad for missing bytes.

        Parameters
        ----------
        missing : slice
            Byte range (``start`` and ``stop`` both set) removed from
            ``self.sample_bytes``.

        Returns
        -------
        bad_start, bad_stop : int
            Half-open range of frame indices touched by the missing bytes.
            The range is extended by one frame at the start if the first
            missing byte is within the first 32 bytes of a frame that is
            not the first of its frame set.
        """
        (start_f, start_r), (stop_f, _) = [
            divmod(s, self.frame_nbytes)
            for s in (missing.start, missing.stop-1)]

        # NOTE(review): 32 is presumably the header size in bytes, with
        # damage to a header also costing the preceding frame - confirm.
        if start_r < 32 and start_f % 8 != 0:
            start_f -= 1

        return start_f, stop_f+1

    @pytest.mark.parametrize('missing,expected_bad_start,expected_bad_stop', [
        (slice(50320, 50321), 9, 11),  # First byte of header of frame 10.
        (slice(50500, 50600), 10, 11),  # Part of payload of frame 10.
        (slice(60000, 70000), 11, 14),  # Parts of 11-13.
        (slice(75490, 75500), 14, 16),  # Part of header of frame 15.
        (slice(80511, 80512), 15, 16)])  # Last byte of last frame.
    def test_expected_bad_frames(self, missing, expected_bad_start,
                                 expected_bad_stop):
        """Sanity check of the ``expected_bad_frames`` helper itself."""
        bad_start, bad_stop = self.expected_bad_frames(missing)
        assert bad_start == expected_bad_start
        assert bad_stop == expected_bad_stop

    # Keep frames in first three frame sets intact for get_thread_ids()
    @pytest.mark.parametrize('missing', [
        (slice(5032*26, 5032*26+1)),  # First byte of header of frame 26.
        (slice(5032*26+50, 5032*26+60)),  # Part of payload of frame 26.
        (slice(5032*27+50, 5032*29+700)),  # Parts of 27-29
        (slice(5032*31+10, 5032*31+20)),  # Part of header of frame 31.
        (slice(5032*48-1, 5032*48))])  # Last byte of last frame.
    def test_missing_bytes(self, missing, tmpdir):
        """Missing bytes should invalidate only the frames they affect."""
        corrupted = (self.sample_bytes[:missing.start]
                     + self.sample_bytes[missing.stop:])
        bad_start, bad_stop = self.expected_bad_frames(missing)

        filename = str(tmpdir.join('corrupted.vdif'))
        with open(filename, 'wb') as fw:
            fw.write(corrupted)

        with vdif.open(filename, 'rb') as fr:
            assert 'number_of_frames' in fr.info.warnings

        # Check that bad frames are found with verify only.
        with vdif.open(filename, 'rs', verify=True) as fv:
            assert not fv.info.readable
            assert not fv.info.checks['continuous']
            assert 'continuous' in fv.info.errors
            # Reading will fail the frameset *before* the one tested.
            expected_msg = 'While reading at {}'.format(
                (bad_start // 8 - 1) * fv.samples_per_frame)
            assert expected_msg in fv.info.errors['continuous']

        # While only warnings are given when it is fixable.
        with vdif.open(filename, 'rs', verify='fix') as ff:
            assert ff.info.readable
            assert 'fixable' in ff.info.checks['continuous']
            assert 'continuous' in ff.info.warnings
            assert expected_msg in ff.info.warnings['continuous']
            assert 'problem loading frame' in ff.info.warnings['continuous']

        # Now check that data is properly marked as invalid.
        with vdif.open(filename, 'rs') as fr:
            assert fr.start_time == self.start_time
            assert fr.stop_time == self.stop_time
            with pytest.warns(UserWarning,
                              match='problem loading frame'):
                data = fr.read()

        expected = self._data_with_frames_zeroed(slice(bad_start, bad_stop))

        assert np.all(data == expected)


class TestCorruptFile:
    """Corruption tests on a small synthetic two-thread VDIF file.

    Each test writes a fresh file with ``fake_file``, removes a byte range
    from it with ``corrupt_copy``, and checks what a stream reader returns.
    """

    @classmethod
    def setup_class(cls):
        """Define the header, thread count, and data of one frame set."""
        header_values = dict(
            edv=1, time=Time('2010-11-12T13:14:15'), nchan=2, bps=2,
            complex_data=False, thread_id=0, samples_per_frame=16,
            station='me', sample_rate=2*u.kHz)
        cls.header0 = vdif.VDIFHeader.fromvalues(**header_values)
        cls.nthread = 2
        # 16 identical samples, each indexed by (thread, channel).
        cls.data = np.tile(np.array([[-1, 1], [-3, 3]]), (16, 1, 1))
        cls.frameset_nbytes = cls.nthread * cls.header0.frame_nbytes

    def fake_file(self, tmpdir, nframes=16):
        """Write ``self.data`` ``nframes`` times and return the file name."""
        path = str(tmpdir.join('fake.vdif'))
        with vdif.open(path, 'ws', header0=self.header0,
                       nthread=self.nthread) as writer:
            for _ in range(nframes):
                writer.write(self.data)
        return path

    def corrupt_copy(self, filename, missing):
        """Copy ``filename`` without the bytes in slice ``missing``.

        Returns the name of the copy, which has '_corrupt' inserted
        before the '.vdif' extension.
        """
        corrupt_name = filename.replace('.vdif', '_corrupt.vdif')
        with open(filename, 'rb') as source, \
                open(corrupt_name, 'wb') as target:
            # Everything up to the gap, then everything after it.
            target.write(source.read(missing.start))
            source.seek(missing.stop)
            target.write(source.read())
        return corrupt_name

    @pytest.mark.parametrize('frame_nr', [1, 3, 5, slice(7, 10)])
    def test_missing_frameset(self, frame_nr, tmpdir):
        """Wholly missing frame sets should read as zeros, with a warning."""
        if isinstance(frame_nr, slice):
            gap = frame_nr
        else:
            gap = slice(frame_nr, frame_nr+1)
        nbytes = self.frameset_nbytes
        missing = slice(gap.start * nbytes, gap.stop * nbytes)
        corrupt_file = self.corrupt_copy(self.fake_file(tmpdir), missing)
        with vdif.open(corrupt_file, 'rs') as fr, \
                pytest.warns(UserWarning, match='All threads'):
            data = fr.read()

        # One leading index per frame set.
        framesets = data.reshape(-1, *self.data.shape)
        intact_before = framesets[:gap.start]
        intact_after = framesets[gap.stop:]
        assert np.all(intact_before.astype(int) == self.data)
        assert np.all(intact_after.astype(int) == self.data)
        assert np.all(framesets[gap] == 0.)

    @pytest.mark.parametrize('frame_nr,thread', [
        (3, 0), (3, 1), (1, 1), (15, 1)])
    def test_missing_thread(self, frame_nr, thread, tmpdir):
        """A single missing frame should zero only that thread's data."""
        frame_nbytes = self.header0.frame_nbytes
        offset = (frame_nr * self.nthread + thread) * frame_nbytes
        missing = slice(offset, offset + frame_nbytes)
        corrupt_file = self.corrupt_copy(self.fake_file(tmpdir), missing)
        with vdif.open(corrupt_file, 'rs') as fr, \
                pytest.warns(UserWarning,
                             match='Thread.*{0}.*missing'.format(thread)):
            data = fr.read()

        framesets = data.reshape(-1, *self.data.shape)
        other = 1 - thread
        assert np.all(framesets[:frame_nr].astype(int) == self.data)
        assert np.all(framesets[frame_nr+1:].astype(int) == self.data)
        assert np.all(framesets[frame_nr, :, thread] == 0.)
        assert np.all(framesets[frame_nr, :, other].astype(int)
                      == self.data[:, other])

    # Byte ranges are relative to the start of the last frame set.
    @pytest.mark.parametrize('missing_bytes', [
        slice(0, 80),  # Whole last frame set.
        slice(0, 40),  # First thread of last frame set.
        slice(0, 32),  # First header of last frame set.
        slice(16, 32),  # Second half of that header.
        slice(0, 16),  # First half of that header.
        slice(0, 1),  # Single header byte.
        slice(10, 11),  # Single header byte.
        slice(15, 16),  # Single header byte.
        slice(20, 21),  # Single header byte.
        slice(23, 24),  # Single header byte.
    ])
    def test_missing_end(self, missing_bytes, tmpdir):
        """Damage to the last frame set should just shorten the stream."""
        # In all these cases, the data read should just be short.
        offset = 15 * self.frameset_nbytes
        missing = slice(missing_bytes.start + offset,
                        missing_bytes.stop + offset)
        corrupt_file = self.corrupt_copy(self.fake_file(tmpdir), missing)
        with vdif.open(corrupt_file, 'rs') as fr:
            assert fr.size == 15 * self.data.size
            data = fr.read()

        framesets = data.reshape(-1, *self.data.shape)
        assert len(framesets) == 15
        assert np.all(framesets.astype(int) == self.data)

    # Note: keep frame sets 0--2 intact for get_thread_ids().
    @pytest.mark.parametrize('missing_bytes,missing_data', [
        (slice(240, 320), slice(48, 64)),  # Remove frameset 3.
        (slice(279, 281), slice(48, 64)),  # Corrupt frameset 3.
        (slice(280, 281), slice(48, 64)),  # Corrupt frameset 3, thread 1.
        (slice(279, 280), slice(48, 64)),  # Corrupt frameset 3, thread 0.
        (slice(272, 365), slice(48, 80)),  # Corrupt framesets 3, 4
    ])
    def test_missing_middle(self, missing_bytes, missing_data, tmpdir):
        """Damage in the middle should zero the affected samples only."""
        # In all these cases, some data will be missing.
        corrupt_file = self.corrupt_copy(self.fake_file(tmpdir),
                                         missing_bytes)
        with vdif.open(corrupt_file, 'rs') as fr:
            assert fr.size == 16 * self.data.size
            with pytest.warns(UserWarning, match='problem loading frame set'):
                data = fr.read()

        # All 16 frame sets back to back, with the lost samples blanked.
        expected = np.tile(self.data, (16, 1, 1))
        expected[missing_data] = 0.
        assert np.all(data.astype(int) == expected)


class TestInvalidFrameHeaders:
    """Check handling of a frame set flagged invalid with a bogus header."""

    # CHIME VDIF files from ARO can have invalid frames included in
    # which the header information -- in particular the frame_nr and
    # seconds -- are corrupt as well.  Check that we can skip those.
    @classmethod
    def setup_class(cls):
        """Define the header, thread count, and data of one frame set."""
        header_values = dict(
            edv=1, time=Time('2010-11-12T13:14:15'), nchan=2, bps=2,
            complex_data=False, thread_id=0, samples_per_frame=16,
            station='me', sample_rate=2*u.kHz)
        cls.header0 = vdif.VDIFHeader.fromvalues(**header_values)
        cls.nthread = 2
        # 16 identical samples, each a 2x2 block of values.
        cls.data = np.tile(np.array([[-1, 1], [-3, 3]]), (16, 1, 1))
        cls.frameset_nbytes = cls.nthread * cls.header0.frame_nbytes

    def fake_file(self, tmpdir):
        """Write 16 frame sets, the one at index 10 invalid; return the name.

        The invalid frame set has ``frame_nr`` and ``seconds`` reset to 0
        and ``invalid_data`` set.
        """
        path = str(tmpdir.join('fake.vdif'))
        bad_index = 10
        with vdif.open(path, 'wb') as writer:
            for index in range(16):
                header = self.header0.copy()
                if index == bad_index:
                    header['frame_nr'] = 0
                    header['seconds'] = 0
                    header['invalid_data'] = True
                else:
                    header['frame_nr'] = index
                writer.write_frameset(self.data, header=header)

        return path

    @pytest.mark.parametrize('verify', ('fix', False))
    def test_invalid_frame_fix(self, verify, tmpdir):
        """With 'fix' or no verification, the invalid set reads as zeros."""
        with vdif.open(self.fake_file(tmpdir), 'rs', verify=verify) as fr:
            data = fr.read()

        # Build per-frame-set, blank the invalid one, then flatten to
        # a plain stream of samples.
        per_frameset = np.stack([self.data for _ in range(16)])
        per_frameset[10] = 0
        expected = per_frameset.reshape((-1,) + self.data.shape[1:])
        assert np.all(data.astype(int) == expected)

    def test_invalid_frame_nofix(self, tmpdir):
        """With strict verification, the bad frame number is an error."""
        path = self.fake_file(tmpdir)
        with pytest.raises(ValueError, match='wrong frame number'), \
                vdif.open(path, 'rs', verify=True) as fr:
            fr.read()