import random

import cv2
import torch
import librosa
import numpy as np

from src.random_resized_crop import RandomResizedCrop

# Ask OpenCV not to use its own worker threads.
# NOTE(review): presumably to avoid thread oversubscription inside
# data-loading workers -- confirm against the training setup.
cv2.setNumThreads(0)


def image_crop(image, bbox):
    """Return the sub-array of ``image`` delimited by ``bbox``.

    Args:
        image: Array indexed as ``image[row, column]``.
        bbox: Indexable as ``(col_start, row_start, col_stop, row_stop)``.

    Returns:
        The slice ``image[row_start:row_stop, col_start:col_stop]``.
    """
    return image[bbox[1]:bbox[3], bbox[0]:bbox[2]]


def gauss_noise(image, sigma_sq):
    """Return ``image`` with zero-mean Gaussian noise added.

    Args:
        image: Array to perturb. Any shape is accepted; the noise is drawn
            with the same shape.
        sigma_sq: Forwarded to ``np.random.normal`` as ``scale``.
            NOTE(review): NumPy interprets ``scale`` as the standard
            deviation, not the variance the parameter name suggests. The
            value is used unchanged so existing configurations keep their
            behaviour.

    Returns:
        A new array ``image + noise``; the input is not modified.
    """
    noise = np.random.normal(0, sigma_sq, image.shape)
    return image + noise


# Source: https://www.kaggle.com/davids1992/specaugment-quick-implementation
def spec_augment(spec: np.ndarray,
                 num_mask=2,
                 freq_masking=0.15,
                 time_masking=0.20,
                 value=0):
    """Mask random bands along both axes of a 2-D spectrogram.

    Between 1 and ``num_mask`` rounds are applied. Each round overwrites one
    band of rows (axis 0) and one band of columns (axis 1) with ``value``.

    Args:
        spec: 2-D array; axis 0 is treated as frequency, axis 1 as time.
        num_mask: Upper bound (inclusive) on the number of masking rounds.
        freq_masking: Maximum fraction of rows masked in one round.
        time_masking: Maximum fraction of columns masked in one round.
        value: Fill value written into the masked bands.

    Returns:
        A masked copy of ``spec``; the input is not modified.
    """
    spec = spec.copy()
    # The shape does not change while masking, so read it once.
    all_freqs_num, all_frames_num = spec.shape

    for _ in range(random.randint(1, num_mask)):
        # Band along axis 0 (frequency).
        freq_percentage = random.uniform(0.0, freq_masking)
        num_freqs_to_mask = int(freq_percentage * all_freqs_num)
        f0 = int(np.random.uniform(
            low=0.0, high=all_freqs_num - num_freqs_to_mask))
        spec[f0:f0 + num_freqs_to_mask, :] = value

        # Band along axis 1 (time).
        time_percentage = random.uniform(0.0, time_masking)
        num_frames_to_mask = int(time_percentage * all_frames_num)
        t0 = int(np.random.uniform(
            low=0.0, high=all_frames_num - num_frames_to_mask))
        spec[:, t0:t0 + num_frames_to_mask] = value

    return spec


class SpecAugment:
    """Callable wrapper around :func:`spec_augment`.

    The masked regions are filled with the minimum value of the input.
    """

    def __init__(self, num_mask=2, freq_masking=0.15, time_masking=0.20):
        self.num_mask = num_mask
        self.freq_masking = freq_masking
        self.time_masking = time_masking

    def __call__(self, image):
        return spec_augment(image,
                            self.num_mask,
                            self.freq_masking,
                            self.time_masking,
                            image.min())


class Compose:
    """Apply a sequence of transforms in order.

    When ``trg`` is given, every transform is called as ``t(image, trg)`` and
    must return an ``(image, trg)`` pair; otherwise each is called as
    ``t(image)``.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, trg=None):
        if trg is None:
            for t in self.transforms:
                image = t(image)
            return image

        for t in self.transforms:
            image, trg = t(image, trg)
        return image, trg


class UseWithProb:
    """Apply ``transform`` with probability ``prob``, else pass through."""

    def __init__(self, transform, prob=.5):
        self.transform = transform
        self.prob = prob

    def __call__(self, image, trg=None):
        apply = random.random() < self.prob
        if trg is None:
            if apply:
                image = self.transform(image)
            return image

        if apply:
            image, trg = self.transform(image, trg)
        return image, trg


class OneOf:
    """Apply exactly one transform, picked at random on every call.

    Args:
        transforms: Sequence of candidate transforms.
        p: Optional selection probabilities, one per transform, forwarded to
            ``np.random.choice``. ``None`` selects uniformly.
    """

    def __init__(self, transforms, p=None):
        self.transforms = transforms
        self.p = p

    def __call__(self, image, trg=None):
        # Sample an index rather than the objects themselves, so NumPy never
        # has to coerce the transform instances into an array.
        index = np.random.choice(len(self.transforms), p=self.p)
        transform = self.transforms[index]

        if trg is None:
            return transform(image)
        return transform(image, trg)


class Flip:
    """Flip an image with ``cv2.flip`` using a fixed flip code (0 or 1)."""

    def __init__(self, flip_code):
        assert flip_code == 0 or flip_code == 1
        self.flip_code = flip_code

    def __call__(self, image):
        return cv2.flip(image, self.flip_code)


class HorizontalFlip(Flip):
    """``Flip`` with flip code 1."""

    def __init__(self):
        super().__init__(1)


class VerticalFlip(Flip):
    """``Flip`` with flip code 0."""

    def __init__(self):
        super().__init__(0)


class GaussNoise:
    """Add Gaussian noise whose scale is redrawn on every call.

    The scale passed to :func:`gauss_noise` is sampled uniformly from
    ``[0, sigma_sq)``. A non-positive ``sigma_sq`` disables the transform.
    """

    def __init__(self, sigma_sq):
        self.sigma_sq = sigma_sq

    def __call__(self, image):
        if self.sigma_sq > 0.0:
            image = gauss_noise(image, np.random.uniform(0, self.sigma_sq))
        return image


class RandomGaussianBlur:
    """Apply Gaussian blur with a random kernel size.

    Kernel width and height are drawn independently from the odd integers
    ``1, 3, ..., max_ksize``.

    Args:
        max_ksize (int): Maximal size of a kernel to apply, should be odd.
        sigma_x (int): Standard deviation passed to ``cv2.GaussianBlur``.
    """

    def __init__(self, max_ksize=5, sigma_x=20):
        assert max_ksize % 2 == 1, "max_ksize should be odd"
        # Stored as the exclusive upper bound for k in ``ksize = 2 * k + 1``.
        self.max_ksize = max_ksize // 2 + 1
        self.sigma_x = sigma_x

    def __call__(self, image):
        kernel_size = tuple(2 * np.random.randint(0, self.max_ksize, 2) + 1)
        blured_image = cv2.GaussianBlur(image, kernel_size, self.sigma_x)
        return blured_image


class ImageToTensor:
    """Convert a spectrogram array into a 3-channel float32 torch tensor.

    The channels, stacked along a new leading axis, are the input itself and
    its first- and second-order ``librosa.feature.delta`` features. The
    stacked array is cast to float32 and divided by 100.
    """

    def __call__(self, image):
        delta = librosa.feature.delta(image)
        accelerate = librosa.feature.delta(image, order=2)
        stacked = np.stack([image, delta, accelerate], axis=0)
        stacked = stacked.astype(np.float32) / 100
        return torch.from_numpy(stacked)


class RandomCrop:
    """Crop a random window of ``size`` columns along axis 1.

    Raises:
        ValueError: If the signal has fewer than ``size`` columns.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, signal):
        width = signal.shape[1]
        if width < self.size:
            # random.randint would fail with an opaque "empty range" error;
            # raise the same exception type with an actionable message.
            raise ValueError(
                "RandomCrop: signal width {} is smaller than crop size {}; "
                "pad the signal first".format(width, self.size))
        start = random.randint(0, width - self.size)
        return signal[:, start: start + self.size]


class CenterCrop:
    """Crop the central ``size`` columns along axis 1.

    Signals that are not wider than ``size`` are returned unchanged.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, signal):
        width = signal.shape[1]
        if width <= self.size:
            return signal
        start = (width - self.size) // 2
        return signal[:, start: start + self.size]


class PadToSize:
    """Pad a 2-D signal along axis 1 up to ``size`` columns.

    Padding is split between both sides, with the extra column (if any) going
    to the right. Signals already at least ``size`` wide are returned
    unchanged.

    Args:
        size: Target number of columns.
        mode: ``'constant'`` fills with the signal's minimum value;
            ``'wrap'`` uses ``np.pad``'s wrap mode.
    """

    def __init__(self, size, mode='constant'):
        assert mode in ['constant', 'wrap']
        self.size = size
        self.mode = mode

    def __call__(self, signal):
        if signal.shape[1] < self.size:
            padding = self.size - signal.shape[1]
            offset = padding // 2
            pad_width = ((0, 0), (offset, padding - offset))
            if self.mode == 'constant':
                signal = np.pad(signal, pad_width, 'constant',
                                constant_values=signal.min())
            else:
                signal = np.pad(signal, pad_width, 'wrap')
        return signal


def get_transforms(train, size,
                   wrap_pad_prob=0.5,
                   resize_scale=(0.8, 1.0),
                   resize_ratio=(1.7, 2.3),
                   resize_prob=0.33,
                   spec_num_mask=2,
                   spec_freq_masking=0.15,
                   spec_time_masking=0.20,
                   spec_prob=0.5):
    """Build the transform pipeline for training or evaluation.

    Args:
        train: If true, build the randomised training pipeline; otherwise
            the deterministic pad / center-crop pipeline.
        size: Target number of columns (axis 1) after padding and cropping.
        wrap_pad_prob: Probability of padding in ``'wrap'`` mode rather than
            ``'constant'`` mode (training only).
        resize_scale: ``scale`` argument for ``RandomResizedCrop``.
        resize_ratio: ``ratio`` argument for ``RandomResizedCrop``.
        resize_prob: Probability of applying ``RandomResizedCrop``.
        spec_num_mask: ``num_mask`` for ``SpecAugment``.
        spec_freq_masking: ``freq_masking`` for ``SpecAugment``.
        spec_time_masking: ``time_masking`` for ``SpecAugment``.
        spec_prob: Probability of applying ``SpecAugment``.

    Returns:
        A ``Compose`` instance ending in ``ImageToTensor``.
    """
    if train:
        transforms = Compose([
            OneOf([
                PadToSize(size, mode='wrap'),
                PadToSize(size, mode='constant'),
            ], p=[wrap_pad_prob, 1 - wrap_pad_prob]),
            RandomCrop(size),
            UseWithProb(
                RandomResizedCrop(scale=resize_scale, ratio=resize_ratio),
                prob=resize_prob
            ),
            UseWithProb(SpecAugment(num_mask=spec_num_mask,
                                    freq_masking=spec_freq_masking,
                                    time_masking=spec_time_masking),
                        spec_prob),
            ImageToTensor()
        ])
    else:
        transforms = Compose([
            PadToSize(size),
            CenterCrop(size),
            ImageToTensor()
        ])
    return transforms