import sys, os, csv, glob, random, threading, time, enum import numpy as np import librosa as lr import youtube_dl from utils.printy import print_err, print_prog #sys.path.insert(0, '../../download') from playlist import parse_list, Header from annotation import write_voiceactivity from subtitles import parse_subtitles def anno_from_file(path, n, sr, ext=''): labels = np.zeros((n,), dtype=np.int32) with open('{}{}~'.format(path, ext), newline='') as fp: rows = csv.reader(fp, delimiter=';') for row in rows: start = int(round(float(row[0]) * sr)) stop = int(round(float(row[1]) * sr)) labels[start:stop] = 1 return labels def anno_to_file(path, labels, sr): time = 0 step = 1/sr in_label = labels[0] == 1 onset = 0 count = 0 with open('{}~'.format(path), 'w', encoding='latin-1') as fp: for i in range(len(labels)): if in_label and labels[i] == 0: count += 1 fp.write('{};{};0;1.0\n'.format(onset, time)) in_label = False elif not in_label and labels[i] == 1: onset = time in_label = True time += step if in_label: count += 1 fp.write('{};{};0;1.0\n'.format(onset, time)) with open('{}'.format(path), 'w', encoding='latin-1') as fp: fp.write('<?xml version="1.0" ?>\n<annotation ssi-v="3">\n\t<info ftype="ASCII" size="{}" />\n\t<meta role="subtitles" annotator="system" />\n\t<scheme name="voiceactivity" type="DISCRETE" color="#FFDDD9C3">\n\t\t<item name="VOICE" id="0" color="#FF494429" />\n\t</scheme>\n</annotation>\n'.format(count)) def audio_dur(path, ext='', root=''): path = os.path.join(root, '{}{}'.format(path, ext)) try: return lr.get_duration(filename=path) except Exception as ex: print_err('could not read {}\n{}'.format(path, ex)) return 0 def audio_from_file(path, sr, ext='', root='', offset=0.0, duration=None): path = os.path.join(root, '{}{}'.format(path, ext)) try: audio, _ = lr.load(path, sr=sr, mono=True, offset=offset, duration=duration, dtype=np.float32, res_type='kaiser_fast') audio.shape = (-1,1) return audio except ValueError as ex: print_err('value error {}\n{}'.format(path, ex)) return [] except Exception as ex: print_err('could not read {}\n{}'.format(path, ex)) return None def audio_from_files(paths, sr, ext='', root='', duration=None, shuffle=False): if shuffle: paths = list(paths) random.shuffle(paths) audios = [] for path in paths: if duration: dur = audio_dur(path, ext=ext, root=root) audio = audio_from_file(path, sr, ext=ext, root=root, duration=duration if dur > duration else None) if audio is None: continue if len(audio): audios.append(audio) duration -= dur if duration <= 0: break else: audio = audio_from_file(path, sr, ext=ext, root=root) if audio is None: continue if len(audio): audios.append(audio) return np.concatenate(audios) if audios else None def audio_to_file(path, x, sr): lr.output.write_wav(path, x.reshape(-1), sr, norm=False) def audio_to_frames(x, n_frame, n_step=None): if n_step is None: n_step = n_frame if len(x.shape) == 1: x.shape = (-1,1) n_overlap = n_frame - n_step n_frames = (x.shape[0] - n_overlap) // n_step n_keep = n_frames * n_step + n_overlap strides = list(x.strides) strides[0] = strides[1] * n_step return np.lib.stride_tricks.as_strided(x[0:n_keep,:], (n_frames,n_frame), strides) def sample_from_file(path, sr, n_frame, n_step, audio_ext='.m4a', anno_ext='.annotation', balance=None): audio_path = '{}{}'.format(path, audio_ext) anno_path = '{}{}'.format(path, anno_ext) if not os.path.exists(audio_path) or not os.path.exists(anno_path): print_err('file not found {}[{},{}]'.format(path, audio_ext, anno_ext)) return None audio = audio_from_file(audio_path, sr) if audio is None or audio.size == 0: return None frames = audio_to_frames(audio, n_frame, n_step) labels = anno_from_file(anno_path, frames.shape[0], sr/n_step) if balance: select = get_balance_indices(labels, balance) frames = frames[select,:] labels = labels[select] return frames, labels def sample_from_url(url, sr, n_frame, n_step, balance=None): class MyLogger(object): def debug(self, msg): pass def warning(self, msg): pass def error(self, msg): print_err(msg) ydl_opts = { 'ignoreerrors' : True, 'outtmpl': r'tmp\%(id)s.%(ext)s', 'format': 'bestaudio/best', 'writesubtitles' : True, 'logger' : MyLogger() } with youtube_dl.YoutubeDL(ydl_opts) as ydl: info_dict = ydl.extract_info(url, download=False) if not info_dict: print_prog('skip {} [not found]'.format(url)) return None if not 'subtitles' in info_dict or not info_dict['subtitles']: print_prog('skip {} [no subtitles]') return None audio_ext = '.' + info_dict['ext'] anno_ext = '.annotation' path = os.path.join('tmp', info_dict['id']) print_prog('downloading {}'.format(path)) ydl.download((url,)) result = None if os.path.exists(path + audio_ext): sub_path = glob.glob(r'{}.*.*'.format(path)) if sub_path: print_prog('parsing {}'.format(path)) subs = parse_subtitles(sub_path[0]) if subs is not None: write_voiceactivity(path, subs) result = sample_from_file(path, sr, n_frame, n_step, audio_ext, anno_ext, balance) for file in glob.glob('{}.*'.format(path)): try: os.remove(file) except Exception as ex: print_err(ex) return result def get_files(root, filter='*', max_files=None): files = [] count = 0 for file in glob.glob(os.path.join(root, filter)): if max_files is not None and count >= max_files: break path, _ = os.path.splitext(file) files.append(path) count = count + 1 return files def get_urls(path, filter='', max_urls=None): table = parse_list(path) urls = [] for item in table: if item and filter in item[Header.Url_Untertitel]: urls.append(item[Header.Website]) if max_urls: urls = urls[:min(len(urls), max_urls)] return urls class BalanceMethod(enum.Enum): Down, Up = range(2) def get_balance_indices(labels,method): n_classes = np.max(labels)+1 select = None if method is not None: initial_n = [np.sum(labels==i) for i in range(n_classes)] if np.min(initial_n) == 0: return [] target_n = np.max(initial_n) if method == BalanceMethod.Up else np.min(initial_n) select = np.zeros((n_classes * target_n,), dtype=np.int64) for i in range(n_classes): select[i*target_n:i*target_n+target_n] = np.random.choice(np.where(labels==i)[0], target_n, replace=method == BalanceMethod.Up) np.random.shuffle(select) return select if __name__ == '__main__': # framing x = np.arange(0,10) print(x) print(audio_to_frames(x, 3)) print(audio_to_frames(x, 3, n_step=2)) print(audio_to_frames(x, 4, n_step=2)) # sample from file sr = 44100 n_frame = 44100 n_step = 44100 // 2 files = get_files('data', filter='*.m4a') frames, labels = sample_from_file(files[0], sr, n_frame, n_step, audio_ext='.m4a', anno_ext='.voiceactivity.annotation') audio_to_file(r'data\test1.wav', frames[labels == 1], sr) # sample from url urls = get_urls(r'..\youtube\filme.json', filter='ZDF') frames, labels = sample_from_url(urls[0], sr, n_frame, n_step) audio_to_file(r'data\test2.wav', frames[labels == 1], sr)