import argparse
import glob
import os

import numpy as np
import python_speech_features
import scipy.interpolate
import scipy.io.wavfile

import timit
import utils

# This is based on Table I and Section II of
# Kai-Fu Lee and Hsiao-Wuen Hon: Speaker-Independent Phone Recognition Using
# Hidden Markov Models. IEEE Transactions on Acoustics, Speech, and Signal
# Processing. 1989.
FOLDS = {
    'ae': 'ae',
    'ah': 'ah', 'ax': 'ah', 'ax-h': 'ah',
    'ao': 'ao', 'aa': 'ao',
    'aw': 'aw',
    'ay': 'ay',
    'b': 'b',
    'ch': 'ch',
    'd': 'd',
    'dh': 'dh',
    'dx': 'dx',
    'eh': 'eh',
    'el': 'el', 'l': 'el',
    'en': 'en', 'n': 'en', 'nx': 'en',
    'er': 'er', 'axr': 'er',
    'ey': 'ey',
    'f': 'f',
    'g': 'g',
    'h#': 'h#', 'pcl': 'h#', 'tcl': 'h#', 'kcl': 'h#', 'bcl': 'h#',
    'dcl': 'h#', 'gcl': 'h#', 'epi': 'h#', 'pau': 'h#',
    'hh': 'hh', 'hv': 'hh',
    'ih': 'ih', 'ix': 'ih',
    'iy': 'iy',
    'jh': 'jh',
    'k': 'k',
    'm': 'm', 'em': 'm',
    'ng': 'ng', 'eng': 'ng',
    'ow': 'ow',
    'oy': 'oy',
    'p': 'p',
    'q': 'q',
    'r': 'r',
    's': 's',
    'sh': 'sh', 'zh': 'sh',
    't': 't',
    'th': 'th',
    'uh': 'uh',
    'uw': 'uw', 'ux': 'uw',
    'v': 'v',
    'w': 'w',
    'y': 'y',
    'z': 'z',
}

# The 40 folded phoneme classes, in sorted order, define the label ids.
TOKEN_VOCAB = sorted(set(FOLDS.values()))
assert len(TOKEN_VOCAB) == 40

DEFAULT_DATA_DIR = timit.DEFAULT_DATA_DIR


def _audio_and_labels(prefix):
    """
    Load and align a TIMIT wav file with its folded phonemes.

    Args:
      prefix: A string. Path prefix (without extension) of a matching
        `.wav` / `.phn` file pair.

    Returns:
      A tuple, `(audio, labels)`. A 1-D float array and a 1-D int array,
      both with the same shape.

    Raises:
      RuntimeError: If the wav file's sampling rate differs from
        `timit.SAMPLE_RATE`, or if some samples inside the annotated range
        received no phoneme label.
    """
    rate, audio = scipy.io.wavfile.read(prefix + '.wav')
    if rate != timit.SAMPLE_RATE:
        raise RuntimeError('Encountered an unexpected sampling rate of %d in %s' % (rate, prefix))
    # `np.float` and `np.object` were removed in NumPy 1.24; use the
    # builtin types instead.
    audio = np.asarray(audio, dtype=float)
    phoneme_data = np.loadtxt(prefix + '.phn', dtype=object, comments=None, delimiter=' ',
                              converters={0: int, 1: int,
                                          2: lambda x: str(x, encoding='ascii')})
    n = np.arange(audio.size)
    # -1 marks samples not covered by any phoneme annotation.
    labels = np.full([audio.size], -1, dtype=np.int8)
    for start, end, phoneme in phoneme_data:
        phoneme = FOLDS[phoneme]
        labels[(n >= start) & (n < end)], = utils.tokens_to_ids([phoneme], TOKEN_VOCAB)
    # Trim the audio (and labels) to the annotated sample range.
    audio_start = np.min(phoneme_data[:, 0])
    audio_end = np.max(phoneme_data[:, 1])
    audio = audio[audio_start:audio_end]
    labels = labels[audio_start:audio_end]
    if np.any(labels == -1):
        raise RuntimeError('Encountered incomplete labeling in %s' % prefix)
    return audio, labels


def _mfcc_and_labels(audio, labels):
    """
    Convert to MFCC features and corresponding (interpolated) labels.

    Args:
      audio: A 1-D float array of samples at `timit.SAMPLE_RATE`.
      labels: A 1-D int array with the same shape as `audio`.

    Returns:
      A tuple, `(mfcc_features, mfcc_labels)`. A 2-D float array with shape
      `[num_frames, num_cepstra]` and a 1-D array with shape `[num_frames]`
      holding the per-frame labels.
    """
    mfcc_sample_rate = 100.0  # Frames per second, i.e. a 10 ms step.
    mfcc_features = python_speech_features.mfcc(audio, samplerate=timit.SAMPLE_RATE,
                                                winlen=0.025, winstep=1.0/mfcc_sample_rate,
                                                lowfreq=85.0, highfreq=timit.SAMPLE_RATE/2,
                                                winfunc=np.hamming)
    t_audio = np.linspace(0.0, audio.shape[0] * 1.0 / timit.SAMPLE_RATE,
                          audio.size, endpoint=False)
    t_mfcc = np.linspace(0.0, mfcc_features.shape[0] * 1.0 / mfcc_sample_rate,
                         mfcc_features.shape[0], endpoint=False)
    # Give each MFCC frame the label of the nearest audio sample in time.
    interp_func = scipy.interpolate.interp1d(t_audio, labels, kind='nearest')
    mfcc_labels = interp_func(t_mfcc)
    return mfcc_features, mfcc_labels


def load(data_dir=DEFAULT_DATA_DIR, mfcc=True):
    """
    Load all standardized TIMIT data with folded phoneme labels.

    Args:
      data_dir: A string. The data directory.
      mfcc: A boolean. If True, return MFCC sequences and their
        corresponding label sequences. Otherwise, return raw audio
        sequences and their associated label sequences.

    Returns:
      A tuple with 6 elements: train inputs, train labels, val inputs,
      val labels, test inputs, test labels. Each entry is a list of
      sequences. All input sequences are 2-D float arrays with shape
      `[length, values_per_step]` and all label sequences are 1-D int8
      arrays with shape `[length]`.

    Raises:
      ValueError: If the expected `.npy` files are missing from `data_dir`.
    """
    # `kind` rather than `type`, to avoid shadowing the builtin.
    kinds = ['mfcc', 'mfcc_labels'] if mfcc else ['audio', 'labels']
    ret = []
    for name in ['train', 'val', 'test']:
        for kind in kinds:
            path = os.path.join(data_dir, name + '_' + kind + '.npy')
            if not os.path.exists(path):
                raise ValueError('Data not found in %s. Run timit.py and timitphonemerec.py.' % data_dir)
            # The files hold variable-length sequences as object arrays,
            # which require pickle support to load (NumPy >= 1.16.3 refuses
            # by default).
            data = np.load(path, allow_pickle=True)
            if kind == 'audio':
                # Raw audio is stored 1-D; add a values-per-step axis.
                data = [seq[:, np.newaxis] for seq in data]
            ret.append(data)
    return tuple(ret)


def load_split(data_dir=DEFAULT_DATA_DIR, val=True, mfcc=True, normalize=True):
    """
    Load a standardized-TIMIT train, test split.

    Args:
      data_dir: A string. The data directory.
      val: A boolean. If True, return the validation set as the test set.
      mfcc: A boolean. If True, return MFCC sequences and their
        corresponding label sequences. Otherwise, return raw audio
        sequences and their associated label sequences.
      normalize: A boolean. If True, normalize each sequence individually
        by centering / scaling.

    Returns:
      A tuple, `(train_inputs, train_labels, test_inputs, test_labels)`.
      Each is a list of sequences. All inputs are 2-D float arrays with
      shape `[length, values_per_step]` and all labels are 1-D int8 arrays
      with shape `[length]`.
    """
    sequence_lists = load(data_dir=data_dir, mfcc=mfcc)
    train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = sequence_lists
    if val:
        test_inputs = val_inputs
        test_labels = val_labels
    if normalize:
        # Per-sequence, per-feature centering and scaling.
        train_inputs = [seq - np.mean(seq, axis=0, keepdims=True) for seq in train_inputs]
        train_inputs = [seq / np.std(seq, axis=0, keepdims=True) for seq in train_inputs]
        test_inputs = [seq - np.mean(seq, axis=0, keepdims=True) for seq in test_inputs]
        test_inputs = [seq / np.std(seq, axis=0, keepdims=True) for seq in test_inputs]
    return train_inputs, train_labels, test_inputs, test_labels


def _save_sequences(path, seqs):
    """Save variable-length sequences as a 1-D object array.

    An explicit object array is required because modern NumPy refuses to
    coerce ragged sequence lists implicitly.
    """
    arr = np.empty(len(seqs), dtype=object)
    for i, seq in enumerate(seqs):
        arr[i] = seq
    np.save(path, arr)


def main():
    """ Further process and simplify standardized TIMIT. """
    description = main.__doc__
    formatter_class = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(description=description, formatter_class=formatter_class)
    parser.add_argument('--data_dir', type=str, default=DEFAULT_DATA_DIR,
                        help='''The standardized-TIMIT data directory.''')
    args = parser.parse_args()
    if not os.path.exists(args.data_dir):
        raise ValueError('%s does not exist. Did you run timit.py?' % args.data_dir)
    for name in ['train', 'val', 'test']:
        print('Processing and saving the %s set..' % name)
        pattern = os.path.join(args.data_dir, name, '*', '*.wav')
        # Strip the '.wav' extension to form the per-utterance prefixes.
        prefixes = [path[:-4] for path in sorted(glob.glob(pattern))]
        audio_label_pairs = [_audio_and_labels(prefix) for prefix in prefixes]
        mfcc_label_pairs = [_mfcc_and_labels(*pair) for pair in audio_label_pairs]
        audio_seqs, label_seqs = zip(*audio_label_pairs)
        _save_sequences(os.path.join(args.data_dir, name + '_audio.npy'), audio_seqs)
        _save_sequences(os.path.join(args.data_dir, name + '_labels.npy'), label_seqs)
        mfcc_seqs, mfcc_label_seqs = zip(*mfcc_label_pairs)
        _save_sequences(os.path.join(args.data_dir, name + '_mfcc.npy'), mfcc_seqs)
        _save_sequences(os.path.join(args.data_dir, name + '_mfcc_labels.npy'), mfcc_label_seqs)


if __name__ == '__main__':
    main()