Python text.text_to_sequence() Examples

The following are 28 code examples of text.text_to_sequence(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module text, or try the search function.
Example #1
Source File: synthesis.py    From FastSpeech with MIT License 6 votes vote down vote up
def synthesis(model, text, alpha=1.0):
    """Encode ``text`` and run one forward pass of the wrapped model on it.

    Args:
        model: Wrapper exposing the network as ``model.module`` (presumably
            ``nn.DataParallel`` -- confirm against the caller).
        text: Input handed to ``text_to_sequence`` with ``hp.text_cleaners``.
        alpha: Forwarded to ``forward`` as the ``alpha`` keyword; its meaning
            is defined by the model, not here.

    Returns:
        A 4-tuple: the first batch item of ``mel`` and of ``mel_postnet``
        (each moved to CPU with dims 0 and 1 swapped), followed by the full
        ``mel`` and ``mel_postnet`` with dims 1 and 2 swapped.
    """
    token_ids = np.array(text_to_sequence(text, hp.text_cleaners))
    batch_ids = np.stack([token_ids])  # leading batch axis of size 1

    # 1-based position index for every column of the batched id array.
    n_tokens = batch_ids.shape[1]
    positions = np.stack([np.array(range(1, n_tokens + 1))])

    with torch.no_grad():
        sequence = torch.autograd.Variable(torch.from_numpy(batch_ids)).cuda().long()
        src_pos = torch.autograd.Variable(torch.from_numpy(positions)).cuda().long()

        mel, mel_postnet = model.module.forward(sequence, src_pos, alpha=alpha)

        first_mel = mel[0].cpu().transpose(0, 1)
        first_postnet = mel_postnet[0].cpu().transpose(0, 1)
        return (first_mel, first_postnet,
                mel.transpose(1, 2), mel_postnet.transpose(1, 2))
Example #2
Source File: dataset.py    From LightSpeech with MIT License 6 votes vote down vote up
def __getitem__(self, idx):
        """Assemble the training sample stored under index ``idx``.

        Loads three ``<idx>.npy`` arrays (from ``hparams.mel_tacotron2``,
        ``hparams.cemb_path`` and ``hparams.alignment_path``) and encodes the
        matching entry of ``self.text``.

        Returns:
            dict with keys ``"text"``, ``"mel_tac2_target"``, ``"cemb"``
            and ``"D"``.
        """
        # Ground-truth mel loading is disabled in this version:
        # mel_gt_name = os.path.join(
        #     hparams.mel_ground_truth, "ljspeech-mel-%05d.npy" % (idx+1))
        # mel_gt_target = np.load(mel_gt_name)
        file_name = str(idx) + ".npy"

        # The stored array is transposed before use.
        mel_tac2_target = np.load(os.path.join(hparams.mel_tacotron2, file_name)).T
        cemb = np.load(os.path.join(hparams.cemb_path, file_name))
        D = np.load(os.path.join(hparams.alignment_path, file_name))

        # Drop the final character of the stored line (presumably a trailing
        # newline -- confirm where self.text is populated).
        raw_line = self.text[idx]
        character = np.array(
            text_to_sequence(raw_line[:-1], hparams.text_cleaners))

        return {"text": character,
                "mel_tac2_target": mel_tac2_target,
                "cemb": cemb,
                "D": D}
Example #3
Source File: inference.py    From LightSpeech with MIT License 6 votes vote down vote up
def synthesis(model, text, alpha=1.0):
    """Convert ``text`` to ids and run ``model.module.inference`` on it.

    Args:
        model: Wrapper exposing the network as ``model.module`` (presumably
            ``nn.DataParallel`` -- confirm against the caller).
        text: Input handed to ``text_to_sequence`` with ``hp.text_cleaners``.
        alpha: Forwarded positionally to ``inference``; its meaning is
            defined by the model, not here.

    Returns:
        A pair: the first batch item of the output moved to CPU with dims 0
        and 1 swapped, and the full output with dims 1 and 2 swapped.
    """
    token_ids = np.array(text_to_sequence(text, hp.text_cleaners))
    batch_ids = np.stack([token_ids])  # leading batch axis of size 1

    with torch.no_grad():
        sequence = torch.autograd.Variable(torch.from_numpy(batch_ids)).cuda().long()

        # A three-output variant is kept here for reference only:
        #   mel, mel_postnet_1, mel_postnet_2 = model.module.inference(
        #       sequence, alpha)
        # and returned the [0].cpu().transpose(0, 1) and .transpose(1, 2)
        # views of each of the three outputs.
        mel = model.module.inference(sequence, alpha)

        first_item = mel[0].cpu().transpose(0, 1)
        return first_item, mel.transpose(1, 2)
Example #4
Source File: utils.py    From LightSpeech with MIT License 6 votes vote down vote up
def load_data_from_tacotron2(txt, model):
    """Run ``model.inference`` on ``txt`` and return its outputs as NumPy data.

    Args:
        txt: Text passed to ``text.text_to_sequence`` with
            ``hparams.text_cleaners``.
        model: Object whose ``inference`` returns ``([_, mel, _, alignment],
            cemb)``.

    Returns:
        ``(mel, cemb, D)`` where ``mel`` and ``cemb`` are the first batch
        items converted to NumPy and ``D`` is ``np.array(get_D(alignment))``.
    """
    token_ids = np.array(text.text_to_sequence(txt, hparams.text_cleaners))
    # Batch of one, as int64 on the GPU.
    batch = torch.from_numpy(np.stack([token_ids])).long().cuda()

    with torch.no_grad():
        (_, mel_out, _, align_out), cemb_out = model.inference(batch)

    alignment = align_out[0].cpu().numpy()
    cemb = cemb_out[0].cpu().numpy()
    D = np.array(get_D(alignment))
    mel = mel_out[0].cpu().numpy()

    return mel, cemb, D
Example #5
Source File: utils.py    From LightSpeech with MIT License 6 votes vote down vote up
def load_data(txt, mel, model):
    """Run ``model.forward`` on a (text, mel) pair and return NumPy outputs.

    Args:
        txt: Text passed to ``text.text_to_sequence`` with
            ``hparams.text_cleaners``.
        mel: Array-like with a ``.T`` attribute; it is transposed and batched
            before being sent to the model.
        model: Object whose ``forward`` accepts the 5-tuple built below and
            returns ``([_, mel, _, alignment], cemb)``.

    Returns:
        ``(mel_tacotron2, cemb, D)`` -- first batch items as NumPy arrays,
        with ``D`` being ``np.array(get_D(alignment))``.
    """
    token_ids = np.array(text.text_to_sequence(txt, hparams.text_cleaners))
    character = torch.from_numpy(np.stack([token_ids])).long().cuda()
    text_length = torch.Tensor([character.size(1)]).long().cuda()

    mel_batch = torch.from_numpy(np.stack([mel.T])).float().cuda()
    max_len = mel_batch.size(2)
    output_length = torch.Tensor([max_len]).long().cuda()

    inputs = (character, text_length, mel_batch, max_len, output_length)

    with torch.no_grad():
        (_, mel_out, _, align_out), cemb_out = model.forward(inputs)

    alignment = align_out[0].cpu().numpy()
    cemb = cemb_out[0].cpu().numpy()
    D = np.array(get_D(alignment))
    mel_tacotron2 = mel_out[0].cpu().numpy()

    return mel_tacotron2, cemb, D
Example #6
Source File: datafeeder.py    From tacotron with MIT License 6 votes vote down vote up
def _get_next_example(self):
    '''Returns the next example as (input, mel_target, linear_target, length).

    Advances self._offset by one; when the offset has run past the end of
    self._metadata it is reset to 0 and the metadata is shuffled in place.
    The last tuple element is len(linear_target).
    '''
    exhausted = self._offset >= len(self._metadata)
    if exhausted:
      self._offset = 0
      random.shuffle(self._metadata)

    entry = self._metadata[self._offset]
    self._offset += 1

    transcript = entry[3]
    # With probability _p_cmudict (and only when a dictionary is loaded),
    # each space-separated word goes through _maybe_get_arpabet.
    if self._cmudict and random.random() < _p_cmudict:
      transcript = ' '.join(map(self._maybe_get_arpabet, transcript.split(' ')))

    token_ids = text_to_sequence(transcript, self._cleaner_names)
    input_data = np.asarray(token_ids, dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, entry[0]))
    mel_target = np.load(os.path.join(self._datadir, entry[1]))
    return (input_data, mel_target, linear_target, len(linear_target))
Example #7
Source File: datafeeder.py    From arabic-tacotron-tts with MIT License 6 votes vote down vote up
def _get_next_example(self):
    '''Returns the next example as (input, mel_target, linear_target, length).

    Advances self._offset by one; when the offset has run past the end of
    self._metadata it is reset to 0 and the metadata is shuffled in place.
    Every non-empty word of the transcript is passed through
    _maybe_get_arpabet, except the punctuation tokens ",", "." and "-",
    which are kept verbatim.
    '''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    entry = self._metadata[self._offset]
    self._offset += 1

    punctuation = (",", '.', '-')
    converted = [
      word if word in punctuation else self._maybe_get_arpabet(word)
      for word in entry[3].split(' ')
      if word not in (" ", "")  # skip the empty pieces left by repeated spaces
    ]
    transcript = ' '.join(converted)

    token_ids = text_to_sequence(transcript, self._cleaner_names)
    input_data = np.asarray(token_ids, dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, entry[0]))
    mel_target = np.load(os.path.join(self._datadir, entry[1]))
    return (input_data, mel_target, linear_target, len(linear_target))
Example #8
Source File: utils.py    From FastSpeech with MIT License 6 votes vote down vote up
def load_data(txt, mel, model):
    """Feed one (text, mel) pair through ``model.forward`` without gradients.

    Args:
        txt: Text passed to ``text.text_to_sequence`` with
            ``hparams.text_cleaners``.
        mel: Array-like with a ``.T`` attribute; transposed and wrapped in a
            batch of one before use.
        model: Object whose ``forward`` takes the 5-tuple assembled below and
            returns ``([_, mel, _, alignment], cemb)``.

    Returns:
        ``(mel_tacotron2, cemb, D)``: the first batch item of the model's mel
        and ``cemb`` outputs as NumPy arrays, plus ``np.array(get_D(...))``
        computed from the first alignment.
    """
    ids = np.array(text.text_to_sequence(txt, hparams.text_cleaners))
    character = torch.from_numpy(np.stack([ids])).long().cuda()
    text_length = torch.Tensor([character.size(1)]).long().cuda()

    target = torch.from_numpy(np.stack([mel.T])).float().cuda()
    max_len = target.size(2)
    output_length = torch.Tensor([max_len]).long().cuda()

    with torch.no_grad():
        outputs, cemb_batch = model.forward(
            (character, text_length, target, max_len, output_length))
    _, mel_batch, _, align_batch = outputs

    alignment = align_batch[0].cpu().numpy()
    cemb = cemb_batch[0].cpu().numpy()
    D = np.array(get_D(alignment))
    mel_tacotron2 = mel_batch[0].cpu().numpy()

    return mel_tacotron2, cemb, D
Example #9
Source File: datafeeder.py    From libfaceid with MIT License 6 votes vote down vote up
def _get_next_example(self):
    '''Loads one example and returns (input, mel_target, linear_target, length).

    The read position self._offset is incremented on every call; once it has
    passed the last metadata entry it wraps to 0 and self._metadata is
    reshuffled in place. The final tuple element is len(linear_target).
    '''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    record = self._metadata[self._offset]
    self._offset += 1

    sentence = record[3]
    use_arpabet = self._cmudict and random.random() < _p_cmudict
    if use_arpabet:
      words = sentence.split(' ')
      sentence = ' '.join(self._maybe_get_arpabet(w) for w in words)

    input_data = np.asarray(
      text_to_sequence(sentence, self._cleaner_names), dtype=np.int32)
    linear_path = os.path.join(self._datadir, record[0])
    linear_target = np.load(linear_path)
    mel_path = os.path.join(self._datadir, record[1])
    mel_target = np.load(mel_path)
    return (input_data, mel_target, linear_target, len(linear_target))
Example #10
Source File: utils.py    From FastSpeech with MIT License 6 votes vote down vote up
def load_data_from_tacotron2(txt, model):
    """Synthesize ``txt`` with ``model.inference`` and return NumPy results.

    Args:
        txt: Text passed to ``text.text_to_sequence`` with
            ``hparams.text_cleaners``.
        model: Object whose ``inference`` returns ``([_, mel, _, alignment],
            cemb)``.

    Returns:
        ``(mel, cemb, D)``: first batch items of the mel and ``cemb`` outputs
        as NumPy arrays, and ``np.array(get_D(alignment))`` for the first
        alignment.
    """
    ids = np.array(text.text_to_sequence(txt, hparams.text_cleaners))
    character = torch.from_numpy(np.stack([ids])).long().cuda()

    with torch.no_grad():
        outputs, cemb_batch = model.inference(character)
    _, mel_batch, _, align_batch = outputs

    alignment = align_batch[0].cpu().numpy()
    cemb = cemb_batch[0].cpu().numpy()
    D = np.array(get_D(alignment))
    mel = mel_batch[0].cpu().numpy()

    return mel, cemb, D
Example #11
Source File: mkgta.py    From Tacotron2-PyTorch with MIT License 6 votes vote down vote up
def infer(wav_path, text, model):
	"""Run ``model.teacher_infer`` on a (wav, text) pair and return a mel array.

	The mel spectrogram of ``wav_path`` is trimmed along its last axis to a
	multiple of ``hps.n_frames_per_step`` before being fed to the model. Both
	inputs are duplicated into a batch of two; only the first post-net output
	is used.

	Returns:
		When no trimming was needed, the converted post-net output. Otherwise
		the original mel array with everything but its last ``r`` columns
		overwritten in place by the post-net output.
	"""
	token_ids = torch.IntTensor(text_to_sequence(text, hps.text_cleaners))
	sequence = to_var(token_ids[None, :]).long()

	mel = melspectrogram(load_wav(wav_path))
	mel_in = to_var(torch.Tensor([mel]))

	# Frames left over after grouping into steps of n_frames_per_step.
	leftover = mel_in.shape[2] % hps.n_frames_per_step
	if leftover != 0:
		mel_in = mel_in[:, :, :-leftover]

	# Duplicate each input along dim 0 (reason not visible here).
	sequence = torch.cat([sequence, sequence], 0)
	mel_in = torch.cat([mel_in, mel_in], 0)
	_, mel_outputs_postnet, _, _ = model.teacher_infer(sequence, mel_in)

	predicted = to_arr(mel_outputs_postnet[0])
	if leftover == 0:
		return predicted
	# Keep the untouched tail of the input mel and overwrite the rest.
	mel[:, :-leftover] = predicted
	return mel
Example #12
Source File: datafeeder.py    From vae_tacotron with MIT License 6 votes vote down vote up
def _get_next_example(self):
    '''Reads the next example: (input, mel_target, linear_target, length).

    self._offset selects the metadata entry and is bumped by one per call.
    When it reaches the end, it is reset to 0 and self._metadata is shuffled
    in place before the entry is read. The fourth tuple element is
    len(linear_target).
    '''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    item = self._metadata[self._offset]
    self._offset += 1

    phrase = item[3]
    if self._cmudict and random.random() < _p_cmudict:
      # Word-by-word replacement via _maybe_get_arpabet.
      replaced = [self._maybe_get_arpabet(token) for token in phrase.split(' ')]
      phrase = ' '.join(replaced)

    encoded = text_to_sequence(phrase, self._cleaner_names)
    input_data = np.asarray(encoded, dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, item[0]))
    mel_target = np.load(os.path.join(self._datadir, item[1]))

    return (input_data, mel_target, linear_target, len(linear_target))
Example #13
Source File: synthesis.py    From Tacotron-pytorch with Apache License 2.0 6 votes vote down vote up
def generate(model, text):
    """Synthesize ``text`` with ``model`` and return the audio as WAV bytes.

    The former ``Variable(..., volatile=True)`` wrappers are replaced by a
    ``torch.no_grad()`` block: since PyTorch 0.4 the ``volatile`` flag is
    ignored (it only emits a warning), so the old code silently tracked
    gradients during inference.

    Args:
        model: Network whose ``forward(characters, mel_input)`` returns a
            pair; the second element is treated as the linear spectrogram.
        text: Input text, encoded with the comma-separated cleaner names in
            ``hp.cleaners``.

    Returns:
        bytes: Whatever ``save_wav`` wrote into an in-memory buffer.
    """
    # Text to index sequence (batch of one).
    cleaner_names = [name.strip() for name in hp.cleaners.split(',')]
    token_ids = np.asarray(text_to_sequence(text, cleaner_names), dtype=np.int32)
    seq = np.expand_dims(token_ids, axis=0)

    # Provide the initial all-zero [GO] frame.
    go_frame = np.zeros([seq.shape[0], hp.num_mels, 1], dtype=np.float32)

    with torch.no_grad():
        characters = torch.from_numpy(seq).long().cuda()
        mel_input = torch.from_numpy(go_frame).float().cuda()
        _, linear_output = model.forward(characters, mel_input)

    # Spectrogram to wav, cut at the endpoint reported by find_endpoint.
    wav = inv_spectrogram(linear_output[0].detach().cpu().numpy())
    wav = wav[:find_endpoint(wav)]
    out = io.BytesIO()
    save_wav(wav, out)

    return out.getvalue()
Example #14
Source File: inference.py    From Tacotron2-PyTorch with MIT License 5 votes vote down vote up
def infer(text, model):
	"""Encode ``text`` and return selected outputs of ``model.inference``.

	Returns:
		``(mel_outputs, mel_outputs_postnet, alignments)`` -- the first,
		second and fourth values returned by ``model.inference``; the third
		is discarded.
	"""
	token_ids = torch.IntTensor(text_to_sequence(text, hps.text_cleaners))
	batch = to_var(token_ids[None, :]).long()  # add a leading batch dim
	mel, mel_postnet, _, attention = model.inference(batch)
	return (mel, mel_postnet, attention)
Example #15
Source File: dataset.py    From FastSpeech with MIT License 5 votes vote down vote up
def __getitem__(self, idx):
        """Assemble the training sample stored under index ``idx``.

        Loads the ground-truth mel (file numbering is 1-based, hence
        ``idx+1``) and the ``<idx>.npy`` array from
        ``hparams.alignment_path``, then encodes the matching entry of
        ``self.text``.

        Returns:
            dict with keys ``"text"``, ``"mel_target"`` and ``"D"``.
        """
        mel_file = "ljspeech-mel-%05d.npy" % (idx+1)
        mel_gt_target = np.load(os.path.join(hparams.mel_ground_truth, mel_file))
        D = np.load(os.path.join(hparams.alignment_path, str(idx)+".npy"))

        # Drop the final character of the stored line (presumably a trailing
        # newline -- confirm where self.text is populated).
        raw_line = self.text[idx]
        character = np.array(
            text_to_sequence(raw_line[:-1], hparams.text_cleaners))

        return {"text": character,
                "mel_target": mel_gt_target,
                "D": D}
Example #16
Source File: text_test.py    From tacotron with MIT License 5 votes vote down vote up
def test_text_to_sequence():
  '''Checks text_to_sequence against fixed (text, cleaners) -> ids cases.'''
  cases = [
    ('', [], [1]),
    ('Hi!', [], [9, 36, 54, 1]),
    ('"A"_B', [], [2, 3, 1]),
    ('A {AW1 S} B', [], [2, 64, 83, 132, 64, 3, 1]),
    ('Hi', ['lowercase'], [35, 36, 1]),
    ('A {AW1 S}  B', ['english_cleaners'], [28, 64, 83, 132, 64, 29, 1]),
  ]
  for text, cleaner_names, expected in cases:
    assert text_to_sequence(text, cleaner_names) == expected
Example #17
Source File: synthesizer.py    From tacotron with MIT License 5 votes vote down vote up
def synthesize(self, text):
    '''Synthesizes text and returns the resulting audio as WAV bytes.

    The text is encoded with the comma-separated cleaners in hparams.cleaners,
    fed to the session as a batch of one, and the output is passed through
    audio.inv_preemphasis and cut at audio.find_endpoint before saving.
    '''
    cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    token_ids = text_to_sequence(text, cleaner_names)

    inputs = [np.asarray(token_ids, dtype=np.int32)]
    input_lengths = np.asarray([len(token_ids)], dtype=np.int32)
    feed_dict = {
      self.model.inputs: inputs,
      self.model.input_lengths: input_lengths,
    }

    waveform = self.session.run(self.wav_output, feed_dict=feed_dict)
    waveform = audio.inv_preemphasis(waveform)
    end = audio.find_endpoint(waveform)
    waveform = waveform[:end]

    buffer = io.BytesIO()
    audio.save_wav(waveform, buffer)
    return buffer.getvalue()
Example #18
Source File: dataset.py    From Tacotron2-PyTorch with MIT License 5 votes vote down vote up
def get_text(self, text):
		"""Encode ``text`` with ``hps.text_cleaners`` and wrap it in an IntTensor."""
		token_ids = text_to_sequence(text, hps.text_cleaners)
		return torch.IntTensor(token_ids)
Example #19
Source File: train_tacotron.py    From Tacotron-Wavenet-Vocoder-Korean with MIT License 5 votes vote down vote up
def create_batch_inputs_from_texts(texts):
    """Encode ``texts`` into model inputs and report lossy encodings.

    Each text is encoded with ``text_to_sequence`` and the result is passed
    through ``_prepare_inputs``. Every prepared sequence is then decoded
    again; when the decoded text differs from ``h2j(text)`` both versions
    are logged.

    Returns:
        ``(inputs, input_lengths)`` where ``input_lengths`` is an int32 array
        holding ``len()`` of each prepared input.
    """
    encoded = list(map(text_to_sequence, texts))
    inputs = _prepare_inputs(encoded)
    # NOTE(review): lengths are measured on the prepared inputs, not on the
    # raw sequences -- confirm this is intended if _prepare_inputs pads.
    input_lengths = np.asarray(list(map(len, inputs)), dtype=np.int32)

    line_template = " [{}] {}"
    separator = "="*30
    for idx, (seq, text) in enumerate(zip(inputs, texts)):
        recovered_text = sequence_to_text(seq, skip_eos_and_pad=True)
        if recovered_text == h2j(text):
            continue
        log(line_template.format(idx, text))
        log(line_template.format(idx, recovered_text))
        log(separator)

    return inputs, input_lengths
Example #20
Source File: wavloader.py    From MelNet with MIT License 5 votes vote down vote up
def __getitem__(self, idx):
        """Return ``(seq, source, target)`` for dataset entry ``idx``.

        ``self.dataset[idx]`` is read as ``(wav_path, text)``. The text is
        encoded according to ``self.hp.data.name``; the audio is read at
        ``self.hp.audio.sr``, turned into a normalized mel by ``self.melgen``
        and split by ``self.tierutil.cut_divide_tiers`` for ``self.tier``.

        Raises:
            ValueError: if ``self.hp.data.name`` is neither ``'KSS'`` nor
                ``'Blizzard'``. Previously this case left ``seq`` unbound
                and only failed at the ``return`` statement, after all the
                audio work had been done.
        """
        text = self.dataset[idx][1]
        dataset_name = self.hp.data.name
        if dataset_name == 'KSS':
            seq = text_to_sequence(text)
        elif dataset_name == 'Blizzard':
            seq = process_blizzard(text)
        else:
            raise ValueError(
                "Unsupported dataset name {!r}; expected 'KSS' or 'Blizzard'"
                .format(dataset_name))

        wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
        # wav = cut_wav(self.wavlen, wav)
        mel = self.melgen.get_normalized_mel(wav)
        source, target = self.tierutil.cut_divide_tiers(mel, self.tier)

        return seq, source, target
Example #21
Source File: model.py    From MelNet with MIT License 5 votes vote down vote up
def sample(self, condition):
        """Generate a spectrogram tier by tier, conditioned on ``condition``.

        Tier 1 is filled one (frequency bin, time step) cell at a time: the
        running tensor ``x`` grows by one zero-filled step along its last
        dim per outer iteration, and each cell is overwritten with the
        matching cell of a ``sample_gmm`` draw. Tiers ``2..hp.model.tier``
        each produce one draw that is merged into ``x`` via
        ``self.tierutil.interleave``.

        Args:
            condition: Text passed to ``text_to_sequence``; only used by
                tier 1 when ``self.infer_hp.conditional`` is truthy.

        Returns:
            The tensor ``x`` after the last tier has been interleaved.
        """
        # NOTE(review): seq is not moved with .cuda(), unlike the two length
        # tensors below -- confirm the tier handles the device itself.
        seq = torch.from_numpy(text_to_sequence(condition)).long().unsqueeze(0)
        input_lengths = torch.LongTensor([seq.shape[1]]).cuda()
        audio_lengths = torch.LongTensor([0]).cuda()
        n_freq = self.n_mels // self.f_div

        ## Tier 1 ##
        x = None
        tqdm.write('Tier 1')
        for t in tqdm(range(self.args.timestep // self.t_div)):
            audio_lengths += 1
            # Append one all-zero time step for this iteration to fill in.
            blank_step = torch.zeros((1, n_freq, 1)).cuda()
            x = blank_step if x is None else torch.cat([x, blank_step], dim=-1)
            for m in tqdm(range(n_freq)):
                torch.cuda.synchronize()
                if self.infer_hp.conditional:
                    mu, std, pi, _ = self.tiers[1](x, seq, input_lengths, audio_lengths)
                else:
                    mu, std, pi = self.tiers[1](x, audio_lengths)
                drawn = sample_gmm(mu, std, pi)
                x[:, m, t] = drawn[:, m, t]

        ## Tier 2~N ##
        for tier in tqdm(range(2, self.hp.model.tier + 1)):
            tqdm.write('Tier %d' % tier)
            mu, std, pi = self.tiers[tier](x)
            drawn = sample_gmm(mu, std, pi)
            x = self.tierutil.interleave(x, drawn, tier + 1)

        return x
Example #22
Source File: text_test.py    From arabic-tacotron-tts with MIT License 5 votes vote down vote up
def test_text_to_sequence():
  '''Checks text_to_sequence on brace-delimited phoneme input.

  The two-word input is expected to produce the same ids under each of the
  'lowercase', 'english_cleaners' and 'arabic_cleaners' cleaners.
  '''
  one_word = '{t a s d ii0 d a t i1 n}'
  one_word_ids = [49, 29, 48, 32, 38, 32, 29, 49, 37, 44, 1]
  two_words = '{t a s d ii0 d a t i1 n} {s t a E S A t}'
  two_words_ids = [49, 29, 48, 32, 38, 32, 29, 49, 37, 44, 11, 48, 49, 29, 18, 22, 15, 49, 1]

  assert text_to_sequence('', []) == [1]
  assert text_to_sequence(one_word, []) == one_word_ids
  for cleaner in ('lowercase', 'english_cleaners', 'arabic_cleaners'):
    assert text_to_sequence(two_words, [cleaner]) == two_words_ids
  # Disabled cases:
  # assert text_to_sequence('Hi', ['lowercase']) == [35, 36, 1]
  # assert text_to_sequence('A {AW1 S}  B', ['english_cleaners']) == [28, 64, 83, 132, 64, 29, 1]


# def test_sequence_to_text():
#   assert sequence_to_text([]) == ''
#   assert sequence_to_text([1]) == '~'
#   assert sequence_to_text([9, 36, 54, 1]) == 'Hi!~'
#   assert sequence_to_text([2, 64, 83, 132, 64, 3]) == 'A {AW1 S} B' 
Example #23
Source File: synthesizer.py    From arabic-tacotron-tts with MIT License 5 votes vote down vote up
def synthesize(self, text):
    '''Synthesizes text and returns the resulting audio as WAV bytes.

    The text first goes through arpa.to_arpa, is then encoded with the
    comma-separated cleaners in hparams.cleaners and fed to the session as a
    batch of one. The output is passed through audio.inv_preemphasis and cut
    at audio.find_endpoint before saving.
    '''
    arpa_text = arpa.to_arpa(text)
    cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    token_ids = text_to_sequence(arpa_text, cleaner_names)

    feed_dict = {
      self.model.inputs: [np.asarray(token_ids, dtype=np.int32)],
      self.model.input_lengths: np.asarray([len(token_ids)], dtype=np.int32),
    }
    waveform = audio.inv_preemphasis(
      self.session.run(self.wav_output, feed_dict=feed_dict))
    waveform = waveform[:audio.find_endpoint(waveform)]

    buffer = io.BytesIO()
    audio.save_wav(waveform, buffer)
    return buffer.getvalue()
Example #24
Source File: text_test.py    From libfaceid with MIT License 5 votes vote down vote up
def test_text_to_sequence():
  '''Verifies text_to_sequence output for a fixed table of inputs.'''
  expectations = (
    (('', []), [1]),
    (('Hi!', []), [9, 36, 54, 1]),
    (('"A"_B', []), [2, 3, 1]),
    (('A {AW1 S} B', []), [2, 64, 83, 132, 64, 3, 1]),
    (('Hi', ['lowercase']), [35, 36, 1]),
    (('A {AW1 S}  B', ['english_cleaners']), [28, 64, 83, 132, 64, 29, 1]),
  )
  for call_args, expected_ids in expectations:
    assert text_to_sequence(*call_args) == expected_ids
Example #25
Source File: preprocess.py    From Transformer-TTS with MIT License 5 votes vote down vote up
def __getitem__(self, idx):
        """Build one training sample from row ``idx`` of ``self.landmarks_frame``.

        Column 0 of the row is the file stem (joined onto ``self.root_dir``)
        and column 1 the transcript. The mel array is loaded from
        ``<stem>.pt.npy``.

        Returns:
            dict with keys ``'text'`` (int32 ids), ``'mel'``,
            ``'text_length'``, ``'mel_input'`` (``mel`` shifted down by one
            row with a zero first row), ``'pos_mel'`` and ``'pos_text'``
            (1-based position indices).
        """
        # DataFrame.ix was removed in pandas 1.0; iloc performs the same
        # lookup positionally. NOTE(review): identical to the old .ix call
        # when the frame keeps its default integer index/columns -- confirm
        # where landmarks_frame is loaded.
        file_stem = os.path.join(self.root_dir, self.landmarks_frame.iloc[idx, 0])
        text = self.landmarks_frame.iloc[idx, 1]

        text = np.asarray(text_to_sequence(text, [hp.cleaners]), dtype=np.int32)
        # The old code appended '.wav' and sliced it off again; the stem is
        # used directly here, producing the same path.
        mel = np.load(file_stem + '.pt.npy')
        # Prepend a zero row and drop the last one.
        mel_input = np.concatenate([np.zeros([1, hp.num_mels], np.float32), mel[:-1, :]], axis=0)
        text_length = len(text)
        pos_text = np.arange(1, text_length + 1)
        pos_mel = np.arange(1, mel.shape[0] + 1)

        sample = {'text': text, 'mel': mel, 'text_length': text_length,
                  'mel_input': mel_input, 'pos_mel': pos_mel, 'pos_text': pos_text}

        return sample
Example #26
Source File: synthesis.py    From Transformer-TTS with MIT License 5 votes vote down vote up
def synthesis(text, args):
    """Synthesize ``text`` and write the audio to ``hp.sample_path + "/test.wav"``.

    Two fresh models are built and restored from the checkpoints selected by
    ``args.restore_step1`` ("transformer") and ``args.restore_step2``
    ("postnet"). The first model is run ``args.max_len`` times, each time
    appending the last post-net step to its mel input; the final post-net
    prediction is then converted by the second model and written as a wav.

    The ``stop_token`` output is unpacked but never consulted, so the loop
    always runs the full ``args.max_len`` steps.

    Returns:
        None. The only result is the file written on disk.
    """
    transformer = Model()
    postnet = ModelPostNet()
    transformer.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    postnet.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    token_ids = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(token_ids).unsqueeze(0).cuda()
    # Start from a single all-zero frame.
    mel_input = t.zeros([1,1, 80]).cuda()
    pos_text = t.arange(1, text.size(1)+1).unsqueeze(0).cuda()

    transformer = transformer.cuda()
    postnet = postnet.cuda()
    transformer.train(False)
    postnet.train(False)

    with t.no_grad():
        for _step in tqdm(range(args.max_len)):
            pos_mel = t.arange(1, mel_input.size(1)+1).unsqueeze(0).cuda()
            _, postnet_pred, _, _, _, _ = transformer.forward(
                text, mel_input, pos_text, pos_mel)
            # Feed the newest predicted step back in as the next input.
            newest_step = postnet_pred[:, -1:, :]
            mel_input = t.cat([mel_input, newest_step], dim=1)

        mag_pred = postnet.forward(postnet_pred)

    wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    write(hp.sample_path + "/test.wav", hp.sr, wav)
Example #27
Source File: text_test.py    From vae_tacotron with MIT License 5 votes vote down vote up
def test_text_to_sequence():
  '''Asserts the id sequence text_to_sequence returns for each sample input.'''
  samples = [
    {'text': '', 'cleaners': [], 'ids': [1]},
    {'text': 'Hi!', 'cleaners': [], 'ids': [9, 36, 54, 1]},
    {'text': '"A"_B', 'cleaners': [], 'ids': [2, 3, 1]},
    {'text': 'A {AW1 S} B', 'cleaners': [], 'ids': [2, 64, 83, 132, 64, 3, 1]},
    {'text': 'Hi', 'cleaners': ['lowercase'], 'ids': [35, 36, 1]},
    {'text': 'A {AW1 S}  B', 'cleaners': ['english_cleaners'],
     'ids': [28, 64, 83, 132, 64, 29, 1]},
  ]
  for sample in samples:
    assert text_to_sequence(sample['text'], sample['cleaners']) == sample['ids']
Example #28
Source File: synthesizer.py    From vae_tacotron with MIT License 5 votes vote down vote up
def synthesize(self, text, reference_mel):
    '''Synthesizes text with a reference mel and returns WAV bytes.

    The text is encoded with the comma-separated cleaners in hparams.cleaners;
    reference_mel is converted to a float32 array. Both are fed to the
    session as batches of one. The output is passed through
    audio.inv_preemphasis and cut at audio.find_endpoint before saving.
    '''
    cleaner_names = [name.strip() for name in hparams.cleaners.split(',')]
    token_ids = text_to_sequence(text, cleaner_names)

    feed_dict = {}
    feed_dict[self.model.inputs] = [np.asarray(token_ids, dtype=np.int32)]
    feed_dict[self.model.input_lengths] = np.asarray([len(token_ids)], dtype=np.int32)
    feed_dict[self.model.reference_mel] = [np.asarray(reference_mel, dtype=np.float32)]

    waveform = self.session.run(self.wav_output, feed_dict=feed_dict)
    waveform = audio.inv_preemphasis(waveform)
    cut_at = audio.find_endpoint(waveform)
    waveform = waveform[:cut_at]

    buffer = io.BytesIO()
    audio.save_wav(waveform, buffer)
    return buffer.getvalue()