Python torchtext.vocab.GloVe() Examples

The following are 18 code examples of torchtext.vocab.GloVe(), collected from open-source projects. Each example notes its original project and source file. You may also want to check out all other available functions and classes of the torchtext.vocab module.
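As a quick orientation before the project-specific examples, here is a minimal sketch (not taken from any of the projects below) of loading GloVe vectors directly and looking up a single word; the '6B' name and the 100-dimensional variant are just illustrative choices.

from torchtext.vocab import GloVe

# Download (or load from the local .vector_cache) the 100-dimensional "6B" GloVe vectors.
glove = GloVe(name='6B', dim=100)

# stoi maps a token to its row index; vectors is the embedding matrix.
hello_vec = glove.vectors[glove.stoi['hello']]  # torch.Tensor of shape (100,)
print(hello_vec.shape)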
Example #1
Source File: dataset.py    From controlled-text-generation with BSD 3-Clause "New" or "Revised" License
def __init__(self, emb_dim=50, mbsize=32):
        self.TEXT = data.Field(init_token='<start>', eos_token='<eos>', lower=True, tokenize='spacy', fix_length=16)
        self.LABEL = data.Field(sequential=False, unk_token=None)

        # Only take sentences with length <= 15
        f = lambda ex: len(ex.text) <= 15 and ex.label != 'neutral'

        train, val, test = datasets.SST.splits(
            self.TEXT, self.LABEL, fine_grained=False, train_subtrees=False,
            filter_pred=f
        )

        self.TEXT.build_vocab(train, vectors=GloVe('6B', dim=emb_dim))
        self.LABEL.build_vocab(train)

        self.n_vocab = len(self.TEXT.vocab.itos)
        self.emb_dim = emb_dim

        self.train_iter, self.val_iter, _ = data.BucketIterator.splits(
            (train, val, test), batch_size=mbsize, device=-1,
            shuffle=True, repeat=True
        )
        self.train_iter = iter(self.train_iter)
        self.val_iter = iter(self.val_iter) 
Example #2
Source File: test_vocab.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_vocab_download_glove_vectors(self):
        c = Counter({'hello': 4, 'world': 3, 'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T': 5, 'freq_too_low': 2})

        # Build a vocab and get vectors twice to test caching, then once more
        # to test string aliases.
        for i in range(3):
            if i == 2:
                vectors = "glove.twitter.27B.25d"
            else:
                vectors = GloVe(name='twitter.27B', dim='25')
            v = vocab.Vocab(c, min_freq=3, specials=['<unk>', '<pad>', '<bos>'],
                            vectors=vectors)

            expected_itos = ['<unk>', '<pad>', '<bos>',
                             'ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T', 'hello', 'world']
            expected_stoi = {x: index for index, x in enumerate(expected_itos)}
            self.assertEqual(v.itos, expected_itos)
            self.assertEqual(dict(v.stoi), expected_stoi)

            vectors = v.vectors.numpy()

            # The first 5 entries in each vector.
            expected_twitter = {
                'hello': [-0.77069, 0.12827, 0.33137, 0.0050893, -0.47605],
                'world': [0.10301, 0.095666, -0.14789, -0.22383, -0.14775],
            }

            for word in expected_twitter:
                assert_allclose(vectors[v.stoi[word], :5],
                                expected_twitter[word])

            assert_allclose(vectors[v.stoi['<unk>']], np.zeros(25))
            assert_allclose(vectors[v.stoi['OOV token']], np.zeros(25))
        # Delete the vectors after we're done to save disk space on CI
        if os.environ.get("TRAVIS") == "true":
            zip_file = os.path.join(self.project_root, ".vector_cache",
                                    "glove.twitter.27B.zip")
            conditional_remove(zip_file)
            for dim in ["25", "50", "100", "200"]:
                conditional_remove(os.path.join(self.project_root, ".vector_cache",
                                   "glove.twitter.27B.{}d.txt".format(dim))) 
Example #3
Source File: utils.py    From TextClassificationBenchmark with MIT License
def loadData(opt):
    if not opt.from_torchtext:
        import dataHelper as helper
        return helper.loadData(opt)
    device = 0 if torch.cuda.is_available() else -1

    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True,fix_length=opt.max_seq_len)
    LABEL = data.Field(sequential=False)
    if opt.dataset=="imdb":
        train, test = datasets.IMDB.splits(TEXT, LABEL)
    elif opt.dataset=="sst":
        train, val, test = datasets.SST.splits( TEXT, LABEL, fine_grained=True, train_subtrees=True,
                                               filter_pred=lambda ex: ex.label != 'neutral')
    elif opt.dataset=="trec":
        train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)
    else:
        print("does not support this datset")
        
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train)    
    # print vocab information
    print('len(TEXT.vocab)', len(TEXT.vocab))
    print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

    train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=opt.batch_size,device=device,repeat=False,shuffle=True)

    opt.label_size= len(LABEL.vocab)    
    opt.vocab_size = len(TEXT.vocab)
    opt.embedding_dim= TEXT.vocab.vectors.size()[1]
    opt.embeddings = TEXT.vocab.vectors
    
    return train_iter, test_iter 
Example #4
Source File: load_data.py    From Text-Classification-Pytorch with MIT License
def load_dataset(test_sen=None):

    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which
                 will pad each sequence to have a fix length of 200.
                 
    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an
                  idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding.
                  
    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
    
    """
    
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data
    train_iter, valid_iter, test_iter = data.BucketIterator.splits((train_data, valid_data, test_data), batch_size=32, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

    '''Alternatively we can also use the default configurations'''
    # train_iter, test_iter = datasets.IMDB.iters(batch_size=32)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter 
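The docstring above describes how build_vocab attaches the GloVe vectors to the vocabulary; a typical follow-up, shown here as a sketch rather than code from the original project, is to copy word_embeddings into an nn.Embedding layer and iterate over the batches.

import torch.nn as nn

TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter = load_dataset()

# Initialize an embedding layer from the pretrained GloVe matrix (vocab_size x 300).
embedding = nn.Embedding(vocab_size, word_embeddings.size(1))
embedding.weight.data.copy_(word_embeddings)

for batch in train_iter:
    text, lengths = batch.text   # include_lengths=True yields (token ids, lengths)
    embedded = embedding(text)   # shape: (batch_size, 200, 300) with batch_first=True
    break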
Example #5
Source File: updown_captioner.py    From updown-baseline with MIT License
def _initialize_glove(self) -> torch.Tensor:
        r"""
        Initialize embeddings of all the tokens in a given
        :class:`~allennlp.data.vocabulary.Vocabulary` by their GloVe vectors.

        Extended Summary
        ----------------
        It is recommended to train an :class:`~updown.models.updown_captioner.UpDownCaptioner` with
        frozen word embeddings when one wishes to perform Constrained Beam Search decoding during
        inference. This is because the constraint words may not appear in caption vocabulary (out of
        domain), and their embeddings will never be updated during training. Initializing with frozen
        GloVe embeddings is helpful, because they capture more meaningful semantics than randomly
        initialized embeddings.

        Returns
        -------
        torch.Tensor
            GloVe Embeddings corresponding to tokens.
        """
        glove = GloVe(name="42B", dim=300)
        glove_vectors = torch.zeros(self._vocabulary.get_vocab_size(), 300)

        for word, i in self._vocabulary.get_token_to_index_vocabulary().items():
            if word in glove.stoi:
                glove_vectors[i] = glove.vectors[glove.stoi[word]]
            elif word != self._pad_index:
                # Initialize by random vector.
                glove_vectors[i] = 2 * torch.randn(300) - 1

        return glove_vectors 
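Following the docstring's recommendation to keep the word embeddings frozen, one plausible way to consume the returned tensor (a sketch, not code from updown-baseline) is nn.Embedding.from_pretrained with freeze=True.

import torch
import torch.nn as nn

# Assume glove_vectors is the (vocab_size, 300) tensor returned by _initialize_glove();
# a zero placeholder is used here so the snippet runs standalone.
glove_vectors = torch.zeros(10000, 300)
embedding = nn.Embedding.from_pretrained(glove_vectors, freeze=True)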
Example #6
Source File: torch_agent.py    From KBRD with MIT License
def _get_embtype(self, emb_type):
        # set up preinitialized embeddings
        try:
            import torchtext.vocab as vocab
        except ImportError as ex:
            print('Please install torchtext with `pip install torchtext`')
            raise ex
        pretrained_dim = 300
        if emb_type.startswith('glove'):
            if 'twitter' in emb_type:
                init = 'glove-twitter'
                name = 'twitter.27B'
                pretrained_dim = 200
            else:
                init = 'glove'
                name = '840B'
            embs = vocab.GloVe(
                name=name, dim=pretrained_dim,
                cache=modelzoo_path(self.opt.get('datapath'),
                                    'models:glove_vectors'))
        elif emb_type.startswith('fasttext_cc'):
            init = 'fasttext_cc'
            from parlai.zoo.fasttext_cc_vectors.build import download
            embs = download(self.opt.get('datapath'))
        elif emb_type.startswith('fasttext'):
            init = 'fasttext'
            from parlai.zoo.fasttext_vectors.build import download
            embs = download(self.opt.get('datapath'))
        else:
            raise RuntimeError('embedding type {} not implemented. check arg, '
                               'submit PR to this function, or override it.'
                               ''.format(emb_type))
        return embs, init 
Example #7
Source File: arora.py    From neural_chat with MIT License
def get_word_sims(self, sent, sent_emb, dictionary):
        """
        Given a sentence and its Arora-style sentence embedding, compute the cosine
        similarities to it, for all words in the dictionary.

        Inputs:
          sent: string. Used only for caching lookup purposes.
          sent_emb: torch Tensor shape (glove_dim).
          dictionary: ParlAI dictionary

        Returns:
          sims: torch Tensor shape (vocab_size), containing the cosine sims.
        """
        # If we haven't initialized the GloVe emb matrix yet, do so
        if self.emb_matrix is None:
            self.get_emb_matrix(dictionary)

        # If we have already computed sims for this sentence, return it
        if sent in self.cache_sent2sims:
            sims = self.cache_sent2sims[sent]
            return sims

        # Compute the cosine similarities. Implementation from here:
        #  https://codereview.stackexchange.com/questions/55717/efficient-numpy-cosine-distance-calculation
        dotted = self.emb_matrix.dot(sent_emb)  # shape (vocab_size)
        sent_emb_norm = np.linalg.norm(sent_emb)  # norm of the sent emb. scalar
        norms = np.multiply(self.emb_matrix_norm, sent_emb_norm)  # shape (vocab_size)
        sims = np.divide(dotted, norms)  # divide dot prods by norms. shape (vocab_size)
        sims = torch.tensor(sims)  # convert to torch Tensor, shape (vocab_size)

        # Cache sims in self.cache_sent2sims
        self.cache_sentqueue.append(sent)  # append sent to right
        self.cache_sent2sims[sent] = sims  # add (sent, sims) pair to cache
        if len(self.cache_sentqueue) > self.cache_limit:
            to_remove = self.cache_sentqueue.popleft()  # remove from left
            del self.cache_sent2sims[to_remove]  # remove from cache
        assert len(self.cache_sent2sims) == len(self.cache_sentqueue)
        assert len(self.cache_sent2sims) <= self.cache_limit

        return sims 
Example #8
Source File: arora.py    From neural_chat with MIT License
def get_emb_matrix(self, dictionary):
        """
        Construct an embedding matrix containing pretrained GloVe vectors for all words
        in dictionary, and store in self.emb_matrix. This is needed for
        response-relatedness weighted decoding.

        Inputs:
          dictionary: ParlAI dictionary
        """
        print(
            'Constructing GloVe emb matrix for response-relatedness weighted '
            'decoding...'
        )
        self.emb_matrix = []
        oov_indices = []  # list of dictionary indices for all OOV words
        for idx in range(len(dictionary)):
            word = dictionary[idx]
            if word in self.tt_embs.stoi:
                word_emb = self.tt_embs.vectors[self.tt_embs.stoi[word]]
            else:
                # If word is OOV, enter a zero vector instead.
                # This means that the cosine similarity will always be zero.
                word_emb = torch.zeros(self.glove_dim)
                oov_indices.append(idx)
            self.emb_matrix.append(word_emb)
        self.emb_matrix = np.stack(self.emb_matrix)  # (vocab_size, glove_dim)
        print(
            'Done constructing GloVe emb matrix; found %i OOVs of %i words'
            % (len(oov_indices), len(dictionary))
        )

        # Get the norm of each of the word vectors. This is needed for cosine sims.
        # self.emb_matrix_norm is a np array shape (vocab_size)
        self.emb_matrix_norm = np.linalg.norm(self.emb_matrix, axis=1)

        # For the OOV words which have zero vectors,
        # set the norm to 1.0 so we don't have divide-by-zero errors
        for idx in oov_indices:
            self.emb_matrix_norm[idx] = 1.0 
Example #9
Source File: arora.py    From neural_chat with MIT License
def get_glove_embs(self):
        """
        Loads torchtext GloVe embs from file and stores in self.tt_embs.
        """
        print('Loading torchtext GloVe embs (for Arora sentence embs)...')
        self.tt_embs = vocab.GloVe(
            name=self.glove_name, dim=self.glove_dim, cache=self.glove_cache
        )
        print('Finished loading torchtext GloVe embs') 
Example #10
Source File: arora.py    From neural_chat with MIT License
def __init__(
        self, word2prob, arora_a, glove_name, glove_dim, first_sv, glove_cache
    ):
        """
          Inputs:
            word2prob: dict mapping words to their unigram probs
            arora_a: a float. Is the constant (called "a" in the paper)
              used to compute Arora sentence embeddings.
            glove_name: the version of GloVe to use, e.g. '840B'
            glove_dim: the dimension of the GloVe embeddings to use, e.g. 300
            first_sv: np array shape (glove_dim). The first singular value,
              used to compute Arora sentence embeddings. Can be None.
            glove_cache: The path to where the glove vectors are stored.
        """
        self.word2prob = word2prob
        self.arora_a = arora_a
        self.glove_name = glove_name
        self.glove_dim = glove_dim
        self.glove_cache = glove_cache
        self.first_sv = first_sv
        if self.first_sv is not None:
            self.first_sv = torch.tensor(self.first_sv)  # convert to torch tensor

        self.min_word_prob = min(word2prob.values())  # prob of rarest word
        self.tt_embs = None  # will be torchtext.vocab.GloVe object
        self.emb_matrix = None  # will be np array shape (vocab_size, glove_dim)

        # Initialize a cache, which holds up to 64 sentences, along with their
        # corresponding word similarity scores (i.e. cosine sim for every word in the
        # vocab). This enables us to repeatedly retrieve sims for sentences we have
        # already processed (useful for batched beam search).
        self.cache_limit = 64
        self.cache_sent2sims = {}  # maps sent to sims. holds up to cache_limit.
        self.cache_sentqueue = deque()  # list of sents. add to right, remove from left 
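For reference, the Arora-style sentence embedding that arora_a and first_sv feed into (see embed_sent in the later arora.py examples) is essentially a smooth-inverse-frequency weighted average of the GloVe word vectors. A minimal sketch with placeholder inputs:

import torch

arora_a = 1e-3                                    # the constant "a" from the paper
word_embs = [torch.randn(300) for _ in range(4)]  # placeholder GloVe word vectors
unigram_probs = [0.01, 0.001, 0.0005, 0.02]       # placeholder unigram probabilities

# Weight each word vector by a / (a + p(w)) and average over the sentence;
# optionally the projection onto the first singular value is removed afterwards.
sent_emb = sum(w * (arora_a / (arora_a + p))
               for w, p in zip(word_embs, unigram_probs)) / len(word_embs)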
Example #11
Source File: arora.py    From ParlAI with MIT License
def get_word_sims(self, sent, sent_emb, dictionary):
        """
        Given a sentence and its Arora-style sentence embedding, compute the cosine
        similarities to it, for all words in the dictionary.

        Inputs:
          sent: string. Used only for caching lookup purposes.
          sent_emb: torch Tensor shape (glove_dim).
          dictionary: ParlAI dictionary

        Returns:
          sims: torch Tensor shape (vocab_size), containing the cosine sims.
        """
        # If we haven't initialized the GloVe emb matrix yet, do so
        if self.emb_matrix is None:
            self.get_emb_matrix(dictionary)

        # If we have already computed sims for this sentence, return it
        if sent in self.cache_sent2sims:
            sims = self.cache_sent2sims[sent]
            return sims

        # Compute the cosine similarities. Implementation from here:
        #  https://codereview.stackexchange.com/questions/55717/efficient-numpy-cosine-distance-calculation
        dotted = self.emb_matrix.dot(sent_emb)  # shape (vocab_size)
        sent_emb_norm = np.linalg.norm(sent_emb)  # norm of the sent emb. scalar
        norms = np.multiply(self.emb_matrix_norm, sent_emb_norm)  # shape (vocab_size)
        sims = np.divide(dotted, norms)  # divide dot prods by norms. shape (vocab_size)
        sims = torch.tensor(sims)  # convert to torch Tensor, shape (vocab_size)

        # Cache sims in self.cache_sent2sims
        self.cache_sentqueue.append(sent)  # append sent to right
        self.cache_sent2sims[sent] = sims  # add (sent, sims) pair to cache
        if len(self.cache_sentqueue) > self.cache_limit:
            to_remove = self.cache_sentqueue.popleft()  # remove from left
            del self.cache_sent2sims[to_remove]  # remove from cache
        assert len(self.cache_sent2sims) == len(self.cache_sentqueue)
        assert len(self.cache_sent2sims) <= self.cache_limit

        return sims 
Example #12
Source File: arora.py    From ParlAI with MIT License
def get_emb_matrix(self, dictionary):
        """
        Construct an embedding matrix containing pretrained GloVe vectors for all words
        in dictionary, and store in self.emb_matrix. This is needed for response-
        relatedness weighted decoding.

        Inputs:
          dictionary: ParlAI dictionary
        """
        print(
            'Constructing GloVe emb matrix for response-relatedness weighted '
            'decoding...'
        )
        self.emb_matrix = []
        oov_indices = []  # list of dictionary indices for all OOV words
        for idx in range(len(dictionary)):
            word = dictionary[idx]
            if word in self.tt_embs.stoi:
                word_emb = self.tt_embs.vectors[self.tt_embs.stoi[word]]
            else:
                # If word is OOV, enter a zero vector instead.
                # This means that the cosine similarity will always be zero.
                word_emb = torch.zeros(self.glove_dim)
                oov_indices.append(idx)
            self.emb_matrix.append(word_emb)
        self.emb_matrix = np.stack(self.emb_matrix)  # (vocab_size, glove_dim)
        print(
            'Done constructing GloVe emb matrix; found %i OOVs of %i words'
            % (len(oov_indices), len(dictionary))
        )

        # Get the norm of each of the word vectors. This is needed for cosine sims.
        # self.emb_matrix_norm is a np array shape (vocab_size)
        self.emb_matrix_norm = np.linalg.norm(self.emb_matrix, axis=1)

        # For the OOV words which have zero vectors,
        # set the norm to 1.0 so we don't have divide-by-zero errors
        for idx in oov_indices:
            self.emb_matrix_norm[idx] = 1.0 
Example #13
Source File: arora.py    From ParlAI with MIT License
def get_glove_embs(self):
        """
        Loads torchtext GloVe embs from file and stores in self.tt_embs.
        """
        if not hasattr(self, 'glove_cache'):
            self.glove_cache = modelzoo_path(self.data_path, 'models:glove_vectors')
        print('Loading torchtext GloVe embs (for Arora sentence embs)...')
        self.tt_embs = vocab.GloVe(
            name=self.glove_name, dim=self.glove_dim, cache=self.glove_cache
        )
        print('Finished loading torchtext GloVe embs') 
Example #14
Source File: arora.py    From ParlAI with MIT License
def __init__(self, word2prob, arora_a, glove_name, glove_dim, first_sv, data_path):
        """
          Inputs:
            word2prob: dict mapping words to their unigram probs
            arora_a: a float. Is the constant (called "a" in the paper)
              used to compute Arora sentence embeddings.
            glove_name: the version of GloVe to use, e.g. '840B'
            glove_dim: the dimension of the GloVe embeddings to use, e.g. 300
            first_sv: np array shape (glove_dim). The first singular value,
              used to compute Arora sentence embeddings. Can be None.
            data_path: The data path (we will use this to download glove)
        """
        self.word2prob = word2prob
        self.arora_a = arora_a
        self.glove_name = glove_name
        self.glove_dim = glove_dim
        self.first_sv = first_sv
        self.data_path = data_path
        if self.first_sv is not None:
            self.first_sv = torch.tensor(self.first_sv)  # convert to torch tensor

        self.min_word_prob = min(word2prob.values())  # prob of rarest word
        self.tt_embs = None  # will be torchtext.vocab.GloVe object
        self.emb_matrix = None  # will be np array shape (vocab_size, glove_dim)

        # Initialize a cache, which holds up to 64 sentences, along with their
        # corresponding word similarity scores (i.e. cosine sim for every word in the
        # vocab). This enables us to repeatedly retrieve sims for sentences we have
        # already processed (useful for batched beam search).
        self.cache_limit = 64
        self.cache_sent2sims = {}  # maps sent to sims. holds up to cache_limit.
        self.cache_sentqueue = deque()  # list of sents. add to right, remove from left 
Example #15
Source File: tc.py    From torchtest with GNU General Public License v3.0
def load_data(batch_size=32):
  # define a tokenizer
  # tokenize = lambda s : nltk.word_tokenize(s)
  tokenize = lambda s : s.split()
  # fields : ( text_field, label_field )
  print(':: creating fields')
  text_field = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
  #text_field  = data.Field(sequential=True, tokenize=tokenize, lower=True)
  label_field = data.LabelField(sequential=False)
  # get IMDB data
  print(':: fetching IMDB data')
  train_data, test_data = datasets.IMDB.splits(text_field, label_field) 
  # build vocabulary for fields
  text_field.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
  label_field.build_vocab(train_data)

  # split train into train and valid
  train_data, valid_data = train_data.split() 

  print(':: labels :', label_field.vocab.stoi)

  # iterators
  train_iter, test_iter, valid_iter = data.BucketIterator.splits( 
                  (train_data, test_data, valid_data), 
                  batch_size=batch_size, 
                  sort_key=lambda x : len(x.text),
                  repeat=False,
                  shuffle=True)

  return  ( (text_field, label_field), (train_iter, test_iter, valid_iter), 
      text_field.vocab.vectors, # GloVe vectors 
      len(text_field.vocab)
        ) 
Example #16
Source File: arora.py    From ParlAI with MIT License
def embed_sent(self, sent, rem_first_sv=True):
        """
        Produce an Arora-style sentence embedding for a given sentence.

        Inputs:
          sent: tokenized sentence; a list of strings
          rem_first_sv: If True, remove the first singular value when you compute the
            sentence embeddings. Otherwise, don't remove it.
        Returns:
          sent_emb: tensor length glove_dim, or None.
              If sent_emb is None, that's because all of the words were OOV for GloVe.
        """
        # If we haven't loaded the torchtext GloVe embeddings, do so
        if self.tt_embs is None:
            self.get_glove_embs()

        # Lookup glove embeddings for words
        tokens = [t for t in sent if t in self.tt_embs.stoi]  # in-vocab tokens
        # glove_oov_tokens = [t for t in sent if t not in self.tt_embs.stoi]
        # if len(glove_oov_tokens)>0:
        #     print("WARNING: tokens OOV for glove: ", glove_oov_tokens)
        if len(tokens) == 0:
            print(
                'WARNING: tried to embed utterance %s but all tokens are OOV for '
                'GloVe. Returning embedding=None' % sent
            )
            return None
        word_embs = [
            self.tt_embs.vectors[self.tt_embs.stoi[t]] for t in tokens
        ]  # list of torch Tensors shape (glove_dim)

        # Get unigram probabilities for the words. If we don't have a word in word2prob,
        # assume it's as rare as the rarest word in word2prob.
        unigram_probs = [
            self.word2prob[t] if t in self.word2prob else self.min_word_prob
            for t in tokens
        ]  # list of floats
        # word2prob_oov_tokens = [t for t in tokens if t not in self.word2prob]
        # if len(word2prob_oov_tokens)>0:
        #     print('WARNING: tokens OOV for word2prob, so assuming they are '
        #           'maximally rare: ', word2prob_oov_tokens)

        # Calculate the weighted average of the word embeddings
        smooth_inverse_freqs = [
            self.arora_a / (self.arora_a + p) for p in unigram_probs
        ]  # list of floats
        sent_emb = sum(
            [word_emb * wt for (word_emb, wt) in zip(word_embs, smooth_inverse_freqs)]
        ) / len(
            word_embs
        )  # torch Tensor shape (glove_dim)

        # Remove the first singular value from sent_emb
        if rem_first_sv:
            sent_emb = remove_first_sv(sent_emb, self.first_sv)

        return sent_emb 
Example #17
Source File: arora.py    From neural_chat with MIT License
def embed_sent(self, sent, rem_first_sv=True):
        """
        Produce an Arora-style sentence embedding for a given sentence.

        Inputs:
          sent: tokenized sentence; a list of strings
          rem_first_sv: If True, remove the first singular value when you compute the
            sentence embeddings. Otherwise, don't remove it.
        Returns:
          sent_emb: tensor length glove_dim, or None.
              If sent_emb is None, that's because all of the words were OOV for GloVe.
        """
        # If we haven't loaded the torchtext GloVe embeddings, do so
        if self.tt_embs is None:
            self.get_glove_embs()

        # Lookup glove embeddings for words
        tokens = [t for t in sent if t in self.tt_embs.stoi]  # in-vocab tokens
        # glove_oov_tokens = [t for t in sent if t not in self.tt_embs.stoi]
        # if len(glove_oov_tokens)>0:
        #     print("WARNING: tokens OOV for glove: ", glove_oov_tokens)
        if len(tokens) == 0:
            print(
                'WARNING: tried to embed utterance %s but all tokens are OOV for '
                'GloVe. Returning embedding=None' % sent
            )
            return None
        word_embs = [
            self.tt_embs.vectors[self.tt_embs.stoi[t]] for t in tokens
        ]  # list of torch Tensors shape (glove_dim)

        # Get unigram probabilities for the words. If we don't have a word in word2prob,
        # assume it's as rare as the rarest word in word2prob.
        unigram_probs = [
            self.word2prob[t] if t in self.word2prob else self.min_word_prob
            for t in tokens
        ]  # list of floats
        # word2prob_oov_tokens = [t for t in tokens if t not in self.word2prob]
        # if len(word2prob_oov_tokens)>0:
        #     print('WARNING: tokens OOV for word2prob, so assuming they are '
        #           'maximally rare: ', word2prob_oov_tokens)

        # Calculate the weighted average of the word embeddings
        smooth_inverse_freqs = [
            self.arora_a / (self.arora_a + p) for p in unigram_probs
        ]  # list of floats
        sent_emb = sum(
            [word_emb * wt for (word_emb, wt) in zip(word_embs, smooth_inverse_freqs)]
        ) / len(
            word_embs
        )  # torch Tensor shape (glove_dim)

        # Remove the first singular value from sent_emb
        if rem_first_sv:
            sent_emb = remove_first_sv(sent_emb, self.first_sv)

        return sent_emb 
Example #18
Source File: torch_agent_v1.py    From neural_chat with MIT License
def _get_embtype(self, emb_type):
        # set up preinitialized embeddings
        try:
            import torchtext.vocab as vocab
        except ImportError as ex:
            print('Please install torchtext with `pip install torchtext`')
            raise ex
        pretrained_dim = 300
        if emb_type.startswith('glove'):
            if 'twitter' in emb_type:
                init = 'glove-twitter'
                name = 'twitter.27B'
                pretrained_dim = 200
            else:
                init = 'glove'
                name = '840B'
            embs = vocab.GloVe(
                name=name,
                dim=pretrained_dim,
                cache=modelzoo_path(self.opt.get('datapath'), 'models:glove_vectors'),
            )
        elif emb_type.startswith('fasttext_cc'):
            init = 'fasttext_cc'
            embs = vocab.FastText(
                language='en',
                cache=modelzoo_path(
                    self.opt.get('datapath'), 'models:fasttext_cc_vectors'
                ),
            )
        elif emb_type.startswith('fasttext'):
            init = 'fasttext'
            embs = vocab.FastText(
                language='en',
                cache=modelzoo_path(
                    self.opt.get('datapath'), 'models:fasttext_vectors'
                ),
            )
        else:
            raise RuntimeError(
                'embedding type {} not implemented. check arg, '
                'submit PR to this function, or override it.'
                ''.format(emb_type)
            )
        return embs, init