Python gensim.models.Phrases() Examples

The following are 8 code examples of gensim.models.Phrases(), drawn from open-source projects; each example lists its source file, project, and license. You may also want to check out the other available functions and classes in the gensim.models module.
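Before the project examples, here is a minimal, self-contained sketch of the typical Phrases workflow: train on tokenized sentences, then apply the model to join frequent word pairs. The sentences and parameter values below are illustrative assumptions, and the string delimiter default ('_') refers to recent gensim releases (older releases use bytes).

from gensim.models import Phrases

# Each training "sentence" is a list of tokens.
sentences = [
    ['new', 'york', 'is', 'a', 'big', 'city'],
    ['i', 'love', 'new', 'york'],
    ['new', 'york', 'has', 'many', 'museums'],
]

# Learn which adjacent token pairs co-occur often enough to count as phrases.
# min_count and threshold are set low here only so the toy corpus yields a phrase.
bigram = Phrases(sentences, min_count=1, threshold=1)

# Applying the model joins detected pairs with the delimiter.
print(bigram[['i', 'love', 'new', 'york']])
# e.g. ['i', 'love', 'new_york']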
Example #1
Source File: Word2VecFromParsedCorpus.py    From scattertext with Apache License 2.0
def add_phrases(self, corpus):
		'''
		Parameters
		----------
		corpus: Corpus for phrase augmentation

		Returns
		-------
		New ParsedCorpus containing unigrams in corpus and new phrases
		'''
		from gensim.models import Phrases

		assert isinstance(corpus, ParsedCorpus)
		self.phrases = [Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')]

		for i in range(1, self.max_tokens_per_phrase):
			self.phrases.append(Phrases(self.phrases[-1][CorpusAdapterForGensim.get_sentences(corpus)]))

		return self 
Example #2
Source File: phrases.py    From broca with MIT License
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Start from these defaults (lower max_vocab_size to use less memory;
    # gensim's default is 40m) and let caller-supplied kwargs override them.
    defaults = {
        'max_vocab_size': 40000000,
        'threshold': 8.
    }
    defaults.update(kwargs)

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **defaults)

    print('Saving...')
    bigram.save(out) 
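Because the trained model is persisted with bigram.save(out), it can be reloaded later and applied to new token streams. A small sketch, assuming the default output path above; the sentence is illustrative:

from gensim.models import Phrases

# Reload the phrase model saved by train_phrases() and apply it to new tokens.
bigram = Phrases.load('data/bigram_model.phrases')
tokens = ['the', 'supreme', 'court', 'issued', 'a', 'ruling']
print(bigram[tokens])  # e.g. ['the', 'supreme_court', 'issued', 'a', 'ruling']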
Example #3
Source File: Word2VecFromParsedCorpus.py    From scattertext with Apache License 2.0
def __init__(self, phrases, gram_size):
		'''
		Parameters
		----------
		phrases : gensim.models.Phrases
		gram_size : int, maximum number of words per phrase
		'''
		from gensim.models import Phrases

		assert type(phrases) == Phrases
		self.gram_size = gram_size
		self.phrases = phrases 
Example #4
Source File: Word2VecFromParsedCorpus.py    From scattertext with Apache License 2.0
def _scan_and_build_vocab(self):
		from gensim.models import Phrases
		bigram_transformer = Phrases(CorpusAdapterForGensim.get_sentences(self.corpus))
		try:
			self.model.scan_vocab(CorpusAdapterForGensim.get_sentences(self.corpus))
		except Exception:
			# scan_vocab may be unavailable or fail depending on the gensim version;
			# build_vocab below runs regardless.
			pass
		self.model.build_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)]) 
Example #5
Source File: train_vectors.py    From Blackstone with Apache License 2.0
def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes
    a file containing one sentence per line and writes the computed
    vectors in text format to the specified output path.
    """
    print(f"Processing {input_path}")
    sentences = LineSentence(input_path)
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(
        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
    )
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False) 
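When a trained Phrases model is only used for transformation, as in the example above, gensim also offers a frozen, lower-memory wrapper. A sketch assuming the Phraser class in gensim.models.phrases (gensim 3.x naming; 4.x exposes it as FrozenPhrases with a Phraser alias); the sentences and thresholds are illustrative:

from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Train on tokenized sentences (toy data for illustration), then freeze the model.
sentences = [['machine', 'learning', 'is', 'fun'], ['machine', 'learning', 'rocks']]
bigram_transformer = Phrases(sentences, min_count=1, threshold=1)
frozen = Phraser(bigram_transformer)  # drops the raw counts, keeps detected phrases

print(frozen[['machine', 'learning', 'is', 'fun']])
# e.g. ['machine_learning', 'is', 'fun']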
Example #6
Source File: overkill.py    From broca with MIT License
def tokenize(self, docs):
        # Only build a lemmatizer if requested; pass None through otherwise so
        # `lem` is always defined below.
        lem = WordNetLemmatizer() if self.lemmatize else None

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        # Debug output: print any tokens that start with 'one'.
        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]] 
Example #7
Source File: cf.py    From Seq2Seq_Upgrade_TensorFlow with Apache License 2.0
def quad_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    # Stack three phrase models so that up to four-word phrases can be merged.
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    print("performing tri gram")
    trigram = Phrases(list(bigram[tokenized_sentences_tokenized_words]), min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    print("performing quad gram")
    quadgram = Phrases(list(trigram[list(bigram[tokenized_sentences_tokenized_words])]), min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    quadgramprocessed = quadgram[list(trigram[list(bigram[tokenized_sentences_tokenized_words])])]
    return quadgramprocessed
Example #8
Source File: cf.py    From Seq2Seq_Upgrade_TensorFlow with Apache License 2.0
def bi_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words, min_count=minimum_count_for_vectorization, delimiter='_', threshold=10)
    bigramprocessed = bigram[tokenized_sentences_tokenized_words]
    return bigramprocessed