Python nltk.collocations.BigramCollocationFinder.from_words() Examples

The following are 13 code examples of nltk.collocations.BigramCollocationFinder.from_words(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.collocations, or try the search function.
Example #1
Source File: text.py    From razzy-spinner with GNU General Public License v3.0 7 votes vote down vote up
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        The ranked bigrams are cached on the instance (``self._collocations``)
        and are only rebuilt when ``num`` or ``window_size`` differs from the
        values used on the previous call.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords
            # A set gives constant-time membership tests in the word filter
            # below; the raw stopword list would be scanned linearly for
            # every word that is checked.
            ignored_words = set(stopwords.words('english'))
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            # Filter on words shorter than three characters or (case-insensitively)
            # present in the English stopword set.
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Example #2
Source File: test_collocations.py    From razzy-spinner with GNU General Public License v3.0 6 votes vote down vote up
def test_bigram2(self):
        # Bigram finder with the default window over a sentence in which
        # every word appears twice in a row.
        tokens = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(tokens)

        expected_bigram_counts = [
            (('a', 'a'), 1),
            (('a', 'test'), 1),
            (('is', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'is'), 1),
            (('this', 'this'), 1),
        ]
        expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'a'), 1.0),
            (('a', 'test'), 1.0),
            (('is', 'a'), 1.0),
            (('is', 'is'), 1.0),
            (('test', 'test'), 1.0),
            (('this', 'is'), 1.0),
            (('this', 'this'), 1.0),
        ]

        #python 2.6 does not have assertItemsEqual or assertListEqual
        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts))
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))

        # Token count, total word frequency and total bigram frequency + 1
        # must all agree.
        word_total = sum(finder.word_fd.values())
        bigram_total = sum(finder.ngram_fd.values())
        self.assertTrue(len(tokens) == word_total == bigram_total + 1)

        scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(scored, sorted(expected_pmi)))
Example #3
Source File: test_collocations.py    From razzy-spinner with GNU General Public License v3.0 6 votes vote down vote up
def test_bigram3(self):
        # Same doubled-word sentence, with bigrams collected using window_size=3.
        tokens = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(tokens, window_size=3)

        expected_bigram_counts = [
            (('a', 'test'), 3),
            (('is', 'a'), 3),
            (('this', 'is'), 3),
            (('a', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'this'), 1),
        ]
        expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'test'), 1.584962500721156),
            (('is', 'a'), 1.584962500721156),
            (('this', 'is'), 1.584962500721156),
            (('a', 'a'), 0.0),
            (('is', 'is'), 0.0),
            (('test', 'test'), 0.0),
            (('this', 'this'), 0.0),
        ]

        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts))
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))

        # Relation between token count, word frequencies and bigram
        # frequencies asserted for this window size.
        word_total = sum(finder.word_fd.values())
        bigram_total = sum(finder.ngram_fd.values())
        self.assertTrue(len(tokens) == word_total == (bigram_total + 2 + 1) / 2.0)

        scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(scored, sorted(expected_pmi)))
Example #4
Source File: test_collocations.py    From razzy-spinner with GNU General Public License v3.0 6 votes vote down vote up
def test_bigram5(self):
        # Same doubled-word sentence, with bigrams collected using window_size=5.
        tokens = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(tokens, window_size=5)

        expected_bigram_counts = [
            (('a', 'test'), 4),
            (('is', 'a'), 4),
            (('this', 'is'), 4),
            (('is', 'test'), 3),
            (('this', 'a'), 3),
            (('a', 'a'), 1),
            (('is', 'is'), 1),
            (('test', 'test'), 1),
            (('this', 'this'), 1),
        ]
        expected_word_counts = [('a', 2), ('is', 2), ('test', 2), ('this', 2)]
        expected_pmi = [
            (('a', 'test'), 1.0),
            (('is', 'a'), 1.0),
            (('this', 'is'), 1.0),
            (('is', 'test'), 0.5849625007211562),
            (('this', 'a'), 0.5849625007211562),
            (('a', 'a'), -1.0),
            (('is', 'is'), -1.0),
            (('test', 'test'), -1.0),
            (('this', 'this'), -1.0),
        ]

        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_bigram_counts))
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_word_counts))

        # Relation between token count, word frequencies and bigram
        # frequencies asserted for this window size.
        word_total = sum(finder.word_fd.values())
        bigram_total = sum(finder.ngram_fd.values())
        self.assertTrue(len(tokens) == word_total == (bigram_total + 4 + 3 + 2 + 1) / 4.0)

        scored = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(scored, sorted(expected_pmi)))
Example #5
Source File: eval_utils.py    From tf-var-attention with MIT License 6 votes vote down vote up
def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity

    Args:
        corpus: tokenized list of sentences sampled. It is handed unchanged
            to ``BigramCollocationFinder.from_words`` and ``FreqDist`` and
            must support ``len()``.

    Returns:
        uni_diversity: distinct-1 score (distinct items / ``len(corpus)``)
        bi_diversity: distinct-2 score (distinct bigrams / finder's ``N``)

        Both scores are ``0.0`` for an empty corpus.

    """
    # Both ratios divide by a size derived from the corpus; an empty corpus
    # would otherwise raise ZeroDivisionError.
    if len(corpus) == 0:
        return 0.0, 0.0

    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
Example #6
Source File: text.py    From luscan-devel with GNU General Public License v2.0 6 votes vote down vote up
def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        The ranked bigrams are cached on the instance (``self._collocations``)
        and are only rebuilt when ``num`` or ``window_size`` differs from the
        values used on the previous call.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            # Single-argument parenthesised print produces the same output
            # under both the Python 2 print statement and the print function.
            print("Building collocations list")
            from nltk.corpus import stopwords
            # A set gives constant-time membership tests in the word filter
            # below; the raw stopword list would be scanned linearly for
            # every word that is checked.
            ignored_words = set(stopwords.words('english'))
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            # Filter on words shorter than three characters or (case-insensitively)
            # present in the English stopword set.
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Example #7
Source File: metric.py    From MultiTurnDialogZoo with MIT License 6 votes vote down vote up
def cal_Distinct(corpus):
    """
    Calculates unigram and bigram diversity
    Args:
        corpus: tokenized list of sentences sampled. It is handed unchanged
            to ``BigramCollocationFinder.from_words`` and ``FreqDist`` and
            must support ``len()``.
    Returns:
        uni_diversity: distinct-1 score (distinct items / ``len(corpus)``)
        bi_diversity: distinct-2 score (distinct bigrams / finder's ``N``)

        Both scores are ``0.0`` for an empty corpus.
    """
    # Both ratios divide by a size derived from the corpus; an empty corpus
    # would otherwise raise ZeroDivisionError.
    if len(corpus) == 0:
        return 0.0, 0.0

    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity
Example #8
Source File: load_samples.py    From yenlp with GNU General Public License v3.0 5 votes vote down vote up
def bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Find the best n bigrams of a text by means of a given measure.

    The text is tokenized, the ``n`` best bigrams according to ``score_fn``
    are selected, and a dict is returned that maps every token and every
    selected bigram to ``True``.
    '''
    words = tokenize(text)
    bigram_finder = BigramCollocationFinder.from_words(words)
    # Named differently from the enclosing function to avoid shadowing it.
    best_bigrams = bigram_finder.nbest(score_fn, n)
    return {ngram: True for ngram in itertools.chain(words, best_bigrams)}
Example #9
Source File: load_samples.py    From yenlp with GNU General Public License v3.0 5 votes vote down vote up
def stopword_filtered_bigrams(text, score_fn=BigramAssocMeasures.chi_sq, n=500):
    '''Drop English stopwords from the tokenized text, then pick the best bigrams.

    Returns a dict mapping every remaining token and each of the ``n`` best
    bigrams (ranked by ``score_fn``) to ``True``.
    '''
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = [token for token in tokenize(text) if token not in english_stopwords]
    top_pairs = BigramCollocationFinder.from_words(kept_tokens).nbest(score_fn, n)

    features = {}
    for feature in itertools.chain(kept_tokens, top_pairs):
        features[feature] = True
    return features
Example #10
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        The ranked bigrams are cached on the instance (``self._collocations``)
        and are only rebuilt when ``num`` or ``window_size`` differs from the
        values used on the previous call.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :return: Each collocation as a single string, its two words joined by a space.
        :rtype: list(str)
        """
        if not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            from nltk.corpus import stopwords

            # A set gives constant-time membership tests in the word filter
            # below; the raw stopword list would be scanned linearly for
            # every word that is checked.
            ignored_words = set(stopwords.words("english"))
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            # Filter on words shorter than three characters or (case-insensitively)
            # present in the English stopword set.
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        return [w1 + " " + w2 for w1, w2 in self._collocations]
Example #11
Source File: test_collocations.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def test_bigram2(self):
        # Bigram finder with the default window over a sentence in which
        # every word appears twice in a row.
        words = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(words)

        # Every expected bigram has the same count (1) and the same PMI (1.0),
        # so both expectations are derived from one list of pairs.
        pairs = [
            ('a', 'a'),
            ('a', 'test'),
            ('is', 'a'),
            ('is', 'is'),
            ('test', 'test'),
            ('this', 'is'),
            ('this', 'this'),
        ]

        # python 2.6 does not have assertItemsEqual or assertListEqual
        expected_counts = [(pair, 1) for pair in pairs]
        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))

        expected_words = [(word, 2) for word in ('a', 'is', 'test', 'this')]
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))

        word_total = sum(finder.word_fd.values())
        pair_total = sum(finder.ngram_fd.values())
        self.assertTrue(len(words) == word_total == pair_total + 1)

        expected_pmi = [(pair, 1.0) for pair in pairs]
        observed_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(observed_pmi, sorted(expected_pmi)))
Example #12
Source File: test_collocations.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def test_bigram3(self):
        # Same doubled-word sentence, with bigrams collected using window_size=3.
        words = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(words, window_size=3)

        # Expected count and PMI are uniform within each group: pairs of two
        # different words, and pairs made of one word repeated.
        mixed_pairs = [('a', 'test'), ('is', 'a'), ('this', 'is')]
        repeated_pairs = [('a', 'a'), ('is', 'is'), ('test', 'test'), ('this', 'this')]

        expected_counts = [(pair, 3) for pair in mixed_pairs]
        expected_counts += [(pair, 1) for pair in repeated_pairs]
        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))

        expected_words = [(word, 2) for word in ('a', 'is', 'test', 'this')]
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))

        word_total = sum(finder.word_fd.values())
        pair_total = sum(finder.ngram_fd.values())
        self.assertTrue(len(words) == word_total == (pair_total + 2 + 1) / 2.0)

        expected_pmi = [(pair, 1.584962500721156) for pair in mixed_pairs]
        expected_pmi += [(pair, 0.0) for pair in repeated_pairs]
        observed_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(observed_pmi, sorted(expected_pmi)))
Example #13
Source File: test_collocations.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def test_bigram5(self):
        # Same doubled-word sentence, with bigrams collected using window_size=5.
        words = 'this this is is a a test test'.split()
        finder = BigramCollocationFinder.from_words(words, window_size=5)

        # Expected count and PMI are uniform within each of these three groups.
        count4_pairs = [('a', 'test'), ('is', 'a'), ('this', 'is')]
        count3_pairs = [('is', 'test'), ('this', 'a')]
        repeated_pairs = [('a', 'a'), ('is', 'is'), ('test', 'test'), ('this', 'this')]

        expected_counts = [(pair, 4) for pair in count4_pairs]
        expected_counts += [(pair, 3) for pair in count3_pairs]
        expected_counts += [(pair, 1) for pair in repeated_pairs]
        self.assertEqual(sorted(finder.ngram_fd.items()), sorted(expected_counts))

        expected_words = [(word, 2) for word in ('a', 'is', 'test', 'this')]
        self.assertEqual(sorted(finder.word_fd.items()), sorted(expected_words))

        word_total = sum(finder.word_fd.values())
        pair_total = sum(finder.ngram_fd.values())
        self.assertTrue(len(words) == word_total == (pair_total + 4 + 3 + 2 + 1) / 4.0)

        expected_pmi = [(pair, 1.0) for pair in count4_pairs]
        expected_pmi += [(pair, 0.5849625007211562) for pair in count3_pairs]
        expected_pmi += [(pair, -1.0) for pair in repeated_pairs]
        observed_pmi = sorted(finder.score_ngrams(BigramAssocMeasures.pmi))
        self.assertTrue(close_enough(observed_pmi, sorted(expected_pmi)))