Python nltk.collocations() Examples

The following code examples show how to use the nltk.collocations module (for instance its BigramAssocMeasures and BigramCollocationFinder classes). They are taken from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: chowmein   Author: xiaohan2012   File: label_finder.py    MIT License 6 votes vote down vote up
def __init__(self, measure='pmi',
                 min_freq=10,
                 pos=None):
        """
        Store the association measure, frequency threshold and POS filter.

        measure: str
            the measurement method, 'pmi' or 'chi_sq'

        min_freq: int
            minimal frequency for the label to be considered

        pos: list of (str, str), optional
            the POS tag constraint; when omitted (None) it defaults to
            [('NN', 'NN'), ('JJ', 'NN')]

        Raises AssertionError when `measure` is not one of the two
        supported names.
        """
        # Kept as an assert (rather than a ValueError) so the exception type
        # seen by existing callers does not change; the message is new.
        assert measure in ('pmi', 'chi_sq'), \
            "measure must be 'pmi' or 'chi_sq', got %r" % (measure,)
        self.bigram_measures = nltk.collocations.BigramAssocMeasures()
        self._measure_method = measure

        self._min_freq = min_freq
        # A fresh list is built per instance: a list literal in the signature
        # would be shared by every instance that relies on the default, so
        # mutating one instance's _pos would leak into the others.
        self._pos = [('NN', 'NN'), ('JJ', 'NN')] if pos is None else pos
Example 2
Project: texttk   Author: fmpr   File: texttk.py    GNU General Public License v3.0 6 votes vote down vote up
def find_nbest_bigrams(self, corpus, n, metric, min_freq):
		"""
		Return the `n` best-scoring bigrams found in `corpus`.

		corpus: iterable of documents; each one is passed to a CustomTokenizer
			built on `self.base_tokenizer`.
		n: number of bigrams to return.
		metric: "pmi" or "chi_sq" (case-insensitive).
		min_freq: bigrams occurring fewer than this many times are discarded.

		Raises ValueError (a subclass of Exception) for an unknown metric,
		before any tokenization work is done.
		"""
		# Parenthesised single-argument form prints identically on Python 2 and 3.
		print("finding top-%d bigrams using %s..." % (n, metric))

		# Resolve the scoring function first so a bad metric fails fast.
		bigram_measures = nltk.collocations.BigramAssocMeasures()
		score_fns = {
			"pmi": bigram_measures.pmi,
			"chi_sq": bigram_measures.chi_sq,
		}
		score_fn = score_fns.get(metric.lower())
		if score_fn is None:
			raise ValueError("Unknown metric for bigram finder")

		# NOTE(review): identity callable plus match-all / match-empty patterns
		# presumably make this a plain pass-through tokenizer -- confirm against
		# CustomTokenizer's signature.
		simplerTokenizer = CustomTokenizer(self.base_tokenizer, lambda x: x, re.compile(".*"), re.compile("^$"))
		alltokens = []
		for doc in corpus:
			alltokens.extend(simplerTokenizer(doc))

		finder = BigramCollocationFinder.from_words(alltokens)
		finder.apply_freq_filter(min_freq)  # drop bigrams seen fewer than min_freq times
		return finder.nbest(score_fn, n)
Example 3
Project: Machine-Learning-Algorithm-for-Voice-Analysis   Author: Shahabks   File: ProbabilityLANGwithTEXTANALYSIS0test3-0.py    GNU General Public License v3.0 6 votes vote down vote up
def bigram_collocation_feats(
        self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi
    ):
        """
        Pick the `top_n` bigram features ranked by `assoc_measure`.

        Ranking relies on a bigram collocation (association) measure, not on
        raw bigram counts.

        :param documents: the input handed to
            `BigramCollocationFinder.from_documents`.
        :param top_n: how many of the best-scoring bigrams to keep; passed
            straight through to `nbest`.
        :param min_freq: frequency threshold passed to `apply_freq_filter`
            before scoring.
        :param assoc_measure: scoring function used to rank the bigrams
            (defaults to `BigramAssocMeasures.pmi`).

        :return: whatever `nbest(assoc_measure, top_n)` yields for the
            filtered bigrams.
        """
        collocations = BigramCollocationFinder.from_documents(documents)
        # Filtering happens before ranking so rare bigrams never get scored.
        collocations.apply_freq_filter(min_freq)
        best_bigrams = collocations.nbest(assoc_measure, top_n)
        return best_bigrams
Example 4
Project: DebateAnalysis   Author: Lingistic   File: ngrams.py    GNU General Public License v3.0 5 votes vote down vote up
def get_bigram_likelihood(statements, freq_filter=3, nbest=200):
    """
    Score the bi-grams of a group of documents by likelihood ratio.

    :param statements: list of strings
    :param freq_filter: minimum number of appearances for a bi-gram to be kept
    :param nbest: accepted for backward compatibility only; the returned
        list is NOT truncated to this size
    :return: the result of ``score_ngrams`` with the likelihood-ratio
        measure, covering every bi-gram that survives the filters below
    """
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('Generating word list...')

    # One tokenizer serves all statements; r'\w+' keeps only runs of word
    # characters, which drops punctuation and other non-words.
    tokenizer = RegexpTokenizer(r'\w+')
    words = []
    for statement in statements:
        words.extend(tokenizer.tokenize(statement))

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)

    # only bi-grams that appear freq_filter+ times
    bigram_finder.apply_freq_filter(freq_filter)

    # TODO: use custom stop words
    # Load the stop-word list once into a set: the filter runs per word, and
    # re-reading the corpus list on every call made it needlessly slow.
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    bigram_finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in stop_words
    )

    # The former unused `nbest(...)` call was dropped: it scored every bi-gram
    # a second time and its result was discarded.
    return bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)