Python nltk.util.ngrams() Examples

The following code examples show how to use nltk.util.ngrams(). They are drawn from open source Python projects.
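As a quick refresher before the examples: nltk.util.ngrams() takes any sequence of tokens and yields n-length tuples. A minimal sketch, using a made-up token list:

from nltk.util import ngrams

tokens = ['the', 'quick', 'brown', 'fox']
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]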

Example 1
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
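A usage sketch for the finder above, assuming the standard nltk.collocations entry points and a toy word list:

from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

words = 'the cat sat on the cat mat near the cat'.split()
finder = BigramCollocationFinder.from_words(words, window_size=2)
print(finder.nbest(BigramAssocMeasures.raw_freq, 3))  # most frequent bigrams first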
Example 2
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd) 
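The None checks in these finders exist because pad_right=True appends None placeholders so that every word can start a full window. A small sketch of that behavior:

from nltk.util import ngrams

print(list(ngrams(['a', 'b', 'c'], 3, pad_right=True)))
# [('a', 'b', 'c'), ('b', 'c', None), ('c', None, None)]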
Example 3
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
Example 4
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd) 
Example 5
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
Example 6
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd) 
Example 7
Project: Health-Checker   Author: KriAga   File: bleu_score.py   License: MIT License
def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between p_n and p_{n-1} will be the same as that between p_{n-1} and
        p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
        Models using Gradient Ascent. In NAACL.
        """
        # This smoothing only works when p_1 and p_2 are non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i + 1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n 
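In recent NLTK releases this smoothing is normally reached through SmoothingFunction rather than called directly; a sketch under that assumption, with made-up sentences:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = 'the cat is on the mat'.split()
hypothesis = 'the cat sat on the mat'.split()
print(sentence_bleu([reference], hypothesis,
                    smoothing_function=SmoothingFunction().method6))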
Example 8
Project: comparable-text-miner   Author: motazsaad   File: textpro.py   License: Apache License 2.0
def classify_text(text, classifier, certainity, g, unicodeFlag):
	#1. process text
	if unicodeFlag: text = text.decode('utf-8')
	word_list = process_text(text, removePunct=True, removeSW=False, removeNum=False)

	#2. generate ngrams
	mygrams = generate_ngrams(word_list, g)

	#3. generate features from ngrams
	feats = generate_features(mygrams)

	#4. classify
	probs = classifier.prob_classify(feats)
	label = probs.max()
	if probs.prob(label) >= certainity: return label, probs.prob(label)
	else: return 'none', probs.prob(label)

###################################################################################
# generates n-grams (g = num of grams)
# for example, if g=3, then the function will generate unigrams, bigrams, and trigrams from the text.
Example 9
Project: NMT-RDPG   Author: MultiPath   File: bleu.py   License: MIT License
def method6(self, p_n, references, hypothesis, hyp_len):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between p_n and p_{n-1} will be the same as that between p_{n-1} and p_{n-2}.
        """
        for i, p_i in enumerate(p_n):
            if i in [0, 1]:  # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i + 1))
                p_n[i] = (p_i + self.alpha * pi0) / (l + self.alpha)
        return p_n 
Example 10
Project: english-article-correction   Author: yaricom   File: nltk_ngram.py   License: MIT License
def train_counts(self, training_text):
        # Note here "1" indicates an empty vocabulary!
        # See NgramModelVocabulary __len__ method for more.
        if len(self.vocabulary) <= 1:
            raise EmptyVocabularyError("Cannot start counting ngrams until "
                                       "vocabulary contains more than one item.")

        for sent in training_text:
            checked_sent = (self.check_against_vocab(word) for word in sent)
            sent_start = True
            for ngram in self.to_ngrams(checked_sent):
                context, word = tuple(ngram[:-1]), ngram[-1]

                if sent_start:
                    for context_word in context:
                        self.unigrams[context_word] += 1
                    sent_start = False

                for trunc_index, ngram_order in self._enumerate_ngram_orders():
                    trunc_context = context[trunc_index:]
                    # note that above line doesn't affect context on first iteration
                    self.ngrams[ngram_order][trunc_context][word] += 1
                self.unigrams[word] += 1 
Example 11
Project: align-linguistic-alignment   Author: nickduran   File: calculate_alignment.py   License: MIT License
def ngram_lexical(sequence1,sequence2,ngramsize=2):
    """
    Create ngrams of the desired size for each of two
    interlocutors' sequences and return a dictionary
    of counts of ngrams for each sequence.

    By default, consider bigrams. If desired, this may be
    changed by setting `ngramsize` to the appropriate
    value.
    """

    # generate ngrams
    sequence1 = list(ngrams(sequence1,ngramsize))
    sequence2 = list(ngrams(sequence2,ngramsize))

    # join for counters
    new_sequence1 = [' '.join(pair) for pair in sequence1]
    new_sequence2 = [' '.join(pair) for pair in sequence2]

    # return counters
    return Counter(new_sequence1), Counter(new_sequence2) 
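A usage sketch, assuming the function above is in scope together with its Counter and ngrams imports:

from collections import Counter
from nltk.util import ngrams

c1, c2 = ngram_lexical('i like strong tea'.split(), 'i like iced coffee'.split())
print(c1)  # Counter({'i like': 1, 'like strong': 1, 'strong tea': 1})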
Example 12
Project: NQG   Author: magic282   File: nltk_bleu_score.py   License: GNU General Public License v3.0
def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between p_n and p_{n-1} will be the same as that between p_{n-1} and
        p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
        Models using Gradient Ascent. In NAACL.
        """
        # This smoothing only works when p_1 and p_2 are non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0,1]: # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i+1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n 
Example 13
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
Example 14
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd) 
Example 15
Project: seq2seq-keyphrase-pytorch   Author: memray   File: bleu_score(3.2).py   License: Apache License 2.0
def method6(self, p_n, references, hypothesis, hyp_len, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between p_n and p_{n-1} will be the same as that between p_{n-1} and
        p_{n-2}; from Gao and He (2013) Training MRF-Based Phrase Translation
        Models using Gradient Ascent. In NAACL.
        """
        # This smoothing only works when p_1 and p_2 are non-zero.
        # Raise an error with an appropriate message when the input is too short
        # to use this smoothing technique.
        assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
        for i, p_i in enumerate(p_n):
            if i in [0,1]: # Skips the first 2 orders of ngrams.
                continue
            else:
                pi0 = 0 if p_n[i-2] == 0 else p_n[i-1]**2 / p_n[i-2]
                # No. of ngrams in translation that matches the reference.
                m = p_i.numerator
                # No. of ngrams in translation.
                l = sum(1 for _ in ngrams(hypothesis, i+1))
                # Calculates the interpolated precision.
                p_n[i] = (m + self.alpha * pi0) / (l + self.alpha)
        return p_n 
Example 16
Project: GCN-SeA   Author: sumanbanerjee1   File: nlp.py   License: Apache License 2.0
def sentence_bleu_4(hyp, refs, weights=[0.25, 0.25, 0.25, 0.25]):
    # input : single sentence, multiple references
    count = [0, 0, 0, 0]
    clip_count = [0, 0, 0, 0]
    r = 0
    c = 0

    for i in range(4):
        hypcnts = Counter(ngrams(hyp, i + 1))
        cnt = sum(hypcnts.values())
        count[i] += cnt

        # compute clipped counts
        max_counts = {}
        for ref in refs:
            refcnts = Counter(ngrams(ref, i + 1))
            for ng in hypcnts:
                max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
        clipcnt = dict((ng, min(count, max_counts[ng])) \
                       for ng, count in hypcnts.items())
        clip_count[i] += sum(clipcnt.values())

    bestmatch = [1000, 1000]
    for ref in refs:
        if bestmatch[0] == 0:
            break
        diff = abs(len(ref) - len(hyp))
        if diff < bestmatch[0]:
            bestmatch[0] = diff
            bestmatch[1] = len(ref)
    r = bestmatch[1]
    c = len(hyp)

    p0 = 1e-7
    bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))

    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    bleu_hyp = bp * math.exp(s)

    return bleu_hyp 
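A quick call of the scorer above with made-up sentences; it assumes the function and its math, Counter, and ngrams imports are in scope:

hyp = 'the cat sat on the mat'.split()
refs = ['the cat is on the mat'.split(), 'there is a cat on the mat'.split()]
print(sentence_bleu_4(hyp, refs))  # a float in (0, 1]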
Example 17
Project: Pesquisas   Author: danilopcarlotti   File: statistical_analysis.py   License: Apache License 2.0
def ngramFreq(self,text,n):
		''' Returns a Counter object with the frequency of each ngram in the text '''
		token = nltk.word_tokenize(text)
		Ngram = ngrams(token,n)
		return Counter(Ngram) 
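Because ngramFreq is a method, the sketch below inlines its body; it assumes nltk plus the punkt tokenizer data are installed:

import nltk
from collections import Counter
from nltk.util import ngrams

tokens = nltk.word_tokenize('to be or not to be')
print(Counter(ngrams(tokens, 2)).most_common(1))
# [(('to', 'be'), 2)]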
Example 18
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram 
Example 19
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq) 
Example 20
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng)) 
Example 21
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) 
Example 22
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]] 
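A sketch chaining the filters above with nbest, assuming a finder built as in Example 1 and a toy stop-word set:

from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

words = 'the big cat and the big dog and the big cat'.split()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)                              # drop bigrams seen fewer than twice
finder.apply_word_filter(lambda w: w in {'the', 'and'})  # drop bigrams containing stop words
print(finder.nbest(BigramAssocMeasures.raw_freq, 5))     # e.g. [('big', 'cat')]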
Example 23
Project: razzy-spinner   Author: rafasashi   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=4):
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) 
Example 24
Project: graph-keyword-search   Author: usc-isi-i2   File: ngramsEngine.py   License: Apache License 2.0
def generateNGrams(self,query):

		# This stores the n-grams as generated by NLTK
		ngramsNLTKList = []
		
		# Get the initial n-gram list built
		for n in range(len(query),0,-1):
			ngramsNLTKList.extend(ngrams(query.split(),n))

		# Actual n-gram list (List 1 as in the description)
		ngramList = []

		# A look up list (List 2 as in the description)
		lookupList = []
		
		# Join the individual lists to get the n-grams
		for ngram in ngramsNLTKList:
			ngramList.append((' '.join(ngram)).strip())

		# Determine the length of the lookupList required
		if(len(ngramsNLTKList)>0):
			maxLength = len(ngramsNLTKList[0])
			for i in range(maxLength):
				lookupList.append([])

		# Fill in the lookupList
		# All n-grams of length 1 go in index 0
		# All n-grams of length 2 go in index 1
		# All n-grams of length 3 go in index 2 ...
		for token in ngramsNLTKList:
			joinedToken = ' '.join(token).strip()
			currentList = lookupList[len(token)-1]
			currentList.append(joinedToken)

		return ngramList,lookupList 
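A sketch of the descending n-gram expansion the method builds, using nltk.util.ngrams directly on an assumed three-word query:

from nltk.util import ngrams

tokens = 'new york city'.split()
ngramList = []
for n in range(len(tokens), 0, -1):
    ngramList.extend(' '.join(g) for g in ngrams(tokens, n))
print(ngramList)
# ['new york city', 'new york', 'york city', 'new', 'york', 'city']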
Example 25
Project: ConvLab   Author: ConvLab   File: nlp.py   License: MIT License
def sentence_bleu_4(hyp, refs, weights=[0.25, 0.25, 0.25, 0.25]):
    # input : single sentence, multiple references
    count = [0, 0, 0, 0]
    clip_count = [0, 0, 0, 0]
    r = 0
    c = 0

    for i in range(4):
        hypcnts = Counter(ngrams(hyp, i + 1))
        cnt = sum(hypcnts.values())
        count[i] += cnt

        # compute clipped counts
        max_counts = {}
        for ref in refs:
            refcnts = Counter(ngrams(ref, i + 1))
            for ng in hypcnts:
                max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
        clipcnt = dict((ng, min(count, max_counts[ng])) \
                       for ng, count in hypcnts.items())
        clip_count[i] += sum(clipcnt.values())

    bestmatch = [1000, 1000]
    for ref in refs:
        if bestmatch[0] == 0:
            break
        diff = abs(len(ref) - len(hyp))
        if diff < bestmatch[0]:
            bestmatch[0] = diff
            bestmatch[1] = len(ref)
    r = bestmatch[1]
    c = len(hyp)

    p0 = 1e-7
    bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))

    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    bleu_hyp = bp * math.exp(s)

    return bleu_hyp 
Example 26
Project: twitter_analysis   Author: urmilkadakia   File: analysis_methods.py   License: Apache License 2.0
def _get_ngram_list(text, n=1, alpha_numeric_flag=False, stop_words_flag=False):
    """
    Returns a list of ngrams for the input text for the specified ngram size
    :param text: input text
    :param n: n represents the n in n-gram, which is a contiguous sequence of n items. The default value is 1,
              which represents unigrams.
    :param alpha_numeric_flag: filter out all non-alphanumeric words. Default is false.
    :param stop_words_flag: filter out all stop words. Default is false.
    :return: a list of ngrams
    """

    # Get rid of punctuation (except periods!)
    punctuation_no_period = "[" + re.sub(r"\.", "", string.punctuation) + "]"
    text = re.sub(punctuation_no_period, "", text.lower())

    # Splits the sentences into words
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tknzr.tokenize(text)

    # remove remaining tokens that are not alphabetic
    # Problem !!! also removes emoji joiners and similar tokens
    if alpha_numeric_flag:
        tokens = [token for token in tokens if token.isalnum()]

    # filter out stop words
    if stop_words_flag:
        tokens = [w for w in tokens if w not in STOP_WORDS]

    ngram_list = list(ngrams(tokens, n))
    return ngram_list 
Example 27
Project: twitter_analysis   Author: urmilkadakia   File: analysis_methods.py   License: Apache License 2.0
def ngram_histogram(input_file, output_file, n=1, cutoff_freq=5, alpha_numeric_flag=0, stop_words_flag=0):
    """
    Plots and stores a histogram of the specified ngrams and their frequencies, for the ngrams whose
    frequency is greater than cutoff_freq
    :param input_file: Path to input file
    :param output_file: Path to output file
    :param n: n represents the n in n-gram, which is a contiguous sequence of n items. The default value is 1,
              which represents unigrams.
    :param cutoff_freq: Ngrams with a frequency lower than the cutoff frequency will not be included in the
                        output file. The default value is 5.
    :param alpha_numeric_flag: filter all non alpha numeric words. Default is false.
    :param stop_words_flag: filter all stop words. Default is false.
    """
    ngram_freq = count_ngram_frequency(input_file, n, alpha_numeric_flag, stop_words_flag)
    ngram_freq = ngram_freq.most_common()

    xdata = []
    ydata = []

    for x, y in ngram_freq:
        if y < cutoff_freq:
            break

        # if not any(elem in x for elem in stop_words):
        # Checking the ngram is unigram or not
        if n == 1:
            xdata.append(x[0])
        else:
            xdata.append(str(x))
        ydata.append(y)

    # Plotting the ngrams of the given file
    plt.bar(xdata, ydata)
    plt.xlabel('Ngrams', fontsize=12)
    plt.xticks(xdata, xdata, rotation=80)
    plt.ylabel('Frequency', fontsize=12)
    plt.title('Ngram frequency distribution ', fontsize=14)
    plt.gcf().subplots_adjust(bottom=0.45)
    plt.savefig(output_file) 
Example 28
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram 
Example 29
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq) 
Example 30
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng)) 
Example 31
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) 
Example 32
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]] 
Example 33
Project: OpenBottle   Author: xiaozhuchacha   File: collocations.py   License: MIT License
def from_words(cls, words, window_size=4):
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) 
Example 34
Project: OpenBottle   Author: xiaozhuchacha   File: ribes_score.py   License: MIT License
def position_of_ngram(ngram, sentence):
    """
    This function returns the position of the first instance of the ngram 
    appearing in a sentence.
    
    Note that one could also use strings as follows, but the code is a little
    convoluted with type casting back and forth:
        
        char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
        word_pos = char_pos.count(' ')
        
    Another way to conceive this is:
    
        return next(i for i, ng in enumerate(ngrams(sentence, len(ngram))) 
                    if ng == ngram)
                    
    :param ngram: The ngram that needs to be searched
    :type ngram: tuple
    :param sentence: The list of tokens to search from.
    :type sentence: list(str)
    """
    # Iterates through the ngrams in sentence.
    for i,sublist in enumerate(ngrams(sentence, len(ngram))):
        # Returns the index of the word when ngram matches.
        if ngram == sublist:
            return i 
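A quick check of the function above with a toy sentence, assuming its ngrams import is in scope:

sentence = 'the quick brown fox jumps'.split()
print(position_of_ngram(('brown', 'fox'), sentence))  # 2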
Example 35
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram 
Example 36
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq) 
Example 37
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng)) 
Example 38
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) 
Example 39
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]] 
Example 40
Project: Health-Checker   Author: KriAga   File: collocations.py   License: MIT License
def from_words(cls, words, window_size=4):
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) 
Example 41
Project: Health-Checker   Author: KriAga   File: ribes_score.py   License: MIT License
def position_of_ngram(ngram, sentence):
    """
    This function returns the position of the first instance of the ngram 
    appearing in a sentence.
    
    Note that one could also use strings as follows, but the code is a little
    convoluted with type casting back and forth:
        
        char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))]
        word_pos = char_pos.count(' ')
        
    Another way to conceive this is:
    
        return next(i for i, ng in enumerate(ngrams(sentence, len(ngram))) 
                    if ng == ngram)
                    
    :param ngram: The ngram that needs to be searched
    :type ngram: tuple
    :param sentence: The list of tokens to search from.
    :type sentence: list(str)
    """
    # Iterates through the ngrams in sentence.
    for i,sublist in enumerate(ngrams(sentence, len(ngram))):
        # Returns the index of the word when ngram matches.
        if ngram == sublist:
            return i 
Example 42
Project: l3wtransformer   Author: GeorgesAlkhouri   File: l3wtransformer.py   License: MIT License
def word_to_ngrams(self, word):
        """Returns a list of all n-gram possibilities of the given word."""
        if self.lower:
            word = word.lower()
        word = '<' + word + '>'
        return list(map(lambda x: ''.join(x), list(ngrams(word, self.ngram_size)))) 
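The core idea, sketched standalone with an assumed ngram_size of 3 and the same '<'/'>' boundary markers:

from nltk.util import ngrams

word = '<' + 'cat' + '>'
print([''.join(g) for g in ngrams(word, 3)])
# ['<ca', 'cat', 'at>']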
Example 43
Project: comparable-text-miner   Author: motazsaad   File: textpro.py   License: Apache License 2.0
def generate_ngrams(word_list, g):
	mygrams = []
	unigrams = [word for word in word_list]
	mygrams += unigrams
	for i in range(2,g+1): mygrams += ngrams(word_list, i)
	return mygrams
###################################################################################

# generate n-gram features in the form (n-gram, True), i.e., binary feature. In other words, the n-gram exists 
Example 44
Project: fc-aaai18   Author: thanhan   File: transforms.py   License: MIT License
def _get_bigram_clusters(s, bc_data):
    clusters = filter(None, [bc_data.get(l) for l in get_tokenized_lemmas(s)])
    return ngrams(clusters, 2) 
Example 45
Project: english-article-correction   Author: yaricom   File: nltk_ngram.py   License: MIT License
def __init__(self, order, vocabulary, unk_cutoff=None, unk_label="<UNK>", **ngrams_kwargs):
        """
        :type training_text: List[List[str]]
        """

        if order < 1:
            message = "Order of NgramCounter cannot be less than 1. Got: {0}"
            raise ValueError(message.format(order))

        self.order = order
        self.unk_label = unk_label

        # Preset some common defaults...
        self.ngrams_kwargs = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<s>",
            "right_pad_symbol": "</s>"
        }
        # While allowing whatever the user passes to override them
        self.ngrams_kwargs.update(ngrams_kwargs)
        # Set up the vocabulary
        self._set_up_vocabulary(vocabulary, unk_cutoff)

        self.ngrams = defaultdict(ConditionalFreqDist)
        self.unigrams = FreqDist() 
Example 46
Project: english-article-correction   Author: yaricom   File: nltk_ngram.py   License: MIT License
def to_ngrams(self, sequence):
        """Wrapper around util.ngrams with usefull options saved during initialization.
        :param sequence: same as nltk.util.ngrams
        :type sequence: any iterable
        """
        return ngrams(sequence, self.order, **self.ngrams_kwargs) 
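For context, a sketch of what the saved defaults do once they reach nltk.util.ngrams:

from nltk.util import ngrams

print(list(ngrams(['hello', 'world'], 2, pad_left=True, pad_right=True,
                  left_pad_symbol='<s>', right_pad_symbol='</s>')))
# [('<s>', 'hello'), ('hello', 'world'), ('world', '</s>')]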
Example 47
Project: english-article-correction   Author: yaricom   File: nltk_ngram.py   License: MIT License
def __init__(self, ngram_counter):

        self.ngram_counter = ngram_counter
        # for convenient access save top-most ngram order ConditionalFreqDist
        self.ngrams = ngram_counter.ngrams[ngram_counter.order]
        self._ngrams = ngram_counter.ngrams
        self._order = ngram_counter.order

        self._check_against_vocab = self.ngram_counter.check_against_vocab 
Example 48
Project: english-article-correction   Author: yaricom   File: nltk_ngram.py   License: MIT License
def score(self, word, context):
        """Returns the MLE score for a word given a context.
        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        context = self.check_context(context)
        return self.ngrams[context].freq(word) 
Example 49
Project: align-linguistic-alignment   Author: nickduran   File: calculate_alignment.py   License: MIT License
def ngram_pos(sequence1,sequence2,ngramsize=2,
                   ignore_duplicates=True):
    """
    Remove mimicked lexical sequences from two interlocutors'
    sequences and return a dictionary of counts of ngrams
    of the desired size for each sequence.

    By default, consider bigrams. If desired, this may be
    changed by setting `ngramsize` to the appropriate
    value.

    By default, ignore duplicate lexical n-grams when
    processing these sequences. If desired, this may
    be changed with `ignore_duplicates=False`.
    """

    # remove duplicates and recreate sequences
    sequence1 = set(ngrams(sequence1,ngramsize))
    sequence2 = set(ngrams(sequence2,ngramsize))

    # if desired, remove duplicates from sequences
    if ignore_duplicates:
        new_sequence1 = [tuple([''.join(pair[1]) for pair in tup]) for tup in list(sequence1 - sequence2)]
        new_sequence2 = [tuple([''.join(pair[1]) for pair in tup]) for tup in list(sequence2 - sequence1)]
    else:
        new_sequence1 = [tuple([''.join(pair[1]) for pair in tup]) for tup in sequence1]
        new_sequence2 = [tuple([''.join(pair[1]) for pair in tup]) for tup in sequence2]

    # return counters
    return Counter(new_sequence1), Counter(new_sequence2) 
Example 50
Project: essay-rater-web   Author: alexandrablooky   File: essay_feature.py   License: MIT License
def word_grams(self, words, min=1, max=5):
        """
        Compute n-grams of the input words from min-grams to (max-1)-grams
        :param words: A list of words in sequence from a sentence
        :param min: the minimum n-gram size to be calculated
        :param max: the maximum n-gram size to be calculated is max - 1
        :return: a list of n-grams from min-grams to (max-1)-grams
        """
        s = []
        for n in range(min, max):
            for ngram in ngrams(words, n):
                s.append(' '.join(str(i) for i in ngram))
        return s

    # def __essay2paragraph(self, essay):
    #     paragraphs = essay.split('\r\n')
    #     new_paragraphs = []
    #     for paragraph in paragraphs:
    #         # essay to sentences with punctuations removed and abbreviations recovered
    #         paragraph = paragraph.replace('\t', ' ')
    #         paragraph = paragraph.replace('\\s+', ' ')
    #         paragraph = self.__recoverAbbreviation(paragraph)
    #         paragraph2words = word_tokenize(paragraph, preserve_line=True)
    #         if len(paragraph2words) >= 5:
    #             new_paragraphs.append(paragraph2words)

    #     return new_paragraphs 
Example 51
Project: essay-rater-web   Author: alexandrablooky   File: sent_chooser.py   License: MIT License
def getWordNgram(self, words):
        word_ngram = []
        for n in range(1,5):
            for ngram in ngrams(words,n):
                ngram_list = [gram.lower() for gram in list(ngram)]
                # time_logger.info('ii:' + str(ngram_list))
                word_ngram.append(' '.join(ngram_list))
        return word_ngram 
Example 52
Project: multiwoz   Author: budzianowski   File: nlp.py   License: MIT License
def sentence_bleu_4(hyp, refs, weights=[0.25, 0.25, 0.25, 0.25]):
    # input : single sentence, multiple references
    count = [0, 0, 0, 0]
    clip_count = [0, 0, 0, 0]
    r = 0
    c = 0

    for i in range(4):
        hypcnts = Counter(ngrams(hyp, i + 1))
        cnt = sum(hypcnts.values())
        count[i] += cnt

        # compute clipped counts
        max_counts = {}
        for ref in refs:
            refcnts = Counter(ngrams(ref, i + 1))
            for ng in hypcnts:
                max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
        clipcnt = dict((ng, min(count, max_counts[ng])) \
                       for ng, count in hypcnts.items())
        clip_count[i] += sum(clipcnt.values())

    bestmatch = [1000, 1000]
    for ref in refs:
        if bestmatch[0] == 0:
            break
        diff = abs(len(ref) - len(hyp))
        if diff < bestmatch[0]:
            bestmatch[0] = diff
            bestmatch[1] = len(ref)
    r = bestmatch[1]
    c = len(hyp)

    p0 = 1e-7
    bp = math.exp(-abs(1.0 - float(r) / float(c + p0)))

    p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 for i in range(4)]
    s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns) if p_n)
    bleu_hyp = bp * math.exp(s)

    return bleu_hyp 
Example 53
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram 
Example 54
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def apply_freq_filter(self, min_freq):
        """Removes candidate ngrams which have frequency less than min_freq."""
        self._apply_filter(lambda ng, freq: freq < min_freq) 
Example 55
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def apply_ngram_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
        evaluates to True.
        """
        self._apply_filter(lambda ng, f: fn(*ng)) 
Example 56
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def apply_word_filter(self, fn):
        """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
        ...) evaluates to True.
        """
        self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) 
Example 57
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def nbest(self, score_fn, n):
        """Returns the top n ngrams when scored by the given function."""
        return [p for p, s in self.score_ngrams(score_fn)[:n]] 
Example 58
Project: FancyWord   Author: EastonLee   File: collocations.py   License: GNU General Public License v3.0
def from_words(cls, words, window_size=4):
        if window_size < 4:
            raise ValueError("Specify window_size at least 4")
        ixxx = FreqDist()
        iiii = FreqDist()
        ii = FreqDist()
        iii = FreqDist()
        ixi = FreqDist()
        ixxi = FreqDist()
        iixi = FreqDist()
        ixii = FreqDist()

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            for w2, w3, w4 in _itertools.combinations(window[1:], 3):
                ixxx[w1] += 1
                if w2 is None:
                    continue
                ii[(w1, w2)] += 1
                if w3 is None:
                    continue
                iii[(w1, w2, w3)] += 1
                ixi[(w1, w3)] += 1
                if w4 is None:
                    continue
                iiii[(w1, w2, w3, w4)] += 1
                ixxi[(w1, w4)] += 1
                ixii[(w1, w3, w4)] += 1
                iixi[(w1, w2, w4)] += 1

        return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) 
Example 59
Project: werika   Author: pywirrarika   File: confgen.py   License: GNU General Public License v3.0
def eval_text(lm, text):
    tokens = word_tokenize(text)
    chain = []
    for word in tokens:
        w = list(word.lower())

        # Divide the word into character 4-grams, then split each into 2-character chunks
        bgs = ngrams(w, 4)
        w = []
        for bg in bgs:
            w.append(bg[0]+bg[1])
            w.append(bg[2]+bg[3])

        p = 0
        i = 0
        for i in range(len(w)-1):
            p = float(p) + eval_pair(lm, (w[i], w[i+1]))
        p = float(p) / float(i+1)

        # If the word exists in the common word list, then p = 0.2
        # 0.2 is an arbitrary value
        if word in common:
            p = 0.2

        chain.append((p, word))
    return chain 
Example 60
Project: GCN-SeA   Author: sumanbanerjee1   File: nlp.py   License: Apache License 2.0
def score(self, hypothesis, corpus, n=1):
        # containers
        count = [0, 0, 0, 0]
        clip_count = [0, 0, 0, 0]
        r = 0
        c = 0
        weights = [0.25, 0.25, 0.25, 0.25]

        # accumulate ngram statistics
        for hyps, refs in zip(hypothesis, corpus):
            if type(hyps[0]) is list:
                hyps = [hyp.split() for hyp in hyps[0]]
            else:
                hyps = [hyp.split() for hyp in hyps]

            refs = [ref.split() for ref in refs]

            # Shawn's evaluation
            refs[0] = [u'GO_'] + refs[0] + [u'EOS_']
            hyps[0] = [u'GO_'] + hyps[0] + [u'EOS_']

            for idx, hyp in enumerate(hyps):
                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i + 1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i + 1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                                   for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000, 1000]
                for ref in refs:
                    if bestmatch[0] == 0: break
                    diff = abs(len(ref) - len(hyp))
                    if diff < bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)
                if n == 1:
                    break
        # computing bleu score
        p0 = 1e-7
        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
                for i in range(4)]
        s = math.fsum(w * math.log(p_n) \
                      for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp * math.exp(s)
        return bleu 
Example 61
Project: SEDST   Author: AuCson   File: metric.py   License: MIT License
def score(self, parallel_corpus):
        # containers
        count = [0, 0, 0, 0]
        clip_count = [0, 0, 0, 0]
        r = 0
        c = 0
        weights = [0.25, 0.25, 0.25, 0.25]

        # accumulate ngram statistics
        for hyps, refs in parallel_corpus:
            hyps = [hyp.split() for hyp in hyps]
            refs = [ref.split() for ref in refs]
            for hyp in hyps:

                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i + 1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i + 1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                                   for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000, 1000]
                for ref in refs:
                    if bestmatch[0] == 0: break
                    diff = abs(len(ref) - len(hyp))
                    if diff < bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)

        # computing bleu score
        p0 = 1e-7
        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
                for i in range(4)]
        s = math.fsum(w * math.log(p_n) \
                      for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp * math.exp(s)
        return bleu 
Example 62
Project: ConvLab   Author: ConvLab   File: metric.py   License: MIT License
def score(self, parallel_corpus):

        # containers
        count = [0, 0, 0, 0]
        clip_count = [0, 0, 0, 0]
        r = 0
        c = 0
        weights = [0.25, 0.25, 0.25, 0.25]

        # accumulate ngram statistics
        for hyps, refs in parallel_corpus:
            hyps = [hyp.split() for hyp in hyps]
            refs = [ref.split() for ref in refs]
            for hyp in hyps:

                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i + 1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i + 1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                                   for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000, 1000]
                for ref in refs:
                    if bestmatch[0] == 0: break
                    diff = abs(len(ref) - len(hyp))
                    if diff < bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)

        # computing bleu score
        p0 = 1e-7
        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
                for i in range(4)]
        s = math.fsum(w * math.log(p_n) \
                      for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp * math.exp(s)
        return bleu 
Example 72
Project: ConvLab   Author: ConvLab   File: nlp.py    MIT License 4 votes vote down vote up
def score(self, hypothesis, corpus, n=1):
        # containers
        count = [0, 0, 0, 0]
        clip_count = [0, 0, 0, 0]
        r = 0
        c = 0
        weights = [0.25, 0.25, 0.25, 0.25]

        # accumulate ngram statistics
        for hyps, refs in zip(hypothesis, corpus):
            if isinstance(hyps[0], list):
                hyps = [hyp.split() for hyp in hyps[0]]
            else:
                hyps = [hyp.split() for hyp in hyps]

            refs = [ref.split() for ref in refs]

            # Shawn's evaluation
            refs[0] = [u'GO_'] + refs[0] + [u'EOS_']
            hyps[0] = [u'GO_'] + hyps[0] + [u'EOS_']

            for idx, hyp in enumerate(hyps):
                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i + 1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i + 1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                                   for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000, 1000]
                for ref in refs:
                    if bestmatch[0] == 0: break
                    diff = abs(len(ref) - len(hyp))
                    if diff < bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)
                if n == 1:
                    break
        # computing bleu score
        p0 = 1e-7
        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
                for i in range(4)]
        s = math.fsum(w * math.log(p_n) \
                      for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp * math.exp(s)
        return bleu 
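The "Shawn's evaluation" lines above wrap the first hypothesis and reference in GO_/EOS_ sentinels, which adds boundary n-grams so that matches at the start and end of a sentence are counted. The effect, for illustration:

from nltk.util import ngrams

ref = [u'GO_'] + 'the cat sat'.split() + [u'EOS_']
print(list(ngrams(ref, 2)))
# [('GO_', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', 'EOS_')]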
Example 73
Project: ConvLab   Author: ConvLab   File: evaluators.py    MIT License 4 votes vote down vote up
def score(self, hypothesis, corpus, n=1):
        # containers
        count = [0, 0, 0, 0]
        clip_count = [0, 0, 0, 0]
        r = 0
        c = 0
        weights = [0.25, 0.25, 0.25, 0.25]

        # accumulate ngram statistics
        for hyps, refs in zip(hypothesis, corpus):
            # if type(hyps[0]) is list:
            #    hyps = [hyp.split() for hyp in hyps[0]]
            # else:
            #    hyps = [hyp.split() for hyp in hyps]

            # refs = [ref.split() for ref in refs]
            hyps = [hyps]
            # Shawn's evaluation
            # refs[0] = [u'GO_'] + refs[0] + [u'EOS_']
            # hyps[0] = [u'GO_'] + hyps[0] + [u'EOS_']

            for idx, hyp in enumerate(hyps):
                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i + 1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i + 1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                                   for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000, 1000]
                for ref in refs:
                    if bestmatch[0] == 0: break
                    diff = abs(len(ref) - len(hyp))
                    if diff < bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)
                if n == 1:
                    break
        # computing bleu score
        p0 = 1e-7
        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
                for i in range(4)]
        s = math.fsum(w * math.log(p_n) \
                      for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp * math.exp(s)
        return bleu 
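Unlike the previous copies, this variant expects pre-tokenized input: each entry of hypothesis is a single token list (wrapped via hyps = [hyps]) and each entry of corpus is a list of reference token lists. A hedged usage sketch, again assuming a BLEUScorer class (hypothetical name):

hypothesis = ['the cat sat on the mat'.split()]
corpus = [['the cat is on the mat'.split()]]
print(BLEUScorer().score(hypothesis, corpus))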
Example 74
Project: twitter_analysis   Author: urmilkadakia   File: analysis_methods.py    Apache License 2.0 4 votes vote down vote up
def daily_ngram_collector(input_file_folder_path, output_file, number_of_users, start_date, end_date, n=1, cutoff_freq=5,
                          alpha_numeric_flag=0, stop_words_flag=0):
    """
    The function reads all the files generated between start date and end date and counts the ngram frequencies for all
    the ngrams in the file and finally combine them all in a date vise sorted csv file.
    :param input_file_folder_path: Path to the folder in which input files are stored
    :param output_file: Path to the output file
    :param number_of_users: To identify the input file as they are named based on the number of users
    :param start_date: Date from which function will start to calculate the ngram frequencies
    :param end_date: Date up to which function will calculate the ngram frequencies
    :param n: n represents the n in n-gram which is a contiguous sequence of n items. The default vale is 1 which
              represents unigram.
    :param cutoff_freq: The ngrams that has less frequency than the cut off frequency will not be included in the
                        output file. The default value is 5.
    :param alpha_numeric_flag: filter all non alpha numeric words. Default is false.
    :param stop_words_flag: filter all stop words. Default is false.
    """

    curr_date = start_date

    end_date = dt.strptime(end_date, '%Y_%m_%d')
    end_date += datetime.timedelta(days=1)
    end_date = dt.strftime(end_date, '%Y_%m_%d')

    while curr_date != end_date:
        input_f = os.path.join(input_file_folder_path, curr_date + '_profiles_' + str(number_of_users) + '.zip')
        if os.path.exists(input_f):
            ngram_freq = count_ngram_frequency(input_f, n, alpha_numeric_flag, stop_words_flag)
            ngram_freq = ngram_freq.most_common()

            # Creating the new row to add to the daily collector file; the date could also be
            # extracted from the filename:
            # new_row1 = {'Date': re.findall(r'[0-9]{4}_[0-9]{2}_[0-9]{2}', input_f)[0]}
            new_row1 = {'Date': curr_date}
            for item, val in ngram_freq:
                if n == 1:
                    new_row1[item[0]] = [val]
                else:
                    new_row1[item] = [val]
            new_row = pd.DataFrame(new_row1)

            new_row1 = pd.DataFrame()
            for col in list(new_row.columns):
                if col == 'Date':
                    new_row1[str(col)] = new_row[col]
                    continue
                if new_row[col][0] > cutoff_freq:
                    new_row1[str(col)] = new_row[col]
            # Check whether the output file already exists:
            # if not, start with the new row; otherwise append it to the existing data
            if not os.path.exists(output_file):
                ngram_combined = new_row1
            else:
                ngram_original = pd.read_csv(output_file, index_col=0)
                ngram_combined = pd.concat([ngram_original, new_row1], sort=False, ignore_index=True, axis=0)
            ngram_combined.replace(np.nan, 0, inplace=True)
            ngram_combined.to_csv(output_file)

        curr_date = dt.strptime(curr_date, '%Y_%m_%d')
        curr_date += datetime.timedelta(days=1)
        curr_date = dt.strftime(curr_date, '%Y_%m_%d') 
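daily_ngram_collector delegates the counting itself to count_ngram_frequency, which is not shown in this excerpt. A minimal stand-in with a similar shape, purely for illustration (it takes an iterable of raw strings instead of the project's zipped profile files, and ignores the two filter flags):

from collections import Counter
from nltk.util import ngrams

def count_ngram_frequency_sketch(texts, n=1):
    # Returns a Counter mapping n-gram tuples to their corpus-wide frequency.
    freq = Counter()
    for text in texts:
        freq.update(ngrams(text.lower().split(), n))
    return freq

print(count_ngram_frequency_sketch(['a cat', 'a dog'], n=1).most_common(1))
# [(('a',), 2)]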
Example 75
Project: twitter_analysis   Author: urmilkadakia   File: analysis_methods.py    Apache License 2.0 4 votes vote down vote up
def ngram_adjacency_matrix(input_file, output_file, n=1, cut_off=5, alpha_numeric_flag=False, stop_words_flag=False):
    """
    The function writes the adjacency matrix to the output file, where the rows and columns are ngram and each cell is
    the number of users that has both the ngram in their description.
    :param input_file: Path to input file
    :param output_file: Path to output file
    :param n: n represents the n in n-gram which is a contiguous sequence of n items. The default vale is 1 which
              represents unigram.
    :param cut_off: The ngrams that has less frequency than the cut off frequency will not be included in the
                    output file. The default value is 5.
    :param alpha_numeric_flag: filter all non alpha numeric words. Default is false.
    :param stop_words_flag: filter all stop words. Default is false.
    """
    ngram_freq = count_ngram_frequency(input_file, n, alpha_numeric_flag, stop_words_flag)

    for ngram in list(ngram_freq):
        if ngram_freq[ngram] < cut_off:
            del ngram_freq[ngram]

    matrix = pd.DataFrame(np.zeros((len(ngram_freq), len(ngram_freq))), columns=ngram_freq, index=ngram_freq)

    with zipfile.ZipFile(input_file, 'r') as z:
        for filename in z.namelist():
            with z.open(filename) as f:
                json_list = json.load(f)

    for user in json_list:
        text = user['description']

        ngram_list = _get_ngram_list(text, n, alpha_numeric_flag, stop_words_flag)
        for i in ngram_list:
            for j in ngram_list:
                try:
                    matrix[i][j] += 1
                except KeyError:
                    continue
    drop_row = []
    for i, row in matrix.iterrows():
        if any(j > cut_off for j in row):
            continue
        else:
            drop_row.append(i)
    matrix.drop(drop_row, inplace=True)

    drop_col = []
    for col in matrix:
        if any(j > cut_off for j in matrix[col]):
            continue
        else:
            drop_col.append(col)
    matrix.drop(columns=drop_col, inplace=True)

    matrix.to_csv(output_file) 
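One pandas detail in the double loop above: matrix[i][j] selects column i and then row j, so it increments cell (row j, column i); because every (i, j) pair is also visited as (j, i), the finished matrix is symmetric either way. The explicit label-based form avoids chained indexing and reads more clearly:

matrix.loc[j, i] += 1    # equivalent update, row j / column i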
Example 76
Project: OpenBottle   Author: xiaozhuchacha   File: bleu_score.py    MIT License 4 votes vote down vote up
def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
                  smoothing_function=None, auto_reweigh=False):
    """
    Calculate BLEU score (Bilingual Evaluation Understudy) from
    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
    "BLEU: a method for automatic evaluation of machine translation."
    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    0.5045...

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
    0.3969...

    The default BLEU calculates a score for up to 4-grams using uniform
    weights. To evaluate your translations with higher/lower order ngrams,
    use customized weights. E.g. when accounting for up to 5-grams with
    uniform weights:

    >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666)
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS
    0.4583...

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :return: The sentence-level BLEU score.
    :rtype: float
    """
    return corpus_bleu([references], [hypothesis],
                        weights, smoothing_function, auto_reweigh) 
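As the final line shows, sentence_bleu is a thin wrapper that scores one sentence by calling corpus_bleu with single-element lists, so the two calls below are equivalent (using reference1 and hypothesis1 from the doctest above; in stock NLTK both functions live in nltk.translate.bleu_score):

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

sentence_bleu([reference1], hypothesis1)
corpus_bleu([[reference1]], [hypothesis1])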
Example 77
Project: NMT-RDPG   Author: MultiPath   File: bleu.py    MIT License 4 votes vote down vote up
def sentence_bleu(references, hypothesis, weights=(0.25, 0.25, 0.25, 0.25),
                  smoothing_function=None):
    """
    Calculate BLEU score (Bilingual Evaluation Understudy) from
    Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002.
    "BLEU: a method for automatic evaluation of machine translation."
    In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf

    >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...               'ensures', 'that', 'the', 'military', 'always',
    ...               'obeys', 'the', 'commands', 'of', 'the', 'party']

    >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops',
    ...               'forever', 'hearing', 'the', 'activity', 'guidebook',
    ...               'that', 'party', 'direct']

    >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...               'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...               'heed', 'Party', 'commands']

    >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...               'guarantees', 'the', 'military', 'forces', 'always',
    ...               'being', 'under', 'the', 'command', 'of', 'the',
    ...               'Party']

    >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...               'army', 'always', 'to', 'heed', 'the', 'directions',
    ...               'of', 'the', 'party']

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS
    0.5045...

    >>> sentence_bleu([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS
    0.3969...

    The default BLEU calculates a score for up to 4-grams using uniform
    weights. To evaluate your translations with higher/lower order ngrams,
    use customized weights. E.g. when accounting for up to 5-grams with
    uniform weights:

    >>> weights = (0.1666, 0.1666, 0.1666, 0.1666, 0.1666)
    >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights)
    0.45838627164939455

    :param references: reference sentences
    :type references: list(list(str))
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :return: The sentence-level BLEU score.
    :rtype: float
    """
    return corpus_bleu([references], [hypothesis], weights, smoothing_function) 
Example 78
Project: multiwoz   Author: budzianowski   File: nlp.py    MIT License 4 votes vote down vote up
def score(self, hypothesis, corpus, n=1):
        # containers
        count = [0, 0, 0, 0]
        clip_count = [0, 0, 0, 0]
        r = 0
        c = 0
        weights = [0.25, 0.25, 0.25, 0.25]

        # accumulate ngram statistics
        for hyps, refs in zip(hypothesis, corpus):
            if type(hyps[0]) is list:
                hyps = [hyp.split() for hyp in hyps[0]]
            else:
                hyps = [hyp.split() for hyp in hyps]

            refs = [ref.split() for ref in refs]

            # Shawn's evaluation
            refs[0] = [u'GO_'] + refs[0] + [u'EOS_']
            hyps[0] = [u'GO_'] + hyps[0] + [u'EOS_']

            for idx, hyp in enumerate(hyps):
                for i in range(4):
                    # accumulate ngram counts
                    hypcnts = Counter(ngrams(hyp, i + 1))
                    cnt = sum(hypcnts.values())
                    count[i] += cnt

                    # compute clipped counts
                    max_counts = {}
                    for ref in refs:
                        refcnts = Counter(ngrams(ref, i + 1))
                        for ng in hypcnts:
                            max_counts[ng] = max(max_counts.get(ng, 0), refcnts[ng])
                    clipcnt = dict((ng, min(count, max_counts[ng])) \
                                   for ng, count in hypcnts.items())
                    clip_count[i] += sum(clipcnt.values())

                # accumulate r & c
                bestmatch = [1000, 1000]
                for ref in refs:
                    if bestmatch[0] == 0: break
                    diff = abs(len(ref) - len(hyp))
                    if diff < bestmatch[0]:
                        bestmatch[0] = diff
                        bestmatch[1] = len(ref)
                r += bestmatch[1]
                c += len(hyp)
                if n == 1:
                    break
        # computing bleu score
        p0 = 1e-7
        bp = 1 if c > r else math.exp(1 - float(r) / float(c))
        p_ns = [float(clip_count[i]) / float(count[i] + p0) + p0 \
                for i in range(4)]
        s = math.fsum(w * math.log(p_n) \
                      for w, p_n in zip(weights, p_ns) if p_n)
        bleu = bp * math.exp(s)
        return bleu
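All of the hand-rolled scorers above end with the same arithmetic: a brevity penalty bp = min(1, exp(1 - r/c)) followed by a weighted geometric mean of the four smoothed modified precisions. A toy run of just that final step, with made-up counts:

import math

count = [10, 9, 8, 7]         # total hypothesis n-gram counts, n = 1..4
clip_count = [8, 6, 4, 2]     # clipped (reference-matched) counts
r, c = 12, 10                 # cumulative reference / hypothesis lengths
weights = [0.25] * 4
p0 = 1e-7                     # smoothing constant, as in the examples above

bp = 1 if c > r else math.exp(1 - float(r) / float(c))
p_ns = [clip_count[i] / (count[i] + p0) + p0 for i in range(4)]
s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns))
print(bp * math.exp(s))       # ≈ 0.43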