Python nltk.probability.FreqDist() Examples

The following are 30 code examples of nltk.probability.FreqDist(), drawn from open-source projects. The source file and originating project for each example are listed above it. You may also want to check out all available functions and classes of the nltk.probability module, or try the search function.
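FreqDist counts the outcomes of an experiment, most often word frequencies in a text, and subclasses collections.Counter. Before the project examples, here is a minimal sketch of the core API on a made-up token list:

from nltk.probability import FreqDist

tokens = "the cat sat on the mat the end".split()
fdist = FreqDist(tokens)

print(fdist["the"])          # 3  -- count of one sample
print(fdist.N())             # 8  -- total number of samples
print(fdist.B())             # 6  -- number of distinct samples (bins)
print(fdist.max())           # 'the' -- most frequent sample
print(fdist.most_common(2))  # [('the', 3), ('cat', 1)]
print(fdist.freq("the"))     # 0.375 -- relative frequency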
Example #1
Source File: decisiontree.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def binary_stump(feature_name, feature_value, labeled_featuresets):
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[label] += 1
            else:
                neg_fdist[label] += 1

        decisions = {}
        default = label
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(label, feature_name, decisions, default) 
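In NLTK this stump builder is exposed as a static method on nltk.classify.decisiontree.DecisionTreeClassifier. A minimal sketch on made-up featuresets (the feature name and labels are invented for illustration):

from nltk.classify.decisiontree import DecisionTreeClassifier

train = [
    ({'outlook': 'sunny'}, 'no'),
    ({'outlook': 'sunny'}, 'no'),
    ({'outlook': 'rainy'}, 'yes'),
]
stump = DecisionTreeClassifier.binary_stump('outlook', 'sunny', train)
print(stump.classify({'outlook': 'sunny'}))  # 'no'
print(stump.classify({'outlook': 'rainy'}))  # 'yes' -- the default branch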
Example #2
Source File: decisiontree.py    From razzy-spinner with GNU General Public License v3.0
def binary_stump(feature_name, feature_value, labeled_featuresets):
        label = FreqDist(label for (featureset, label)
                         in labeled_featuresets).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[label] += 1
            else:
                neg_fdist[label] += 1


        decisions = {}
        default = label
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(label, feature_name, decisions, default) 
Example #3
Source File: punkt.py    From razzy-spinner with GNU General Public License v3.0
def _freq_threshold(self, fdist, threshold):
        """
        Returns a FreqDist containing only the data whose counts are at or
        above the given threshold, as well as a mapping (None -> count_removed).
        """
        # We assume that there is more data below the threshold than above it
        # and so create a new FreqDist rather than working in place.
        res = FreqDist()
        num_removed = 0
        for tok in fdist:
            count = fdist[tok]
            if count < threshold:
                num_removed += 1
            else:
                res[tok] += count
        res[None] += num_removed
        return res

    #////////////////////////////////////////////////////////////
    #{ Orthographic data
    #//////////////////////////////////////////////////////////// 
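_freq_threshold is private to the Punkt trainer, but the pattern stands on its own. A standalone sketch on made-up counts:

from nltk.probability import FreqDist

fdist = FreqDist({'kept': 5, 'also_kept': 3, 'rare': 1, 'rarer': 1})
threshold = 2
res = FreqDist()
num_removed = 0
for tok, count in fdist.items():
    if count < threshold:
        num_removed += 1       # too rare: drop it, but remember how many
    else:
        res[tok] += count      # frequent enough: keep the count
res[None] += num_removed
print(res.most_common())  # [('kept', 5), ('also_kept', 3), (None, 2)]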
Example #4
Source File: collocations.py    From razzy-spinner with GNU General Public License v3.0
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
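A quick sketch of using the finder, ranking candidate bigrams by pointwise mutual information (the token list is made up):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = "new york is not the same as new york city".split()
finder = BigramCollocationFinder.from_words(tokens, window_size=2)
print(finder.nbest(BigramAssocMeasures.pmi, 3))  # top 3 bigrams by PMI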
Example #5
Source File: eval_utils.py    From tf-var-attention with MIT License
def calculate_ngram_diversity(corpus):
    """
    Calculates unigram and bigram diversity

    Args:
        corpus: tokenized list of sentences sampled

    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score

    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity 
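Note that despite the docstring, corpus here is a flat list of tokens rather than a list of sentences. A sketch on made-up tokens (both scores happen to come out to 0.625 for this input):

from nltk import FreqDist
from nltk.collocations import BigramCollocationFinder

corpus = "i am fine i am fine thank you".split()
finder = BigramCollocationFinder.from_words(corpus)
print(len(finder.ngram_fd) / finder.N)      # distinct-2
print(len(FreqDist(corpus)) / len(corpus))  # distinct-1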
Example #6
Source File: eval_utils.py    From tf-var-attention with MIT License
def calculate_entropy(corpus):
    """
    Calculates diversity in terms of entropy (using unigram probability)

    Args:
        corpus: tokenized list of sentences sampled

    Returns:
        ent: entropy on the sample sentence list

    """
    fdist = FreqDist(corpus)
    total_len = len(corpus)
    ent = 0
    for k, v in fdist.items():
        p = v / total_len

        ent += -p * np.log(p)

    return ent 
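The loop is the standard Shannon entropy H = -sum(p * log p) over unigram probabilities, measured in nats. A sketch on made-up tokens:

import numpy as np
from nltk import FreqDist

corpus = "a a b c".split()
fdist = FreqDist(corpus)
total = len(corpus)
ent = -sum((v / total) * np.log(v / total) for v in fdist.values())
print(ent)  # ~1.04 nats for p = [0.5, 0.25, 0.25]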
Example #7
Source File: model.py    From atap with Apache License 2.0
def __init__(self, n, vocabulary, unknown="<UNK>"):
        """
        n is the size of the ngram
        """
        if n < 1:
            raise ValueError("ngram size must be greater than or equal to 1")

        self.n = n
        self.unknown = unknown
        self.padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<s>",
            "right_pad_symbol": "</s>"
        }

        self.vocabulary = vocabulary
        self.allgrams = defaultdict(ConditionalFreqDist)
        self.ngrams = FreqDist()
        self.unigrams = FreqDist() 
Example #8
Source File: transformer.py    From atap with Apache License 2.0
def transform(self, documents):
        words = []
        docs = []
        for document in documents:
            docs.append(document)
            for para in document:
                for sent in para:
                    for token, tag in sent:
                        words.append(token)

        counts = FreqDist(words)
        self.reduced = set(
            w for w in words if counts[w] > self.min and counts[w] < self.max
        )

        return [
            ' '.join(self.normalize(doc)) for doc in docs
        ] 
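The heart of this transformer is keeping only mid-frequency words. A standalone sketch with made-up thresholds standing in for self.min and self.max:

from nltk import FreqDist

words = "the the the cat cat dog".split()
counts = FreqDist(words)
min_count, max_count = 1, 3  # stand-ins for self.min and self.max
reduced = {w for w in words if min_count < counts[w] < max_count}
print(reduced)  # {'cat'} -- 'the' is too frequent, 'dog' too rare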
Example #9
Source File: agreement.py    From luscan-devel with GNU General Public License v2.0
def Do_alpha(self):
        """The observed disagreement for the alpha coefficient.

        The alpha coefficient, unlike the other metrics, uses this rather than
        observed agreement.
        """
        total = 0.0
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)

            for j, nj in label_freqs.iteritems():
                for l, nl in label_freqs.iteritems():
                    total += float(nj * nl) * self.distance(l, j)
        ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
        log.debug("Observed disagreement: %f", ret)
        return ret 
Example #10
Source File: punkt.py    From luscan-devel with GNU General Public License v2.0
def _freq_threshold(self, fdist, threshold):
        """
        Returns a FreqDist containing only the data whose counts are at or
        above the given threshold, as well as a mapping (None -> count_removed).
        """
        # We assume that there is more data below the threshold than above it
        # and so create a new FreqDist rather than working in place.
        res = FreqDist()
        num_removed = 0
        for tok, count in fdist.iteritems():
            if count < threshold:
                num_removed += 1
            else:
                res.inc(tok, count)
        res.inc(None, num_removed)
        return res

    #////////////////////////////////////////////////////////////
    #{ Orthographic data
    #//////////////////////////////////////////////////////////// 
Example #11
Source File: collocations.py    From luscan-devel with GNU General Public License v2.0
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  By default, bigrams must be contiguous.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError, "Specify window_size at least 2"

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            try:
                window = window[:list(window).index(w1, 1)]
            except ValueError:
                pass
            wfd.inc(w1)
            for w2 in set(window[1:]):
                if w2 is not None:
                    bfd.inc((w1, w2))
        return cls(wfd, bfd) 
Example #12
Source File: decisiontree.py    From luscan-devel with GNU General Public License v2.0
def binary_stump(feature_name, feature_value, labeled_featuresets):
        label = FreqDist([label for (featureset,label)
                          in labeled_featuresets]).max()

        # Find the best label for each value.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist.inc(label)
            else:
                neg_fdist.inc(label)

        decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        default = DecisionTreeClassifier(neg_fdist.max())
        return DecisionTreeClassifier(label, feature_name, decisions, default) 
Example #13
Source File: agreement.py    From razzy-spinner with GNU General Public License v3.0
def Do_alpha(self):
        """The observed disagreement for the alpha coefficient.

        The alpha coefficient, unlike the other metrics, uses this rather than
        observed agreement.
        """
        total = 0.0
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)

            for j, nj in iteritems(label_freqs):
                for l, nl in iteritems(label_freqs):
                    total += float(nj * nl) * self.distance(l, j)
        ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
        log.debug("Observed disagreement: %f", ret)
        return ret 
Example #14
Source File: sentiment_analyzer.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def unigram_word_feats(self, words, top_n=None, min_freq=0):
        """
        Return most common top_n word features.

        :param words: a list of words/tokens.
        :param top_n: number of best words/tokens to use, sorted by frequency.
        :rtype: list(str)
        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
            frequency.
        """
        # Stopwords are not removed
        unigram_feats_freqs = FreqDist(word for word in words)
        return [
            w
            for w, f in unigram_feats_freqs.most_common(top_n)
            if unigram_feats_freqs[w] > min_freq
        ] 
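This method lives on nltk.sentiment.SentimentAnalyzer. A quick sketch on made-up tokens:

from nltk.sentiment import SentimentAnalyzer

analyzer = SentimentAnalyzer()
words = "good good bad good terrible bad".split()
print(analyzer.unigram_word_feats(words, top_n=2, min_freq=1))
# ['good', 'bad'] -- counts 3 and 2, both above min_freq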
Example #15
Source File: collocations.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def from_words(cls, words, window_size=3):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        if window_size < 3:
            raise ValueError("Specify window_size at least 3")

        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()
        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            for w2, w3 in _itertools.combinations(window[1:], 2):
                wfd[w1] += 1
                if w2 is None:
                    continue
                bfd[(w1, w2)] += 1
                if w3 is None:
                    continue
                wildfd[(w1, w3)] += 1
                tfd[(w1, w2, w3)] += 1
        return cls(wfd, bfd, wildfd, tfd) 
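Usage mirrors the bigram finder; a sketch on made-up tokens, ranked by PMI:

from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

tokens = "she sells sea shells by the sea shore she sells".split()
finder = TrigramCollocationFinder.from_words(tokens)
print(finder.nbest(TrigramAssocMeasures.pmi, 2))  # top 2 trigrams by PMI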
Example #16
Source File: collocations.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ngrams(words, window_size, pad_right=True):
            w1 = window[0]
            if w1 is None:
                continue
            wfd[w1] += 1
            for w2 in window[1:]:
                if w2 is not None:
                    bfd[(w1, w2)] += 1
        return cls(wfd, bfd, window_size=window_size) 
Example #17
Source File: punkt.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def _freq_threshold(self, fdist, threshold):
        """
        Returns a FreqDist containing only the data whose counts are at or
        above the given threshold, as well as a mapping (None -> count_removed).
        """
        # We assume that there is more data below the threshold than above it
        # and so create a new FreqDist rather than working in place.
        res = FreqDist()
        num_removed = 0
        for tok in fdist:
            count = fdist[tok]
            if count < threshold:
                num_removed += 1
            else:
                res[tok] += count
        res[None] += num_removed
        return res

    # ////////////////////////////////////////////////////////////
    # { Orthographic data
    # //////////////////////////////////////////////////////////// 
Example #18
Source File: collocations_app.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def run(self):
            try:
                words = self.model.CORPORA[self.name]()
                from operator import itemgetter

                text = [w for w in words if len(w) > 2]
                fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1))
                vocab = FreqDist(text)
                scored = [
                    ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
                    for w1, w2 in fd
                ]
                scored.sort(key=itemgetter(1), reverse=True)
                self.model.collocations = list(map(itemgetter(0), scored))
                self.model.queue.put(CORPUS_LOADED_EVENT)
            except Exception as e:
                print(e)
                self.model.queue.put(ERROR_LOADING_CORPUS_EVENT)


# def collocations():
#    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]] 
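The score on each bigram, count(w1, w2) ** 3 / (count(w1) * count(w2)), rewards pairs that co-occur far more often than their individual frequencies would suggest. A standalone sketch of just the scoring logic on made-up tokens (the length > 2 filter is skipped here):

from operator import itemgetter
from nltk import FreqDist

text = "strong tea strong tea weak tea".split()
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2]))
          for w1, w2 in fd]
scored.sort(key=itemgetter(1), reverse=True)
print(scored[0][0])  # ('strong', 'tea') scores highest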
Example #19
Source File: read_data.py    From CommonSenseMultiHopQA with MIT License
def get_stop_words_1(data, num_stop_words):
    total_words = []
    for d in data:
        total_words.extend(d["ques"])
        total_words.extend(d["answer1"])
        for d_i in d["summary"]:
            total_words.extend(d_i)
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]
    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = []
    for p in stop_words:
        if p not in pronoun_list:
            filtered_stop_words.append(p)
    return filtered_stop_words
Example #20
Source File: general.py    From CommonSenseMultiHopQA with MIT License
def sample_relations_top_n(graph, context, type_):
    num_total_words = len(context)
    dist = FreqDist(context)
 
    for node in graph:
        node = build_score_per_layer(node, dist, num_total_words)

    for node in graph:
        node = calc_top_n_score_by_level(node)

    for i, node in enumerate(graph):
        graph[i] = prune_graph_by_top_n_softmax(node)

    selected_paths = select_paths(graph) 
    paths = build_subpaths(selected_paths)
    final_paths = list(paths for paths, _ in itertools.groupby(paths))
    random.shuffle(final_paths)
    return final_paths 
Example #21
Source File: metric.py    From MultiTurnDialogZoo with MIT License
def cal_Distinct(corpus):
    """
    Calculates unigram and bigram diversity
    Args:
        corpus: tokenized list of sentences sampled
    Returns:
        uni_diversity: distinct-1 score
        bi_diversity: distinct-2 score
    """
    bigram_finder = BigramCollocationFinder.from_words(corpus)
    bi_diversity = len(bigram_finder.ngram_fd) / bigram_finder.N

    dist = FreqDist(corpus)
    uni_diversity = len(dist) / len(corpus)

    return uni_diversity, bi_diversity 
Example #22
Source File: collocations.py    From luscan-devel with GNU General Public License v2.0
def from_words(cls, words):
        """Construct a TrigramCollocationFinder for all trigrams in the given
        sequence.
        """
        wfd = FreqDist()
        wildfd = FreqDist()
        bfd = FreqDist()
        tfd = FreqDist()

        for w1, w2, w3 in ingrams(words, 3, pad_right=True):
            wfd.inc(w1)
            if w2 is None:
                continue
            bfd.inc((w1, w2))
            if w3 is None:
                continue
            wildfd.inc((w1, w3))
            tfd.inc((w1, w2, w3))
        return cls(wfd, bfd, wildfd, tfd) 
Example #23
Source File: agreement.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def pi(self):
        """Scott 1955; here, multi-pi.
        Equivalent to K from Siegel and Castellan (1988).

        """
        total = 0.0
        label_freqs = FreqDist(x['labels'] for x in self.data)
        for k, f in iteritems(label_freqs):
            total += f ** 2
        Ae = total / ((len(self.I) * len(self.C)) ** 2)
        return (self.avg_Ao() - Ae) / (1 - Ae) 
Example #24
Source File: agreement.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def alpha(self):
        """Krippendorff 1980

        """
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, allpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError("Cannot calculate alpha, only one coder and item present!")

        total_disagreement = 0.0
        total_ratings = 0
        all_valid_labels_freq = FreqDist([])

        total_do = 0.0 # Total observed disagreement for all items.
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)
            labels_count = sum(label_freqs.values())
            if labels_count < 2:
                # Ignore the item.
                continue
            all_valid_labels_freq += label_freqs
            total_do += self.Disagreement(label_freqs) * labels_count

        do = total_do / sum(all_valid_labels_freq.values())

        de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
        k_alpha = 1.0 - do / de

        return k_alpha 
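Both coefficients are available through nltk.metrics.agreement.AnnotationTask, which takes (coder, item, label) triples. A sketch on a toy annotation task:

from nltk.metrics.agreement import AnnotationTask

data = [
    ('c1', 'item1', 'x'), ('c2', 'item1', 'x'),
    ('c1', 'item2', 'y'), ('c2', 'item2', 'y'),
    ('c1', 'item3', 'x'), ('c2', 'item3', 'y'),
]
task = AnnotationTask(data=data)
print(task.pi())     # ~0.333: two agreements out of three items
print(task.alpha())  # Krippendorff's alpha for the same data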
Example #25
Source File: read_data.py    From CommonSenseMultiHopQA with MIT License
def get_stop_words(total_words, num_stop_words):
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]
    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"]
    filtered_stop_words = []
    for p in stop_words:
        if p not in pronoun_list:
            filtered_stop_words.append(p)
    return filtered_stop_words
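A quick sketch of calling it on a made-up token list:

total_words = "he said the king said he saw the king".split()
print(get_stop_words(total_words, num_stop_words=3))
# ['said', 'the'] -- 'he' makes the top 3 but is dropped as a pronoun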
Example #26
Source File: general.py    From CommonSenseMultiHopQA with MIT License
def build_trees_one_hop(definitions, query, freq_words, context):
    context_string = ' '.join(context)
    num_total_words = len(context)
    query = [q.lower() for q in query if (q not in freq_words and q in definitions)]
    dist = FreqDist(context)
    graph = []

    for q in query:
        for (rel, w_2) in definitions[q]:
            if check_context(w_2, q, context_string, context, freq_words):
                new_vertex_1, graph, parent_vertex = is_new_vertex(graph, q, w_2, rel, 1, None)
                if new_vertex_1:
                    vertex_1 = create_vertex(q, w_2, None, 1, rel, definitions)
                    graph.append(vertex_1)
    return graph 
Example #27
Source File: recall_model.py    From Customer-Chatbot with MIT License
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("Total number of words:", fDist.N())
    print("Number of distinct words:", fDist.B())
    fDist.plot(10) 
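FreqDist.plot() requires matplotlib. A quick sketch of calling the helper above (this opens a plot window showing the 10 most common samples):

plot_words("the cat sat on the mat the end".split())
# Total number of words: 8
# Number of distinct words: 6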
Example #28
Source File: collocations.py    From razzy-spinner with GNU General Public License v3.0
def _apply_filter(self, fn=lambda ngram, freq: False):
        """Generic filter removes ngrams from the frequency distribution
        if the function returns True when passed an ngram tuple.
        """
        tmp_ngram = FreqDist()
        for ngram, freq in iteritems(self.ngram_fd):
            if not fn(ngram, freq):
                tmp_ngram[ngram] = freq
        self.ngram_fd = tmp_ngram 
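_apply_filter backs the public filtering helpers on collocation finders; the same effect via apply_freq_filter, on made-up tokens:

from nltk.collocations import BigramCollocationFinder

tokens = "a b a b a c".split()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)  # drop bigrams seen fewer than 2 times
print(sorted(finder.ngram_fd.items()))  # [(('a', 'b'), 2), (('b', 'a'), 2)]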
Example #29
Source File: recall_model.py    From Customer-Chatbot with MIT License
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("Total number of words:", fDist.N())
    print("Number of distinct words:", fDist.B())
    fDist.plot(10) 
Example #30
Source File: collocations.py    From razzy-spinner with GNU General Public License v3.0
def _ngram_freqdist(words, n):
        # Slide a window of length n over the words so only full n-grams are counted.
        return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - n + 1))
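A quick check of the helper on made-up tokens:

print(_ngram_freqdist("a b a b a".split(), 2).most_common())
# [(('a', 'b'), 2), (('b', 'a'), 2)]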