Python nltk.probability.ConditionalFreqDist() Examples

The following are 19 code examples of nltk.probability.ConditionalFreqDist(), drawn from open-source projects. Each example notes the project and source file it was taken from, along with that project's license. You may also want to check out all available functions and classes of the module nltk.probability.
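Before the project examples, here is a minimal sketch of the basic API: a ConditionalFreqDist is built from (condition, sample) pairs and behaves like a dictionary of FreqDist objects, one per condition (the data below is made up purely for illustration).

from nltk.probability import ConditionalFreqDist

# Build a CFD from (condition, sample) pairs: here, word length -> word.
words = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
cfd = ConditionalFreqDist((len(w), w) for w in words)

print(cfd.conditions())       # observed conditions, e.g. [3, 5, 4]
print(cfd[3]["the"])          # count of "the" among length-3 words -> 2
print(cfd[5].most_common())   # the FreqDist for length-5 words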
Example #1
Source File: agreement.py    From razzy-spinner with GNU General Public License v3.0
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
        """Cohen 1968

        """
        total = 0.0
        label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
                for x in self.data
                if x['coder'] in (cA, cB))
        for j in self.K:
            for l in self.K:
                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
        De = total / (max_distance * pow(len(self.I), 2))
        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
        Do = self.Do_Kw_pairwise(cA, cB)
        ret = 1.0 - (Do / De)
        return ret 
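weighted_kappa_pairwise above is a method of nltk.metrics.agreement.AnnotationTask, where self.data holds dicts with 'coder', 'item' and 'labels' keys, self.I is the set of items and self.K the set of labels. A hedged sketch of how such a task is usually driven (the coder/item/label triples are invented for illustration):

from nltk.metrics.agreement import AnnotationTask

# Each datum is a (coder, item, label) triple.
data = [
    ("c1", "item1", "1"), ("c2", "item1", "1"),
    ("c1", "item2", "2"), ("c2", "item2", "3"),
    ("c1", "item3", "3"), ("c2", "item3", "3"),
]
task = AnnotationTask(data=data)
print(task.kappa())           # unweighted Cohen's kappa, averaged over coder pairs
print(task.weighted_kappa())  # weighted variant, built on weighted_kappa_pairwise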
Example #2
Source File: agreement.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
        """Cohen 1968

        """
        total = 0.0
        label_freqs = ConditionalFreqDist(
            (x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)
        )
        for j in self.K:
            for l in self.K:
                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
        De = total / (max_distance * pow(len(self.I), 2))
        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
        Do = self.Do_Kw_pairwise(cA, cB)
        ret = 1.0 - (Do / De)
        return ret 
Example #3
Source File: model.py    From atap with Apache License 2.0
def __init__(self, n, vocabulary, unknown="<UNK>"):
        """
        n is the size of the ngram
        """
        if n < 1:
            raise ValueError("ngram size must be greater than or equal to 1")

        self.n = n
        self.unknown = unknown
        self.padding = {
            "pad_left": True,
            "pad_right": True,
            "left_pad_symbol": "<s>",
            "right_pad_symbol": "</s>"
        }

        self.vocabulary = vocabulary
        self.allgrams = defaultdict(ConditionalFreqDist)
        self.ngrams = FreqDist()
        self.unigrams = FreqDist() 
Example #4
Source File: agreement.py    From luscan-devel with GNU General Public License v2.0
def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0):
        """Cohen 1968

        """
        total = 0.0
        label_freqs = ConditionalFreqDist((x['coder'], x['labels'])
                for x in self.data
                if x['coder'] in (cA, cB))
        for j in self.K:
            for l in self.K:
                total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l)
        De = total / (max_distance * pow(len(self.I), 2))
        log.debug("Expected disagreement between %s and %s: %f", cA, cB, De)
        Do = self.Do_Kw_pairwise(cA, cB)
        ret = 1.0 - (Do / De)
        return ret 
Example #5
Source File: agreement.py    From luscan-devel with GNU General Public License v2.0
def Ae_kappa(self, cA, cB):
        Ae = 0.0
        nitems = float(len(self.I))
        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
        for k in label_freqs.conditions():
            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
        return Ae 
Example #6
Source File: counter.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        self._counts = defaultdict(ConditionalFreqDist)
        self._counts[1] = self.unigrams = FreqDist()

        if ngram_text:
            self.update(ngram_text) 
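NgramCounter lives in nltk.lm; a hedged usage sketch follows (the toy sentences are made up, and everygrams is just one convenient way to produce the required iterable of ngram tuples):

from nltk.lm import NgramCounter
from nltk.util import everygrams

# ngram_text is an iterable of "sentences", each an iterable of ngram tuples.
sents = [["a", "b", "c"], ["a", "c"]]
counts = NgramCounter(everygrams(sent, max_len=2) for sent in sents)

print(counts.unigrams["a"])    # unigram counts are kept in a plain FreqDist
print(counts[2][("a",)]["b"])  # higher orders are ConditionalFreqDists keyed by context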
Example #7
Source File: text.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        ) 
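This is the constructor of nltk.text.ContextIndex (CFD is an alias for ConditionalFreqDist in that module): one CFD maps each word to the contexts it appears in, the other maps each context back to the words seen in it. A small hedged sketch of querying the index (the sentence is made up):

from nltk.text import ContextIndex

tokens = "the quick fox jumped and the quick dog jumped too".split()
ci = ContextIndex(tokens, key=str.lower)

# Words that occur in the same (left, right) contexts as "fox".
print(ci.similar_words("fox"))  # e.g. ['dog']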
Example #8
Source File: agreement.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def Ae_kappa(self, cA, cB):
        Ae = 0.0
        nitems = float(len(self.I))
        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
        for k in label_freqs.conditions():
            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
        return Ae 
Example #9
Source File: text.py    From luscan-devel with GNU General Public License v2.0
def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
                                     for i, w in enumerate(tokens))
        self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
                                     for i, w in enumerate(tokens)) 
Example #10
Source File: svm_train.py    From weiboanalysis with Apache License 2.0
def pynlpir_feature(number):  # select the top 'number' feature words
    normalWords = []
    advWords = []
    for items in read_file('ad/normal.txt'):  # flatten the nested token lists into one flat list
        for item in items:
            normalWords.append(item)
    for items in read_file('ad/advertise.txt'):
        for item in items:
            advWords.append(item)
    word_fd = FreqDist()  # frequency of every word across both corpora
    cond_word_fd = ConditionalFreqDist()  # word frequencies within normal text and within ad text
    for word in normalWords:
        word_fd[word] += 1
        cond_word_fd['normal'][word] += 1
    for word in advWords:
        word_fd[word] += 1
        cond_word_fd['adv'][word] += 1
    normal_word_count = cond_word_fd['normal'].N()  # number of word tokens in normal text
    adv_word_count = cond_word_fd['adv'].N()  # number of word tokens in ad text
    total_word_count = normal_word_count + adv_word_count
    word_scores = {}  # maps each word to its information score
    for word, freq in word_fd.items():
        # chi-square statistic of the word for the 'normal' class; mutual information or another measure could be used instead
        normal_score = BigramAssocMeasures.chi_sq(cond_word_fd['normal'][word],
                                                  (freq, normal_word_count),
                                                  total_word_count)
        adv_score = BigramAssocMeasures.chi_sq(cond_word_fd['adv'][word],
                                               (freq, adv_word_count),
                                               total_word_count)  # likewise for the 'adv' class
        # a word's information score is the sum of its normal and ad chi-square statistics
        word_scores[word] = normal_score + adv_score
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[
                :number]  # sort words by information score in descending order; number is the feature dimensionality and can be tuned
    # χ² = Σ (Oi - Ei)² / Ei  ~  χ²(k-1),  i = 1..k
    # Oi is the observed count, Ei the expected count
    # the null hypothesis is rejected when the statistic exceeds the critical value

    best_words = set([w for w, s in best_vals])
    return dict([(word, True) for word in best_words]) 
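The chi_sq calls above follow the BigramAssocMeasures convention: the first argument is the joint count (occurrences of the word inside one class), the pair is the two marginal counts (word frequency overall, class size), and the last argument is the grand total. A tiny hedged illustration with invented counts:

from nltk.metrics import BigramAssocMeasures

# Invented counts: a word occurs 30 times overall, 28 of them in ad text;
# the ad corpus has 5000 tokens out of 20000 in total.
score = BigramAssocMeasures.chi_sq(28, (30, 5000), 20000)
print(score)  # a large score means the word is strongly associated with the ad class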
Example #11
Source File: sentiment_analysis.py    From edx_data_research with MIT License
def create_word_scores():
	#creates lists of all positive and negative words
	posWords = []
	negWords = []
	with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
		for i in posSentences:
			posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
			posWords.append(posWord)
	with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
		for i in negSentences:
			negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
			negWords.append(negWord)
	posWords = list(itertools.chain(*posWords))
	negWords = list(itertools.chain(*negWords))

	#build frequency distribution of all words and then frequency distributions of words within positive and negative labels
	word_fd = FreqDist()
	cond_word_fd = ConditionalFreqDist()
	for word in posWords:
		word_fd.inc(word.lower())
		cond_word_fd['pos'].inc(word.lower())
	for word in negWords:
		word_fd.inc(word.lower())
		cond_word_fd['neg'].inc(word.lower())

	#finds the number of positive and negative words, as well as the total number of words
	pos_word_count = cond_word_fd['pos'].N()
	neg_word_count = cond_word_fd['neg'].N()
	total_word_count = pos_word_count + neg_word_count
    
	#builds dictionary of word scores based on chi-squared test
	word_scores = {}
	for word, freq in word_fd.iteritems():
		pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
		word_scores[word] = pos_score + neg_score

	return word_scores

#finds word scores 
Example #12
Source File: text.py    From razzy-spinner with GNU General Public License v3.0
def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
                                     for i, w in enumerate(tokens))
        self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
                                     for i, w in enumerate(tokens)) 
Example #13
Source File: agreement.py    From razzy-spinner with GNU General Public License v3.0
def Ae_kappa(self, cA, cB):
        Ae = 0.0
        nitems = float(len(self.I))
        label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data)
        for k in label_freqs.conditions():
            Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems)
        return Ae 
Example #14
Source File: ensemble.py    From cltk with MIT License
def _train(self, tagged_corpus: list, cutoff: int = 0, verbose: bool = False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        :param verbose: Not used
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None:
                    continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if self.backoff is None or tag != self.backoff.tag_one(
                    tokens, index, tags[:index]
                ):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()  # single most frequent tag for this context
            weighted_tags = [(k, v / sum(fd[context].values())) for k, v in fd[context].items()]
            hits = fd[context][best_tag]  # raw count of the best tag in this context
            if hits > cutoff:
                self._context_to_tag[context] = weighted_tags
                hit_count += hits 
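Note that, unlike the stock NLTK implementation shown in Examples #16 and #17 below, this cltk variant stores the full normalized tag distribution (weighted_tags) for each context rather than only the single most frequent tag.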
Example #15
Source File: tnt.py    From luscan-devel with GNU General Public License v2.0
def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0 
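TnT is available as nltk.tag.tnt.TnT; a hedged end-to-end sketch, assuming the treebank sample corpus has been downloaded (the slice sizes are arbitrary):

from nltk.corpus import treebank
from nltk.tag import tnt

train_sents = treebank.tagged_sents()[:1000]   # small training slice
tagger = tnt.TnT(N=1000, C=False)
tagger.train(train_sents)
print(tagger.tag(treebank.sents()[1001]))      # tag one held-out sentence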
Example #16
Source File: sequential.py    From luscan-devel with GNU General Public License v2.0
def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context].inc(tag)
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print "[Trained Unigram tagger:",
            print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning)

######################################################################
#{ Tagger Classes
###################################################################### 
Example #17
Source File: sequential.py    From razzy-spinner with GNU General Public License v3.0
def _train(self, tagged_corpus, cutoff=0, verbose=False):
        """
        Initialize this ContextTagger's ``_context_to_tag`` table
        based on the given training data.  In particular, for each
        context ``c`` in the training data, set
        ``_context_to_tag[c]`` to the most frequent tag for that
        context.  However, exclude any contexts that are already
        tagged perfectly by the backoff tagger(s).

        The old value of ``self._context_to_tag`` (if any) is discarded.

        :param tagged_corpus: A tagged corpus.  Each item should be
            a list of (word, tag) tuples.
        :param cutoff: If the most likely tag for a context occurs
            fewer than cutoff times, then exclude it from the
            context-to-tag table for the new tagger.
        """

        token_count = hit_count = 0

        # A context is considered 'useful' if it's not already tagged
        # perfectly by the backoff tagger.
        useful_contexts = set()

        # Count how many times each tag occurs in each context.
        fd = ConditionalFreqDist()
        for sentence in tagged_corpus:
            tokens, tags = zip(*sentence)
            for index, (token, tag) in enumerate(sentence):
                # Record the event.
                token_count += 1
                context = self.context(tokens, index, tags[:index])
                if context is None: continue
                fd[context][tag] += 1
                # If the backoff got it wrong, this context is useful:
                if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                    useful_contexts.add(context)

        # Build the context_to_tag table -- for each context, figure
        # out what the most likely tag is.  Only include contexts that
        # we've seen at least `cutoff` times.
        for context in useful_contexts:
            best_tag = fd[context].max()
            hits = fd[context][best_tag]
            if hits > cutoff:
                self._context_to_tag[context] = best_tag
                hit_count += hits

        # Display some stats, if requested.
        if verbose:
            size = len(self._context_to_tag)
            backoff = 100 - (hit_count * 100.0)/ token_count
            pruning = 100 - (size * 100.0) / len(fd.conditions())
            print("[Trained Unigram tagger:", end=' ')
            print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
                size, backoff, pruning))

######################################################################
#{ Tagger Classes
###################################################################### 
Example #18
Source File: tnt.py    From razzy-spinner with GNU General Public License v3.0
def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni  = FreqDist()
        self._bi   = ConditionalFreqDist()
        self._tri  = ConditionalFreqDist()
        self._wd   = ConditionalFreqDist()
        self._eos  = ConditionalFreqDist()
        self._l1   = 0.0
        self._l2   = 0.0
        self._l3   = 0.0
        self._N    = N
        self._C    = C
        self._T    = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0 
Example #19
Source File: tnt.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type  unk:(TaggerI)
        :param Trained: Indication that the POS tagger is trained or not
        :type  Trained: boolean
        :param N: Beam search degree (see above)
        :type  N:(int)
        :param C: Capitalization flag
        :type  C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''

        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        self._eos = ConditionalFreqDist()
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0