Python nltk.trigrams() Examples

The following code examples show how to use nltk.trigrams(). They are drawn from open-source Python projects.
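
Before the project examples, here is a minimal standalone sketch of the call itself (the token list is invented for illustration): nltk.trigrams() takes any sequence of tokens and yields consecutive 3-tuples.

import nltk

tokens = 'the quick brown fox jumps'.split()
print(list(nltk.trigrams(tokens)))
# [('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps')]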

Example 1
Project: SyntaViz   Author: Comcast   File: filter_query.py    Apache License 2.0
def trigram_freqdist(inp='../data/combined_corpus', outp='../data/fdist_kn.pickle'):
    """
    It calculates the trigram frequency distributions for the 
    parliament speech dataset. This distribution is important
    for calculating the trigram probabilities with kneser-ney 
    smoothing. The distribution is saved in a pickle file.
    """
    with open(inp) as f:
        alltrigrams = []
        for i, aline in enumerate(f):
            aline = aline.strip().decode('utf8')
            aline = aline.encode('ascii', 'ignore')
            aline = aline.lower()
            tokens = ['<s>'] + aline.split() + ['<e>']
            alltrigrams += [(x, y, z) for x, y, z in nltk.trigrams(tokens)]
            if i % 10000 == 0:
                print(i)
        fdist = nltk.FreqDist(alltrigrams)
        cp.dump({'fdist': fdist}, open(outp, 'wb')) 
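
For context: the pickled distribution written above is what Example 20 later feeds to NLTK's Kneser-Ney estimator. A minimal loading sketch, assuming Python 3's pickle module and the default output path from the function above:

import pickle
import nltk

with open('../data/fdist_kn.pickle', 'rb') as f:  # default outp path assumed
    fdist = pickle.load(f)['fdist']
kn = nltk.probability.KneserNeyProbDist(fdist)
print(kn.prob(('<s>', 'turn', 'on')))  # probability of a hypothetical trigram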
Example 2
Project: qb   Author: Pinafore   File: dataset.py    MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer 
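
A hedged usage sketch of the returned closure. regex_pattern is a module-level name in the original dataset.py, so a placeholder is substituted here, and NLTK's 'punkt' tokenizer models are assumed to be installed:

import re

regex_pattern = r'\[.*?\]'  # placeholder; the real pattern is defined elsewhere in the qb project
tokenize = create_qb_tokenizer(unigrams=True, bigrams=True)
print(tokenize('For 10 points, name this author of Hamlet.'))
# unigram tokens plus 'w0++w1' bigram strings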
Example 3
Project: DeepLearn   Author: GauravBh1010tt   File: lex_sem_ft.py    MIT License
def train_trigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1  # count continuation w3 given the context (w1, w2)
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model  # maps (w1, w2) -> {w3: P(w3 | w1, w2)}
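
With the indexing fix above, the model maps a bigram context to a normalized distribution over the next word; a quick check on toy input:

model = train_trigram(['the cat sat', 'the cat ran'])
print(model[('the', 'cat')]['sat'])  # 0.5: 'sat' follows ('the', 'cat') in one of two sentences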

#Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
Example 4
Project: BERT   Author: yyht   File: utils.py    Apache License 2.0
def trigram_counts(word_list):
	tgs = nltk.trigrams(word_list)
	fdist = nltk.FreqDist(tgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
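
Since NLTK 3's FreqDist already subclasses collections.Counter, the copy loop above can be collapsed; an equivalent sketch under that assumption:

def trigram_counts(word_list):
    # FreqDist is a Counter subclass, so this copies the same counts
    return Counter(nltk.FreqDist(nltk.trigrams(word_list)))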
Example 5
Project: DL-text   Author: GauravBh1010tt   File: lex_sem_ft.py    MIT License
def train_trigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2, w3 in trigrams(sent, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1  # count continuation w3 given the context (w1, w2)
    for w1, w2 in model:
        total_count = float(sum(model[(w1, w2)].values()))
        for w3 in model[(w1, w2)]:
            model[(w1, w2)][w3] /= total_count
    return model  # maps (w1, w2) -> {w3: P(w3 | w1, w2)}

#Total Sum Of Trigram Probability Of A Sentence [Returns Float]:
Example 6
Project: aueb.twitter.sentiment   Author: nlpaueb   File: utilities.py    GNU General Public License v3.0
def getBigrams(l):
    b = []
    for x in l:
        b.append(list(bigrams(x)))

    return b

#calculate trigrams of every item of the list l 
Example 7
Project: aueb.twitter.sentiment   Author: nlpaueb   File: utilities.py    GNU General Public License v3.0
def getTrigrams(l):
    tr = []
    for x in l:
        tr.append(list(trigrams(x)))

    return tr

#calculate pos tag score 
Example 8
Project: aueb.twitter.sentiment   Author: nlpaueb   File: utilities.py    GNU General Public License v3.0
def posTrigramsScore(trigrams,category,pos_tags_trigrams,labels):
    
    #keep pos tags trigrams of specific category
    trigrams_category = subList(pos_tags_trigrams,labels,category)

    #initialize dictionary
    d = {}

    #calculate score for every trigram
    for trigram in trigrams:
        d[trigram] = score(trigram,category,trigrams_category,pos_tags_trigrams)

    return d

#calculate bigram's f1 score 
Example 9
Project: RealtimeSentimentAnalysis   Author: zHaytam   File: models.py    MIT License
def create_features(text, handle_negation=False):
    words = extract_words(text, handle_negation=handle_negation)
    bigrams = ['_'.join(bigram) for bigram in list(nltk.bigrams(words))]
    trigrams = ['_'.join(trigram) for trigram in list(nltk.trigrams(words))]
    features = {}

    for word in most_used_words:
        features[word] = word in words
    for bigram in most_used_bigrams:
        features[bigram] = bigram in bigrams
    for trigram in most_used_trigrams:
        features[trigram] = trigram in trigrams

    features['compound'] = sia.polarity_scores(text)['compound'] + 1
    return features 
Example 10
Project: codenn   Author: sriniiyer   File: SVM.py    MIT License
def tokenize(text):
    # text = NB.remove_punctuation(text)
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except:
        text = text.encode('ascii', 'replace').strip().lower()
    # split punctuation, but don't split the single quote in words like "don't"
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
Example 11
Project: Predicting-Cyberbulling-on-Twitter   Author: chantelmariediaz   File: Predicting+Cyberbullying+Twitter+ Code1.py    Apache License 2.0
def bag_of_trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)  
    trigrams = trigram_finder.nbest(score_fn, n)  
    return bag_of_words(trigrams)
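
A brief standalone sketch of the collocation finder these helpers wrap (the token list is invented; bag_of_words is a project helper and is skipped here):

from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures

words = 'do not feed the trolls and do not feed the bots'.split()
finder = TrigramCollocationFinder.from_words(words)
print(finder.nbest(TrigramAssocMeasures.chi_sq, 3))  # top 3 trigrams by chi-squared score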


# In[64]:

#Final list for modeling 
Example 12
Project: Predicting-Cyberbulling-on-Twitter   Author: chantelmariediaz   File: Predicting+Cyberbullying+Twitter+ Code1.py    Apache License 2.0
def trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    return trigrams

#Combined 
Example 13
Project: Predicting-Cyberbulling-on-Twitter   Author: chantelmariediaz   File: Predicting+Cyberbullying+Code+2.py    Apache License 2.0
def bag_of_trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)  
    trigrams = trigram_finder.nbest(score_fn, n)  
    return bag_of_words(trigrams)


# In[27]: 
Example 14
Project: Predicting-Cyberbulling-on-Twitter   Author: chantelmariediaz   File: Predicting+Cyberbullying+Code+2.py    Apache License 2.0
def trigrams_words(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    return trigrams

#bag of ngrams 
Example 15
Project: python-scripts   Author: Ventrosky   File: totd-project.py    GNU General Public License v3.0
def getTrigrams(tokens):
    trigrams1 = list(trigrams(tokens))
    return trigrams1

# 100 most frequent tokens
Example 16
Project: python-scripts   Author: Ventrosky   File: tw-tool.py    GNU General Public License v3.0
def usage():
    print('-'*80)
    print("Texts and Words Tool")
    print()
    print("Usage: tw-tool.py -t text_file1")
    print("-m --max_freq=char_num   - find the token (len >= char_num) with maximum frequency")
    print("-x --hapax               - print hapaxes and their distribution")
    print("-e --encoding=codec      - select character set, default 'utf-8'")
    print("-n --normalize           - remove curly quotes")
    print("-v --vocabolary          - information on word types")
    print("-g --growth=step         - growth of statistical indexes, increase text by [step]")
    print("-b --bigrams_infos       - bigram infos: F(u,v), F(u), F(v), P(v|u), P(u,v)")
    print("-i --mutual-information  - print the Mutual Information of each bigram")
    print("-r --trigrams_infos      - top 50 most frequent trigrams")
    print()
    print()
    print("Examples: ")
    print("tw-tool.py -t alice.txt -n -m 4 -x")
    print("tw-tool.py -t 1984.txt -e ascii -n -v")
    print("tw-tool.py -t hacethi.txt -e ascii -g 50")
    print("tw-tool.py -t rome.txt -b -i -r")
    print("")
    print('-'*80)
    sys.exit(0)

#  "normalize" spaces, hyphens, quotation marks # TO DO 
Example 17
Project: python-scripts   Author: Ventrosky   File: project2.py    GNU General Public License v3.0
def getTrigrams(tokens):
    trigrams1 = list(trigrams(tokens))
    return trigrams1

# 20 most frequent tokens
Example 18
Project: python-scripts   Author: Ventrosky   File: ngram.py    GNU General Public License v3.0
def makeTrigramFreq(tokens):
    tgs = filterNgrams(list(nltk.trigrams(tokens)))
    a,b,c = list(zip(*tgs))
    bgs = list(zip(a,b))
    cfd = nltk.ConditionalFreqDist(list(zip(bgs, c)))
    with open('freqTrigrams.json', 'w') as f:
        json.dump(remapKeys(cfd),f)
    return cfd 
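
The conditional frequency distribution built above maps a bigram context to counts of possible third words; a standalone query sketch that skips the project's filterNgrams/remapKeys helpers:

import nltk

tokens = 'to be or not to be'.split()
cfd = nltk.ConditionalFreqDist(((a, b), c) for a, b, c in nltk.trigrams(tokens))
print(cfd[('to', 'be')].most_common())  # [('or', 1)]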
Example 19
Project: SyntaViz   Author: Comcast   File: filter_query.py    Apache License 2.0
def kn_logprob(inp='../data/vrex_1week_long_text.queries',
               outp='../data/vrex_1week_with_probability.queries',
               fdfile='../data/fdist_kn.pickle',
               minlen=4,
               length_normalized=True):
    """
    Calculates the log probability of every query from the input file according 
    to the trigram distributions. It uses Kneser Ney smoothing.
    It produces a tab delimited file with the queries and the logprobabilities.
    :params fdfile: Trigram frequency distribution file (pickled)
    """
    print('Loading Trigram Distribution')
    fdist = cp.load(open(fdfile, 'rb'))['fdist']  # the pickle was written in binary mode
    print('Trigram Distribution Loaded')
    kn_pd = nltk.probability.KneserNeyProbDist(fdist)
    print('Kneser Ney Loaded')
    with open(inp) as f:
        with open(outp, 'wb') as fout:
            for i, aline in enumerate(f):
                jdat = json.loads(aline.strip())
                q = jdat['text'].lower().encode('ascii', 'ignore')
                tokens = ['<s>'] + nltk.word_tokenize(q) + ['<e>']
                if len(tokens) < minlen + 2:
                    continue
                logplist = []
                for x, y, z in nltk.trigrams(tokens):
                    lgp = kn_pd.logprob((x, y, z))
                    # OOV cases
                    if lgp == -1e300:
                        logplist.append(-50)
                    else:
                        logplist.append(lgp)
                # Length Normalization: Add points for longer sentences
                if length_normalized:
                    len_score = len(set(tokens)) * 8.5
                else:
                    len_score = 0

                logpsum = sum(logplist) + len_score
                fout.write(q + '\t' + str(logpsum) + '\n')
                fout.flush()
                if i % 100000 == 0:
                    print(i)