Python nltk.bigrams() Examples

The following are 19 code examples showing how to use nltk.bigrams(). They are extracted from open source projects; the project, author, file, and license are noted above each example.


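Before the project examples, here is a minimal sketch of nltk.bigrams() itself: it accepts any sequence of tokens and returns a generator of adjacent pairs, so wrap it in list() to materialize the result.

import nltk

tokens = "the quick brown fox".split()
print(list(nltk.bigrams(tokens)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]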

Example 1
Project: BERT   Author: yyht   File: utils.py    License: Apache License 2.0
def bigram_counts(word_list):
	bgs = nltk.bigrams(word_list)
	fdist = nltk.FreqDist(bgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
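A quick usage sketch for bigram_counts() above, on a toy token list (assumes import nltk and from collections import Counter, which the excerpt requires):

import nltk
from collections import Counter

counts = bigram_counts("to be or not to be".split())
print(counts[("to", "be")])  # 2 -- this pair occurs twice
print(counts[("be", "or")])  # 1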
Example 2
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#//////////////////////////////////////////////////////////// 
Example 3
Project: BERT   Author: yyht   File: loader.py    License: Apache License 2.0
def __init__(self, data_paths, batch_size, unroll, level):
    self.batch_size = batch_size
    self.unroll = unroll
    train_data, valid_data, test_data, token_to_id, frequencies, hist_freqs, train_tokens = load_text_data(
        data_paths, level)
    self.bg_counts = bigram_counts(train_tokens)
    self.tg_counts = trigram_counts(train_tokens)
    self.token_to_id = token_to_id
    # NOTE extends the vocabulary
    self.token_to_id['<_>'] = len(self.token_to_id)
    self.id_to_token = dict((v, k) for k, v in self.token_to_id.items())
    train_data = _reshape_data(train_data, batch_size, unroll)
    valid_data = _reshape_data(valid_data, batch_size, unroll)
    test_data = _reshape_data(test_data, batch_size, unroll)
    self.split_data = {"train": train_data, "valid": valid_data,
                       "test": test_data}
    self.frequencies = frequencies
    self.frequencies_cumsum = np.cumsum(frequencies)
    self.hist_freqs = hist_freqs
    self.hist_freqs_cumsum = np.cumsum(hist_freqs)
    self.continuations = build_continuations(self.bg_counts)
    bgs = nltk.bigrams(train_tokens)
    if level == "word":
      self.D1, self.D2, self.D3p, self.N1_lookup, self.N2_lookup, self.N3p_lookup = estimate_modkn_discounts(
          bgs) 
Example 4
Project: allennlp-semparse   Author: allenai   File: atis_world.py    License: Apache License 2.0
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
Example 5
Project: twitter-sentiment-analysis   Author: abdulfatir   File: maxent-nltk.py    License: MIT License
def get_data_from_file(file_name, isTrain=True):
    data = []
    with open(file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if isTrain:
                tag = line.split(',')[1]
                bag_of_words = line.split(',')[2].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[2].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            else:
                tag = '5'
                bag_of_words = line.split(',')[1].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[1].split()))
                    bag_of_words = bag_of_words+bag_of_words_bigram
            data.append((bag_of_words, tag))
    return data 
Example 6
Project: qb   Author: Pinafore   File: dataset.py    License: MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer 
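A usage sketch for the factory above; passing strip_qb_patterns=False sidesteps the module-level regex_pattern that the excerpt does not show (requires NLTK's punkt tokenizer data):

tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenize('the quick brown fox'))
# ['the', 'quick', 'brown', 'fox', 'the++quick', 'quick++brown', 'brown++fox']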
Example 7
Project: DeepLearn   Author: GauravBh1010tt   File: lex_sem_ft.py    License: MIT License
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1  
    total_count = 0      
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model
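
A usage sketch for train_bigram() above on two toy sentences (assumes from collections import defaultdict and from nltk import bigrams; None is nltk's default pad symbol):

model = train_bigram(['the cat sat', 'the dog sat'])
print(model['the']['cat'])  # 0.5 -- P(cat | the)
print(model[None]['the'])   # 1.0 -- both padded sentences open with 'the'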

#Total Sum Of Bigram Probability Of A Sentence [Returns Float]: 
Example 8
Project: BERT   Author: yyht   File: utils.py    License: Apache License 2.0
def estimate_modkn_discounts(ngrams):
	# Get counts
	counts = Counter(ngrams)
	N1 = float(len([k for k in counts if counts[k] == 1]))
	N2 = float(len([k for k in counts if counts[k] == 2]))
	N3 = float(len([k for k in counts if counts[k] == 3]))
	N4 = float(len([k for k in counts if counts[k] == 4]))
	N3p = float(len([k for k in counts if counts[k] >= 3]))

	# Estimate discounting parameters
	Y = N1 / (N1 + 2 * N2)
	D1 = 1 - 2 * Y * (N2 / N1)
	D2 = 2 - 3 * Y * (N3 / N2)
	D3p = 3 - 4 * Y * (N4 / N3)

	# FIXME(zxie) Assumes bigrams for now
	# Also compute N1/N2/N3p lookups (context -> n-grams with count 1/2/3+)
	N1_lookup = Counter()
	N2_lookup = Counter()
	N3p_lookup = Counter()
	for bg in counts:
		if counts[bg] == 1:
			N1_lookup[bg[0]] += 1
		elif counts[bg] == 2:
			N2_lookup[bg[0]] += 1
		else:
			N3p_lookup[bg[0]] += 1

	return D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup 
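A sketch of how the estimator above might be driven; the toy token stream only exercises the code, and meaningful discounts need a large corpus (the formulas also require N1, N2, and N3 to be nonzero):

import nltk

tokens = 'a b a b a c b c a b'.split()
D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup = estimate_modkn_discounts(
    nltk.bigrams(tokens))
print(D1, D2, D3p)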
Example 9
Project: kaggle-HomeDepot   Author: ChenglongChen   File: spelling_checker.py    License: MIT License
def get_valid_bigram_words(self, words):
        _words = []
        for i in nltk.bigrams(words):
            if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
                if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                    if (not self.skip_digit) or ((len(re.findall(re.compile(r"\d+"), i[0])) == 0) and (len(re.findall(re.compile(r"\d+"), i[1])) == 0)):
                        _words.append(" ".join(i))
        return _words 
Example 10
Project: DL-text   Author: GauravBh1010tt   File: lex_sem_ft.py    License: MIT License
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))

    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1  
    total_count = 0      
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

#Total Sum Of Bigram Probability Of A Sentence [Returns Float]: 
Example 11
Project: textkit   Author: learntextvis   File: bigrams.py    License: MIT License
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams] 
Example 12
Project: atap   Author: foxbook   File: sc_bigramcount.py    License: Apache License 2.0
def count_bigrams(corpus):
    text = corpus.map(itemgetter(1))
    sents = text.flatMap(nltk.sent_tokenize)
    sents = sents.map(lambda s: list(nltk.word_tokenize(s)))

    bigrams = sents.flatMap(lambda s: list(nltk.bigrams(s)))
    unique_bigrams = bigrams.distinct().count()
    print("unique bigrams: {}".format(unique_bigrams))

    bigram_counts = bigrams.map(lambda g: (g, 1)).reduceByKey(add).toDF()
    print(bigram_counts.head())


## Main functionality 
Example 13
Project: codenn   Author: sriniiyer   File: SVM.py    License: MIT License
def tokenize(text):
      # text = NB.remove_punctuation(text)
      try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
      except:
        text = text.encode('ascii', 'replace').strip().lower()
      word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]   # split punctuation but don't split single quotes in words like don't
      biword =  [b for b in nltk.bigrams(word)]
      triword =  [t for t in nltk.trigrams(word)]
      # word = [w for w in word if w not in stopwords.words('english')]
      return  word # triword 
Example 14
Project: CrisisLex   Author: sajao   File: adaptive_collect.py    License: MIT License
def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer(r'\#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            #ignore non-ascii characters
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english') and w not in set(['rt','http','amp'])) and len(w) in range(3, 16):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1]+" "+b[0] in lex or b[0]+" "+b[1] in lex:
            continue
        if b[1]+" "+b[0] in terms_fd:
            terms_fd.inc(b[1]+" "+b[0])
        else:
            terms_fd.inc(b[0]+" "+b[1])
    return True 
Example 15
Project: CrisisLex   Author: sajao   File: read.py    License: MIT License
def get_stemmed_terms_list(doc, stem_words_map = None, stem_bigrams_map = None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3,16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0)+1
                local_map[w_temp] = w
            word_list.append(w_temp)

    bigrams = list(nltk.bigrams(word_list))  # materialize so the return value is not an exhausted generator
    for b in bigrams:
        bigram_org = (local_map[b[0]],local_map[b[1]])
        if stem_bigrams_map is not None:
                if b not in stem_bigrams_map:
                    stem_bigrams_map[b] = dict()
                stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0)+1

    return word_list, bigrams

# keeps track of the exact form of the stemmed bigrams, not only that of the words 
Example 16
Project: CrisisLex   Author: sajao   File: read.py    License: MIT License
def get_tweet_terms(tweet, stem_map = None, bigrams_map = None):
    words, bigrams = get_stemmed_terms_list(tweet, stem_map, bigrams_map)
    filtered_words = [w for w in words if not w in stopwords.words('english')]

    bigrams = nltk.bigrams(filtered_words)
    words_set = set(filtered_words)
    terms_dict = {}

    for w in words_set:
        terms_dict['%s'%w] = 'y'

    for b in bigrams:
        terms_dict['%s %s'%(b[0],b[1])] = 'y'

    return terms_dict 
Example 17
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: util.py    License: Creative Commons Attribution Share Alike 4.0 International
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
            document
        )
    return features


# ////////////////////////////////////////////////////////////
# { Helper Functions
# //////////////////////////////////////////////////////////// 
Example 18
Project: BREDS   Author: davidsbatista   File: large-scale-evaluation-freebase.py    License: GNU Lesser General Public License v3.0
def extract_bigrams(text):
    tokens = word_tokenize(text)
    return [gram[0]+' '+gram[1] for gram in bigrams(tokens)]
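
A quick sketch of the helper above (assumes from nltk import word_tokenize, bigrams and the punkt tokenizer data, which the excerpt requires):

print(extract_bigrams('police prevented the march'))
# ['police prevented', 'prevented the', 'the march']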


# ########################################
# Estimations of sets and intersections #
# ######################################## 
Example 19
Project: allennlp-semparse   Author: allenai   File: atis_world.py    License: Apache License 2.0
def add_dates_to_number_linking_scores(
        self,
        number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
        current_tokenized_utterance: List[Token],
    ) -> None:

        month_reverse_lookup = {
            str(number): string for string, number in atis_tables.MONTH_NUMBERS.items()
        }
        day_reverse_lookup = {
            str(number): string for string, number in atis_tables.DAY_NUMBERS.items()
        }

        if self.dates:
            for date in self.dates:
                # Add the year linking score
                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == str(date.year):
                        entity_linking[token_index] = 1
                action = format_action(
                    nonterminal="year_number",
                    right_hand_side=str(date.year),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )
                number_linking_scores[action] = ("year_number", str(date.year), entity_linking)

                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == month_reverse_lookup[str(date.month)]:
                        entity_linking[token_index] = 1
                action = format_action(
                    nonterminal="month_number",
                    right_hand_side=str(date.month),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )

                number_linking_scores[action] = ("month_number", str(date.month), entity_linking)

                entity_linking = [0 for token in current_tokenized_utterance]
                for token_index, token in enumerate(current_tokenized_utterance):
                    if token.text == day_reverse_lookup[str(date.day)]:
                        entity_linking[token_index] = 1
                for bigram_index, bigram in enumerate(
                    bigrams([token.text for token in current_tokenized_utterance])
                ):
                    if " ".join(bigram) == day_reverse_lookup[str(date.day)]:
                        entity_linking[bigram_index] = 1
                        entity_linking[bigram_index + 1] = 1
                action = format_action(
                    nonterminal="day_number",
                    right_hand_side=str(date.day),
                    is_number=True,
                    keywords_to_uppercase=KEYWORDS,
                )
                number_linking_scores[action] = ("day_number", str(date.day), entity_linking)