Python nltk.bigrams() Examples
The following are 19 code examples showing how to use nltk.bigrams(). These examples are extracted from open source projects. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the link above each example.
You may also check out the related API usage on the sidebar, browse all available functions/classes of the module nltk, or try the search function.
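Before the project examples, here is a minimal sketch of what nltk.bigrams() does on its own: it takes any sequence of tokens and yields adjacent pairs lazily. The sample sentence and the FreqDist follow-up below are illustrative only and are not taken from any of the projects listed.

import nltk

# nltk.bigrams() turns a token sequence into a lazy generator of adjacent pairs.
tokens = "the quick brown fox jumps over the lazy dog".split()
pairs = list(nltk.bigrams(tokens))
print(pairs[:3])  # [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]

# A common follow-up, used by several examples below, is counting the pairs with FreqDist.
fdist = nltk.FreqDist(nltk.bigrams(tokens))
print(fdist.most_common(2))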
Example 1
Project: BERT Author: yyht File: utils.py License: Apache License 2.0 | 7 votes |
def bigram_counts(word_list):
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
Example 2
Project: razzy-spinner Author: rafasashi File: util.py License: GNU General Public License v3.0 | 6 votes |
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features

#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
Example 3
Project: BERT Author: yyht File: loader.py License: Apache License 2.0 | 6 votes |
def __init__(self, data_paths, batch_size, unroll, level):
    self.batch_size = batch_size
    self.unroll = unroll
    train_data, valid_data, test_data, token_to_id, frequencies, hist_freqs, train_tokens = load_text_data(
        data_paths, level)
    self.bg_counts = bigram_counts(train_tokens)
    self.tg_counts = trigram_counts(train_tokens)
    self.token_to_id = token_to_id
    # NOTE extends the vocabulary
    self.token_to_id['<_>'] = len(self.token_to_id)
    self.id_to_token = dict((v, k) for k, v in self.token_to_id.iteritems())
    train_data = _reshape_data(train_data, batch_size, unroll)
    valid_data = _reshape_data(valid_data, batch_size, unroll)
    test_data = _reshape_data(test_data, batch_size, unroll)
    self.split_data = {"train": train_data, "valid": valid_data, "test": test_data}
    self.frequencies = frequencies
    self.frequencies_cumsum = np.cumsum(frequencies)
    self.hist_freqs = hist_freqs
    self.hist_freqs_cumsum = np.cumsum(hist_freqs)
    self.continuations = build_continuations(self.bg_counts)
    bgs = nltk.bigrams(train_tokens)
    if level == "word":
        self.D1, self.D2, self.D3p, self.N1_lookup, self.N2_lookup, self.N3p_lookup = estimate_modkn_discounts(
            bgs)
Example 4
Project: allennlp-semparse Author: allenai File: atis_world.py License: Apache License 2.0 | 6 votes |
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores
Example 5
Project: twitter-sentiment-analysis Author: abdulfatir File: maxent-nltk.py License: MIT License | 6 votes |
def get_data_from_file(file_name, isTrain=True):
    data = []
    with open(file_name, 'r') as csv:
        lines = csv.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if isTrain:
                tag = line.split(',')[1]
                bag_of_words = line.split(',')[2].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[2].split()))
                    bag_of_words = bag_of_words + bag_of_words_bigram
            else:
                tag = '5'
                bag_of_words = line.split(',')[1].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[1].split()))
                    bag_of_words = bag_of_words + bag_of_words_bigram
            data.append((bag_of_words, tag))
    return data
Example 6
Project: qb Author: Pinafore File: dataset.py License: MIT License | 5 votes |
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                '\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])
            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams
    return tokenizer
Example 7
Project: DeepLearn Author: GauravBh1010tt File: lex_sem_ft.py License: MIT License | 5 votes |
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1
    total_count = 0
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

# Total Sum Of Bigram Probability Of A Sentence [Returns Float]:
Example 8
Project: BERT Author: yyht File: utils.py License: Apache License 2.0 | 5 votes |
def estimate_modkn_discounts(ngrams):
    # Get counts
    counts = Counter(ngrams)
    N1 = float(len([k for k in counts if counts[k] == 1]))
    N2 = float(len([k for k in counts if counts[k] == 2]))
    N3 = float(len([k for k in counts if counts[k] == 3]))
    N4 = float(len([k for k in counts if counts[k] == 4]))
    N3p = float(len([k for k in counts if counts[k] >= 3]))

    # Estimate discounting parameters
    Y = N1 / (N1 + 2 * N2)
    D1 = 1 - 2 * Y * (N2 / N1)
    D2 = 2 - 3 * Y * (N3 / N2)
    D3p = 3 - 4 * Y * (N4 / N3)

    # FIXME(zxie) Assumes bigrams for now
    # Also compute N1/N2/N3p lookups (context -> n-grams with count 1/2/3+)
    N1_lookup = Counter()
    N2_lookup = Counter()
    N3p_lookup = Counter()
    for bg in counts:
        if counts[bg] == 1:
            N1_lookup[bg[0]] += 1
        elif counts[bg] == 2:
            N2_lookup[bg[0]] += 1
        else:
            N3p_lookup[bg[0]] += 1

    return D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup
Example 9
Project: kaggle-HomeDepot Author: ChenglongChen File: spelling_checker.py License: MIT License | 5 votes |
def get_valid_bigram_words(self, words):
    _words = []
    for i in nltk.bigrams(words):
        if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
            if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                if (not self.skip_digit) or ((len(re.findall(re.compile("\d+"), i[0])) == 0) and (len(re.findall(re.compile("\d+"), i[1])) == 0)):
                    _words.append(" ".join(i))
    return _words
Example 10
Project: DL-text Author: GauravBh1010tt File: lex_sem_ft.py License: MIT License | 5 votes |
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1
    total_count = 0
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

# Total Sum Of Bigram Probability Of A Sentence [Returns Float]:
Example 11
Project: textkit Author: learntextvis File: bigrams.py License: MIT License | 5 votes |
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)

    [output(sep.join(bigram)) for bigram in bigrams]
Example 12
Project: atap Author: foxbook File: sc_bigramcount.py License: Apache License 2.0 | 5 votes |
def count_bigrams(corpus):
    text = corpus.map(itemgetter(1))
    sents = text.flatMap(nltk.sent_tokenize)
    sents = sents.map(lambda s: list(nltk.word_tokenize(s)))
    bigrams = sents.flatMap(lambda s: list(nltk.bigrams(s)))

    unique_bigrams = bigrams.distinct().count()
    print("unique bigrams: {}".format(unique_bigrams))

    bigram_counts = bigrams.map(lambda g: (g, 1)).reduceByKey(add).toDF()
    print(bigram_counts.head())

## Main functionality
Example 13
Project: codenn Author: sriniiyer File: SVM.py License: MIT License | 5 votes |
def tokenize(text):
    # text = NB.remove_punctuation(text)
    try:
        text = text.decode('utf-8').encode('ascii', 'replace').strip().lower()
    except:
        text = text.encode('ascii', 'replace').strip().lower()
    # split punctuation, but don't split single quotes inside words like "don't"
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
Example 14
Project: CrisisLex Author: sajao File: adaptive_collect.py License: MIT License | 5 votes |
def update_terms_stats(terms_fd, json_tweet, lex):
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer('\#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            # ignore non-ascii characters
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english') and w not in set(['rt', 'http', 'amp'])) and len(w) in range(3, 16):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1] + " " + b[0] in lex or b[0] + " " + b[1] in lex:
            continue
        if b[1] + " " + b[0] in terms_fd:
            terms_fd.inc(b[1] + " " + b[0])
        else:
            terms_fd.inc(b[0] + " " + b[1])
    return True
Example 15
Project: CrisisLex Author: sajao File: read.py License: MIT License | 5 votes |
def get_stemmed_terms_list(doc, stem_words_map=None, stem_bigrams_map=None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3, 16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem_word(w)
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0) + 1
            local_map[w_temp] = w
            word_list.append(w_temp)

    bigrams = nltk.bigrams(word_list)
    for b in bigrams:
        bigram_org = (local_map[b[0]], local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0) + 1

    return word_list, bigrams

# keeps track of the exact form of the stemmed bigrams, not only the one of the words
Example 16
Project: CrisisLex Author: sajao File: read.py License: MIT License | 5 votes |
def get_tweet_terms(tweet, stem_map=None, bigrams_map=None):
    words, bigrams = get_stemmed_terms_list(tweet, stem_map, bigrams_map)
    filtered_words = [w for w in words if not w in stopwords.words('english')]
    bigrams = nltk.bigrams(filtered_words)
    words_set = set(filtered_words)

    terms_dict = {}
    for w in words_set:
        terms_dict['%s' % w] = 'y'
    for b in bigrams:
        terms_dict['%s %s' % (b[0], b[1])] = 'y'
    return terms_dict
Example 17
Project: V1EngineeringInc-Docs Author: V1EngineeringInc File: util.py License: Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(
            document
        )
    return features

# ////////////////////////////////////////////////////////////
# { Helper Functions
# ////////////////////////////////////////////////////////////
Example 18
Project: BREDS Author: davidsbatista File: large-scale-evaluation-freebase.py License: GNU Lesser General Public License v3.0 | 5 votes |
def extract_bigrams(text):
    tokens = word_tokenize(text)
    return [gram[0] + ' ' + gram[1] for gram in bigrams(tokens)]

# ########################################
# Estimations of sets and intersections  #
# ########################################
Example 19
Project: allennlp-semparse Author: allenai File: atis_world.py License: Apache License 2.0 | 4 votes |
def add_dates_to_number_linking_scores(
    self,
    number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
    current_tokenized_utterance: List[Token],
) -> None:

    month_reverse_lookup = {
        str(number): string for string, number in atis_tables.MONTH_NUMBERS.items()
    }
    day_reverse_lookup = {
        str(number): string for string, number in atis_tables.DAY_NUMBERS.items()
    }

    if self.dates:
        for date in self.dates:
            # Add the year linking score
            entity_linking = [0 for token in current_tokenized_utterance]
            for token_index, token in enumerate(current_tokenized_utterance):
                if token.text == str(date.year):
                    entity_linking[token_index] = 1
            action = format_action(
                nonterminal="year_number",
                right_hand_side=str(date.year),
                is_number=True,
                keywords_to_uppercase=KEYWORDS,
            )
            number_linking_scores[action] = ("year_number", str(date.year), entity_linking)

            entity_linking = [0 for token in current_tokenized_utterance]
            for token_index, token in enumerate(current_tokenized_utterance):
                if token.text == month_reverse_lookup[str(date.month)]:
                    entity_linking[token_index] = 1
            action = format_action(
                nonterminal="month_number",
                right_hand_side=str(date.month),
                is_number=True,
                keywords_to_uppercase=KEYWORDS,
            )
            number_linking_scores[action] = ("month_number", str(date.month), entity_linking)

            entity_linking = [0 for token in current_tokenized_utterance]
            for token_index, token in enumerate(current_tokenized_utterance):
                if token.text == day_reverse_lookup[str(date.day)]:
                    entity_linking[token_index] = 1
            for bigram_index, bigram in enumerate(
                bigrams([token.text for token in current_tokenized_utterance])
            ):
                if " ".join(bigram) == day_reverse_lookup[str(date.day)]:
                    entity_linking[bigram_index] = 1
                    entity_linking[bigram_index + 1] = 1
            action = format_action(
                nonterminal="day_number",
                right_hand_side=str(date.day),
                is_number=True,
                keywords_to_uppercase=KEYWORDS,
            )
            number_linking_scores[action] = ("day_number", str(date.day), entity_linking)