Python nltk.ngrams() Examples
The following are 30 code examples of nltk.ngrams(). The source project, file, and license are noted above each example.
You may also want to check out all available functions and classes of the nltk module.
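Before the project examples, here is a minimal, self-contained sketch of the call they all build on; the sample sentence is purely illustrative:

from nltk import ngrams

tokens = "new york is a city".split()
print(list(ngrams(tokens, 2)))
# [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city')]

nltk.ngrams() returns a generator of n-length tuples over any sequence of tokens; most of the examples below simply post-process those tuples (joining them into strings, counting them, or mapping them to indices).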
Example #1
Source File: data_helpers.py From coling2018_fake-news-challenge with Apache License 2.0 | 6 votes |
def extract_ngrams(text, stemmer, N):
    '''
    Parameter Arguments:
    text: 'New York is a city. It has a huge population.'
    N: Length of the n-grams e.g. 1, 2
    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    ngram_items = list(ngrams(sent2stokens(text, stemmer), N))
    for i, ngram in enumerate(ngram_items):
        ngram_str = ' '.join(ngram)
        ngrams_list.append(ngram_str)
    return ngrams_list
Example #2
Source File: word.py From flambe with MIT License | 6 votes |
def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example.

    Parameters
    ----------
    example : str
        The input example, as a string.

    Returns
    -------
    List[str]
        The output word tokens, as a list of strings

    """
    if self.exclude_stopwords and self.stop_words:
        example = ' '.join([word for word in word_tokenize(example)
                            if word not in self.stop_words])

    if isinstance(self.ngrams, List):
        ret: List[str] = []
        for i in self.ngrams:
            ret.extend(self._tokenize(example, i))
        return ret
    else:
        return NGramsTokenizer._tokenize(example, self.ngrams)
Example #3
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0 | 6 votes |
def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences ['New York is a city.', 'It has a huge population.']
    N: Length of the n-grams e.g. 1, 2
    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        # case where a trailing hyphen (e.g. 'magister-') has to be handled
        sent = re.sub(r'[-](,?\s)', '\\1', sent)
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for i, ngram in enumerate(ngram_items):
            ngram_str = ' '.join(ngram)
            ngrams_list.append(ngram_str)
    return ngrams_list
Example #4
Source File: word.py From flambe with MIT License | 6 votes |
def __init__(self, ngrams: Union[int, List[int]] = 1,
             exclude_stopwords: bool = False,
             stop_words: Optional[List] = None) -> None:
    """Initialize the NGramsTokenizer.

    Parameters
    ----------
    ngrams : Union[int, List[int]], optional
        The size (or sizes) of the n-grams to produce, by default 1
    exclude_stopwords : bool
        Whether to remove stopwords before tokenizing, by default False
    stop_words : Optional[List]
        A custom stopword list; by default None, in which case NLTK's
        English stopwords are downloaded and used

    """
    self.ngrams = ngrams
    self.exclude_stopwords = exclude_stopwords

    if self.exclude_stopwords:
        self.stop_words = stop_words
        if self.stop_words is None:
            nltk.download('stopwords', quiet=True)
            self.stop_words = stopwords.words('english')

    nltk.download('punkt', quiet=True)
Example #5
Source File: experiments.py From clickbait with MIT License | 6 votes |
def n_gram_analysis_simple(infile, gram, stop):
    ngram = dict()
    f = open(infile, "r")
    # f2 = codecs.open(outfile, "w+", "utf-8")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            # if stop:
            #     if w not in stops:
            #         if w in ngram:
            #             ngram[w] += 1
            #         else:
            #             ngram[w] = 1
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1
    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print(len(p))
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print(sen, cnt)
Example #6
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0 | 6 votes |
def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences ['New York is a city.', 'It has a huge population.']
    return: a list of noun phrases, events, named_entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'), ('it', 'has'), ('has', 'a'),
     ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        if nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        if nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        if nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list
Example #7
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict
Example #8
Source File: feedback_graph.py From acl2017-interactive_summarizer with Apache License 2.0 | 6 votes |
def add_sentences(self, sentences):
    """
    @type sentences: list[Sentence]
    """
    counter = self.counter
    G = self.G
    for sent in sentences:
        counter.update(ngrams(sent.tokens, self.N))
        G.add_nodes_from(sent.tokens)

    updated_edges = []
    for v in counter.elements():
        s = v[0]
        t = v[1]
        c = counter[v]
        updated_edges.append((s, t, c))
    G.add_weighted_edges_from(updated_edges)
Example #9
Source File: compute.py From ALaCarte with MIT License | 6 votes |
def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
    '''sliding window around n-grams in a document
    Args:
        strdoc: list of tokens (as strings)
        intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
        vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
        n: n in n-gram
        wndo2: half the window size
        unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
    Returns:
        generator over (n-gram, context window) pairs
    '''
    wndo2pn = wndo2 + n
    unk = unkgram is not None
    for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
        if ngram in vocabulary:
            yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
        elif unk:
            yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
Example #10
Source File: ngram.py From ALaCarte with MIT License | 6 votes |
def alabong(A, word_embeddings, lists, coocs, counts):
    n = len(lists)

    def represent(documents):
        output = []
        docs = tokenize(doc.lower() for doc in documents)
        for k, kgramlist, kgramcooc, kgramcount in zip(range(1, n+1), lists, coocs, counts):
            kgrams = [list(nltk.ngrams(doc, k)) for doc in docs]
            vocab = {kgram for doc in kgrams for kgram in doc}
            where = np.array([i for i, kgram in enumerate(kgramlist)
                              if kgram in vocab and kgramcount[i]])
            bong = docs2bofs(kgrams, vocabulary=kgramlist, format='csc')
            output.append(np.zeros((len(documents), word_embeddings.shape[1]), dtype=FLOAT))
            for offset in range(0, where.shape[0], MAXSLICE):
                indices = where[offset:offset+MAXSLICE]
                if k > 1:
                    vecs = normalize(A.predict(kgramcooc[indices].dot(word_embeddings)
                                               / kgramcount[indices, None])) / k
                else:
                    vecs = normalize(word_embeddings[indices])
                output[-1] += bong[:, indices].dot(vecs)
        return np.hstack(output)

    return represent, None, True
Example #11
Source File: atis_world.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores
Example #12
Source File: mention_extraction.py From starsem2018-entity-linking with Apache License 2.0 | 6 votes |
def parse(self, tagged_text, ngram_len=-1):
    ngrams = []
    if len(tagged_text) == 0:
        return ngrams
    if tagged_text[0]['pos'] in self._exclude_if_first:
        tagged_text = tagged_text[1:]
    if ngram_len == -1:
        for l in range(len(tagged_text), 0, -1):
            ngrams += list(nltk.ngrams(tagged_text, l))
    else:
        ngrams += list(nltk.ngrams(tagged_text, ngram_len))
    ngrams += [n[:-1] for n in ngrams if len(n) > 1 and n[-1]['pos'] in {"NN", "NNS"}]
    ngrams += [n[1:] for n in ngrams if len(n) > 1 and n[0]['pos'] in {"NN", "NNS"}]
    ngrams = [n for n in ngrams
              if len({el[i] for el in n for i in {'pos', 'ner'}} & self._exclude_pos) == 0
              and (len(n) == 1
                   or (n[0]['pos'] not in self._exclude_prefix
                       and n[0]['word'].lower() not in utils.stop_words_en
                       and n[-1]['pos'] not in self._exclude_suffix
                       and n[-1]['word'].lower() not in utils.stop_words_en))
              and not (len(n) == 1 and (n[0]['pos'] in self._exclude_alone
                                        or n[0]['word'].lower() in utils.stop_words_en))]
    return ngrams
Example #13
Source File: lang_model_2.py From jakaton_feminicidios with MIT License | 6 votes |
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
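A possible way to build this model — assuming the class is named LangModel, as the recursive backoff call above suggests, and that NLTK's 'punkt' tokenizer data is installed:

lm = LangModel(order=2, alpha=0.4, sentences=["the cat sat", "the cat ran"])
lm.ngramFD.most_common(2)
# -> [(('the', 'cat'), 2), (('cat', 'sat'), 1)]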
Example #14
Source File: ngrams.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def get_word_skipgram_distribution(input_buffer, n=2, k=2, encoding="utf-8",
                                   tokenize_method=nltk.word_tokenize):
    """
    Get distribution of skipgrams with given n and k values from input_buffer.
    :param input_buffer:
    :param n:
    :param k:
    :param encoding:
    :param tokenize_method:
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    ngrams = nltk.ngrams(tokenize_method(input_buffer), n=n)
    return nltk.util.skipgrams(ngrams, n, k)
Example #15
Source File: eval.py From SARC with MIT License | 6 votes |
def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', help='pol or main', type=str)
    parser.add_argument('-n', '--n', default=1, help='Number of grams', type=int)
    parser.add_argument('--min_count', default=1, help='Min count', type=int)
    parser.add_argument('--embedding', default=CCGLOVE, help='embedding file', type=str)
    parser.add_argument('--weights', default=None,
                        help='weights to use for ngrams (e.g. sif, None)', type=str)
    parser.add_argument('-norm', '--normalize', action='store_true', help='Normalize vectors')
    parser.add_argument('-l', '--lower', action='store_true',
                        help='Whether or not to lowercase text')
    parser.add_argument('-e', '--embed', action='store_true',
                        help='Use embeddings instead of bong')
    return parser.parse_args()
Example #16
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_time_range_start_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    late_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "late"
    }

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(" ".join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict
Example #17
Source File: utils.py From Transferable-E2E-ABSA with MIT License | 5 votes |
def set_wid(dataset, vocab, win=1):
    """
    set wid field for the dataset
    :param dataset: dataset
    :param vocab: vocabulary
    :param win: context window size, for window-based input, should be an odd number
    :return: dataset with field wid
    """
    n_records = len(dataset)
    for i in range(n_records):
        words = dataset[i]['words']
        lm_labels = []
        # set labels for the auxiliary language modeling task
        for w in words:
            lm_labels.append(vocab[w])
        dataset[i]['lm_labels'] = list(lm_labels)
        n_padded_words = win // 2
        pad_left = ['PADDING' for _ in range(n_padded_words)]
        pad_right = ['PADDING' for _ in range(n_padded_words)]
        padded_words = pad_left + words + pad_right
        # the window-based input
        win_input = list(ngrams(padded_words, win))
        assert len(win_input) == len(words)
        n_grams = []
        for t in win_input:
            n_grams.append(t)
        wids = [[vocab[w] for w in ngram] for ngram in n_grams]
        dataset[i]['wids'] = list(wids)
    return dataset
Example #18
Source File: extract_statistical_features.py From Sarcasm-Detection with MIT License | 5 votes |
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens
                      if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features

# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis,
# along with the ratio of positive to negative emojis and negative to neutral.
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and total sentiment words.
# Also using the VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features)
Example #19
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 5 votes |
def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input) - n + 1):
        output.append(input[i:i + n])
    return output
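Note that this helper shadows nltk.ngrams and returns token slices (lists) rather than tuples. A quick illustration, assuming the function above is in scope:

ngrams('new york is a city', 2)
# -> [['new', 'york'], ['york', 'is'], ['is', 'a'], ['a', 'city']]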
Example #20
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def prune_ngrams(ngrams, stoplist, N=2):
    pruned_list = []
    for ngram in ngrams:
        items = ngram.split(' ')
        i = 0
        for item in items:
            if item in stoplist:
                i += 1
        if i < N:
            pruned_list.append(ngram)
    return pruned_list
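A small illustration of the pruning rule (an n-gram is kept as long as fewer than N of its words appear in the stoplist), assuming the function above is in scope:

prune_ngrams(['new york', 'of the', 'the city'], {'of', 'the', 'a'}, N=2)
# -> ['new york', 'the city']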
Example #21
Source File: preprocessing.py From tensorflow-nlp-examples with MIT License | 5 votes |
def generate_ngrams(self, words, n=7):
    res = []
    seqlen = len(words)
    for i in range(1, n + 1):
        for ngram in ngrams(range(seqlen), i):
            l, r = ngram[0], ngram[-1] + 1
            res.append((l, r))
    return res
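This method returns (start, end) span indices over token positions rather than the n-grams themselves. For a three-token input and n=2, a call on an instance would produce roughly:

# self.generate_ngrams(['New', 'York', 'City'], n=2)
# -> [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3)]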
Example #22
Source File: distinct.py From cotk with Apache License 2.0 | 5 votes |
def close(self):
    '''
    Returns:
        (dict): Return a dict which contains

        * **distinct**: the distinct n-gram ratio.
        * **distinct hashvalue**: hash value for the distinct metric; the same hash value
          stands for the same evaluation settings.
    '''
    result = super().close()
    if not self.hyps:
        raise RuntimeError("The metric has not been forwarded data correctly.")

    if self.sample > len(self.hyps):
        sample = len(self.hyps)
    else:
        sample = self.sample
    self._hash_ordered_data(sample)

    rng_state = random.getstate()
    random.seed(self.seed)
    random.shuffle(self.hyps)
    random.setstate(rng_state)
    self.hyps = self.hyps[:sample]

    if self.tokenizer:
        self._do_tokenize()

    if "unk" in self.dataloader.get_special_tokens_mapping():
        self.hyps = replace_unk(self.hyps,
                                unk=self.dataloader.get_special_tokens_mapping().get("unk", None))

    ngram_list = list(chain(*[ngrams(sentence, self.ngram, pad_left=True, pad_right=True)
                              for sentence in self.hyps]))
    ngram_set = set(ngram_list)

    result.update({"distinct": len(ngram_set) / len(ngram_list),
                   "distinct hashvalue": self._hashvalue()})
    return result
Example #23
Source File: word.py From flambe with MIT License | 5 votes |
def _tokenize(example: str, n: int) -> List[str]:
    """Tokenize an input example using ngrams.

    """
    return list(" ".join(x) if len(x) > 1 else x[0]
                for x in ngrams(word_tokenize(example), n))
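With NLTK's 'punkt' data installed, the helper above would behave roughly as follows:

# _tokenize("the quick brown fox", 2)
# -> ['the quick', 'quick brown', 'brown fox']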
Example #24
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License | 5 votes |
def get_ngram_list(tknzr, text, n):
    tokens = tknzr.tokenize(text)
    tokens = [t for t in tokens if not t.startswith('#')]
    tokens = [t for t in tokens if not t.startswith('@')]
    ngram_list = [gram for gram in ngrams(tokens, n)]
    return ngram_list
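A possible call, assuming an NLTK TweetTokenizer (the tokenizer actually used by the project is not shown here):

from nltk.tokenize import TweetTokenizer

get_ngram_list(TweetTokenizer(), 'nice weather today #sunny', 2)
# -> [('nice', 'weather'), ('weather', 'today')]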
Example #25
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0 | 5 votes |
def _create_hash_features(self, df):
    def get_word_ngrams(sequence, n=3):
        return [' '.join(ngram) for ngram in ngrams(sequence, n)]

    def get_character_ngrams(sequence, n=3):
        sequence = ' '.join(sequence)
        return [sequence[i:i+n] for i in range(len(sequence)-n+1)]

    def calculate_simhash_distance(sequence1, sequence2):
        return Simhash(sequence1).distance(Simhash(sequence2))

    def calculate_all_simhash(row):
        q1, q2 = row['splited_spn_1'], row['splited_spn_2']
        simhash_distance = calculate_simhash_distance(q1, q2)

        q1, q2 = get_word_ngrams(q1, 2), get_word_ngrams(q2, 2)
        simhash_distance_2gram = calculate_simhash_distance(q1, q2)

        q1, q2 = get_word_ngrams(q1, 3), get_word_ngrams(q2, 3)
        simhash_distance_3gram = calculate_simhash_distance(q1, q2)

        q1, q2 = get_character_ngrams(q1, 2), get_character_ngrams(q2, 2)
        simhash_distance_ch_2gram = calculate_simhash_distance(q1, q2)

        q1, q2 = get_character_ngrams(q1, 3), get_character_ngrams(q2, 3)
        simhash_distance_ch_3gram = calculate_simhash_distance(q1, q2)

        return '{}:{}:{}:{}:{}'.format(simhash_distance, simhash_distance_2gram,
                                       simhash_distance_3gram, simhash_distance_ch_2gram,
                                       simhash_distance_ch_3gram)

    df['sim_hash'] = df.apply(calculate_all_simhash, axis=1, raw=True)
    df['simhash_distance'] = df['sim_hash'].apply(lambda x: float(x.split(':')[0]))
    df['simhash_distance_2gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[1]))
    df['simhash_distance_3gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[2]))
    df['simhash_distance_ch_2gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[3]))
    df['simhash_distance_ch_3gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[4]))
Example #26
Source File: distinct_metric.py From DiPS with Apache License 2.0 | 5 votes |
def ngram_toks(sents, n=1):
    ntoks = []
    for sent in sents:
        ntok = list(ngrams(sent.split(), n))
        newtoks = [tok for tok in ntok]
        ntoks += newtoks
    return ntoks
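A quick check of the behaviour, assuming the function above is in scope:

ngram_toks(['a b c', 'b c d'], n=2)
# -> [('a', 'b'), ('b', 'c'), ('b', 'c'), ('c', 'd')]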
Example #27
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def extract_ngrams(sentences, stoplist, stemmer, language, n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        n (int): the number of words for ngrams, defaults to 2
    """
    concepts = []
    for i, sentence in enumerate(sentences):
        # for each ngram of words
        tokens = sent2tokens(sentence, language)
        for j in range(len(tokens)-(n-1)):
            # initialize ngram container
            ngram = []
            # for each token of the ngram
            for k in range(j, j+n):
                ngram.append(tokens[k].lower())
            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue
            # do not consider ngrams composed of only stopwords
            stops = [t for t in ngram if t in stoplist]
            if len(stops) == len(ngram):
                continue
            # stem the ngram
            ngram = [stemmer.stem(t) for t in ngram]
            # add the ngram to the concepts
            concepts.append(' '.join(ngram))
    return concepts
Example #28
Source File: submodular_funcs.py From DiPS with Apache License 2.0 | 5 votes |
def ngram_toks(sents, n=1):
    ntoks = []
    for sent in sents:
        ntok = list(ngrams(sent.split(), n))
        newtoks = [tok for tok in ntok]
        ntoks += newtoks
    return ntoks
Example #29
Source File: document_analysis.py From semanticRetrievalMRS with MIT License | 5 votes |
def get_ngrams(terms, poss=None, n=1, included_tags=None, as_strings=True):
    """Returns a list of all ngrams from length 1 to n.
    """
    ngrams = [(s, e + 1)
              for s in range(len(terms))
              for e in range(s, min(s + n, len(terms)))]

    if poss is not None and included_tags is not None:
        # We do filtering according to pos.
        # ngrampos = [(s, e + 1)
        #             for s in range(len(poss))
        #             for e in range(s, min(s + n, len(poss)))]
        filtered_ngram = []
        for (s, e) in ngrams:
            if any([poss[i] in included_tags for i in range(s, e)]):
                filtered_ngram.append((s, e))
        ngrams = filtered_ngram

    # Concatenate into strings
    if as_strings:
        ngrams = ['{}'.format(' '.join(terms[s:e])) for (s, e) in ngrams]

    return ngrams

# Open class words    Closed class words    Other
# ADJ                 ADP                   PUNCT
# ADV                 AUX                   SYM
# INTJ                CCONJ                 X
# NOUN                DET
# PROPN               NUM
# VERB                PART
#                     PRON
#                     SCONJ
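With as_strings=True (the default) the (start, end) spans are joined back into strings, so a three-token input with n=2 yields all unigrams and bigrams, assuming the function above is in scope:

# get_ngrams(['new', 'york', 'city'], n=2)
# -> ['new', 'new york', 'york', 'york city', 'city']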
Example #30
Source File: extract_ml_features.py From Sarcasm-Detection with MIT License | 5 votes |
def get_ngrams(tokens, n, syntactic_data=False):
    if len(n) < 1:
        return {}
    if not syntactic_data:
        filtered = []
        stopwords = data_proc.get_stopwords_list()
        for t in tokens:
            if t not in stopwords and t.isalnum():
                filtered.append(t)
        tokens = filtered
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = str(i) + '-gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features

# Get sentiment features -- a total of 16 features derived
# Emoji features: a count of the positive, negative and neutral emojis,
# along with the ratio of positive to negative emojis and negative to neutral.
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and total sentiment words.
# Also using the VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features)