Python nltk.ngrams() Examples
The following are 30 code examples of nltk.ngrams(), drawn from open-source projects. The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the nltk module.
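
Before the project code, here is a minimal sketch of the function itself: nltk.ngrams(sequence, n) lazily yields tuples of n consecutive items from any token sequence.

import nltk

tokens = "new york is a city".split()
print(list(nltk.ngrams(tokens, 2)))
# [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city')]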

Example #1
Source File: eval.py From SARC with MIT License

def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', help='pol or main', type=str)
    parser.add_argument('-n', '--n', default=1, help='Number of grams', type=int)
    parser.add_argument('--min_count', default=1, help='Min count', type=int)
    parser.add_argument('--embedding', default=CCGLOVE, help='embedding file', type=str)
    parser.add_argument('--weights', default=None, help='weights to use for ngrams (e.g. sif, None)', type=str)
    parser.add_argument('-norm', '--normalize', action='store_true', help='Normalize vectors')
    parser.add_argument('-l', '--lower', action='store_true', help='Whether or not to lowercase text')
    parser.add_argument('-e', '--embed', action='store_true', help='Use embeddings instead of bong')
    return parser.parse_args()
Example #2
Source File: experiments.py From clickbait with MIT License

def n_gram_analysis_simple(infile, gram, stop):
    ngram = dict()
    f = open(infile, "r")
    # f2 = codecs.open(outfile, "w+", "utf-8")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            # if stop:
            #     if w not in stops:
            #         if w in ngram:
            #             ngram[w] += 1
            #         else:
            #             ngram[w] = 1
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1
    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print(len(p))
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print(sen, cnt)
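
The hand-rolled dictionary counting above can also be expressed with nltk.FreqDist (which Example #16 below uses as well). A minimal sketch, assuming a hypothetical headlines.txt with one document per line:

import nltk

with open("headlines.txt") as f:
    counts = nltk.FreqDist(gram for line in f for gram in nltk.ngrams(line.split(), 2))
print(counts.most_common(10))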
Example #3
Source File: compute.py From ALaCarte with MIT License

def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
    '''sliding window around n-grams in a document
    Args:
        strdoc: list of tokens (as strings)
        intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
        vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
        n: n in n-gram
        wndo2: half the window size
        unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
    Returns:
        (n-gram, int generator) generator over (n-gram, context window) pairs
    '''
    wndo2pn = wndo2 + n
    unk = unkgram is not None
    for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
        if ngram in vocabulary:
            yield ngram, chain(intdoc[max(i - wndo2, 0):i], intdoc[i + n:i + wndo2pn])
        elif unk:
            yield unkgram, chain(intdoc[max(i - wndo2, 0):i], intdoc[i + n:i + wndo2pn])
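
A toy call of the generator above (not from the project), assuming the function and its imports (nltk, itertools.chain) are in scope; it shows the half-window of size 2 around each in-vocabulary unigram:

doc = ['the', 'cat', 'sat', 'on', 'mat']
ids = [0, 1, 2, 3, 4]
vocab = {('cat',), ('mat',)}
for gram, window in ngram_context(doc, ids, vocab, n=1, wndo2=2):
    print(gram, list(window))
# ('cat',) [0, 2, 3]
# ('mat',) [2, 3]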
Example #4
Source File: ngram.py From ALaCarte with MIT License

def alabong(A, word_embeddings, lists, coocs, counts):
    n = len(lists)

    def represent(documents):
        output = []
        docs = tokenize(doc.lower() for doc in documents)
        for k, kgramlist, kgramcooc, kgramcount in zip(range(1, n + 1), lists, coocs, counts):
            kgrams = [list(nltk.ngrams(doc, k)) for doc in docs]
            vocab = {kgram for doc in kgrams for kgram in doc}
            where = np.array([i for i, kgram in enumerate(kgramlist) if kgram in vocab and kgramcount[i]])
            bong = docs2bofs(kgrams, vocabulary=kgramlist, format='csc')
            output.append(np.zeros((len(documents), word_embeddings.shape[1]), dtype=FLOAT))
            for offset in range(0, where.shape[0], MAXSLICE):
                indices = where[offset:offset + MAXSLICE]
                if k > 1:
                    vecs = normalize(A.predict(kgramcooc[indices].dot(word_embeddings) / kgramcount[indices, None])) / k
                else:
                    vecs = normalize(word_embeddings[indices])
                output[-1] += bong[:, indices].dot(vecs)
        return np.hstack(output)

    return represent, None, True
Example #5
Source File: ngrams.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

def get_word_skipgram_distribution(input_buffer, n=2, k=2, encoding="utf-8",
                                   tokenize_method=nltk.word_tokenize):
    """
    Get distribution of skipgrams with given n and k values from input_buffer.
    :param input_buffer:
    :param n:
    :param k:
    :param encoding:
    :param tokenize_method:
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    ngrams = nltk.ngrams(tokenize_method(input_buffer), n=n)
    return nltk.util.skipgrams(ngrams, n, k)
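
Note that the function above feeds the n-gram stream into nltk.util.skipgrams, so the skip-grams are formed over n-gram tuples rather than over raw words. On plain tokens, skipgrams(sequence, n, k) yields n-grams that may skip up to k tokens; a minimal sketch:

from nltk.util import skipgrams

print(list(skipgrams("insurgents killed in ongoing fighting".split(), 2, 2))[:4])
# [('insurgents', 'killed'), ('insurgents', 'in'), ('insurgents', 'ongoing'), ('killed', 'in')]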
Example #6
Source File: data_helpers.py From coling2018_fake-news-challenge with Apache License 2.0

def extract_ngrams(text, stemmer, N):
    '''
    Parameter Arguments:
    text: 'New York is a city. It has a huge population.'
    N: Length of the n-grams e.g. 1, 2

    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    ngram_items = list(ngrams(sent2stokens(text, stemmer), N))
    for i, ngram in enumerate(ngram_items):
        ngram_str = ' '.join(ngram)
        ngrams_list.append(ngram_str)
    return ngrams_list
Example #7
Source File: mention_extraction.py From starsem2018-entity-linking with Apache License 2.0

def parse(self, tagged_text, ngram_len=-1):
    ngrams = []
    if len(tagged_text) == 0:
        return ngrams
    if tagged_text[0]['pos'] in self._exclude_if_first:
        tagged_text = tagged_text[1:]
    if ngram_len == -1:
        for l in range(len(tagged_text), 0, -1):
            ngrams += list(nltk.ngrams(tagged_text, l))
    else:
        ngrams += list(nltk.ngrams(tagged_text, ngram_len))
    ngrams += [n[:-1] for n in ngrams if len(n) > 1 and n[-1]['pos'] in {"NN", "NNS"}]
    ngrams += [n[1:] for n in ngrams if len(n) > 1 and n[0]['pos'] in {"NN", "NNS"}]
    ngrams = [n for n in ngrams
              if len({el[i] for el in n for i in {'pos', 'ner'}} & self._exclude_pos) == 0
              and (len(n) == 1
                   or (n[0]['pos'] not in self._exclude_prefix
                       and n[0]['word'].lower() not in utils.stop_words_en
                       and n[-1]['pos'] not in self._exclude_suffix
                       and n[-1]['word'].lower() not in utils.stop_words_en))
              and not (len(n) == 1 and (n[0]['pos'] in self._exclude_alone
                                        or n[0]['word'].lower() in utils.stop_words_en))]
    return ngrams
Example #8
Source File: atis_world.py From allennlp-semparse with Apache License 2.0

def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores
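
For reference, nltk.bigrams(seq) is equivalent to nltk.ngrams(seq, 2), so the bigram and trigram loops above differ only in window length:

from nltk import bigrams, ngrams

tokens = ["show", "me", "flights", "from", "st", ".", "louis"]
assert list(bigrams(tokens)) == list(ngrams(tokens, 2))
print(list(ngrams(tokens, 3))[:2])
# [('show', 'me', 'flights'), ('me', 'flights', 'from')]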
Example #9
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0

def get_time_range_start_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    late_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "late"
    }

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)

    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(" ".join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict
Example #10
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0

def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)

    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict
Example #11
Source File: word.py From flambe with MIT License

def __init__(self, ngrams: Union[int, List[int]] = 1,
             exclude_stopwords: bool = False,
             stop_words: Optional[List] = None) -> None:
    """ Initialize the NGramsTokenizer

    Parameters
    ----------
    ngrams : Union[int, List[int]], optional
        [description], by default 1
    exclude_stopwords: bool
        [description], by default False
    stop_words: Optional[List]
        [description], by default None

    """
    self.ngrams = ngrams
    self.exclude_stopwords = exclude_stopwords

    if self.exclude_stopwords:
        self.stop_words = stop_words
        if self.stop_words is None:
            nltk.download('stopwords', quiet=True)
            self.stop_words = stopwords.words('english')

    nltk.download('punkt', quiet=True)
Example #12
Source File: word.py From flambe with MIT License

def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example.

    Parameters
    ----------
    example : str
        The input example, as a string.

    Returns
    -------
    List[str]
        The output word tokens, as a list of strings

    """
    if self.exclude_stopwords and self.stop_words:
        example = ' '.join([word for word in word_tokenize(example)
                            if word not in self.stop_words])

    if isinstance(self.ngrams, List):
        ret: List[str] = []
        for i in self.ngrams:
            ret.extend(self._tokenize(example, i))
        return ret
    else:
        return NGramsTokenizer._tokenize(example, self.ngrams)
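
The class above delegates to a static _tokenize helper that is not shown in this listing. A hypothetical sketch of what such a helper might look like (the real flambe implementation may differ): join each word-level n-gram back into a single space-separated token.

import nltk
from nltk import word_tokenize

def _tokenize(example: str, n: int):
    # Hypothetical helper: word-level n-grams re-joined into strings.
    return [' '.join(gram) for gram in nltk.ngrams(word_tokenize(example), n)]

print(_tokenize("the cat sat", 2))
# ['the cat', 'cat sat']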
Example #13
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0

def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences
        ['New York is a city.', 'It has a huge population.']
    N: Length of the n-grams e.g. 1, 2

    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        sent = re.sub(r'[-](,?\s)', '\\1', sent)  # case where "magister-" has to be handled
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for i, ngram in enumerate(ngram_items):
            ngram_str = ' '.join(ngram)
            ngrams_list.append(ngram_str)
    return ngrams_list
Example #14
Source File: data_helpers.py From acl2017-interactive_summarizer with Apache License 2.0

def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences
        ['New York is a city.', 'It has a huge population.']

    return: a list of noun phrases, events, named_entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        if nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        if nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        if nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list
Example #15
Source File: feedback_graph.py From acl2017-interactive_summarizer with Apache License 2.0

def add_sentences(self, sentences):
    """
    @type sentences: list[Sentence]
    """
    counter = self.counter
    G = self.G
    for sent in sentences:
        counter.update(ngrams(sent.tokens, self.N))
        G.add_nodes_from(sent.tokens)

    updated_edges = []
    for v in counter.elements():
        s = v[0]
        t = v[1]
        c = counter[v]
        updated_edges.append((s, t, c))
    G.add_weighted_edges_from(updated_edges)
Example #16
Source File: lang_model_2.py From jakaton_feminicidios with MIT License

def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
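
The counting step in isolation, as a minimal sketch separate from the backoff logic (whitespace splitting stands in for nltk.word_tokenize to avoid the punkt download):

import nltk

fd = nltk.FreqDist(nltk.ngrams("the cat sat on the mat and the cat slept".split(), 2))
print(fd[('the', 'cat')])  # 2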
Example #17
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def __init__(self, ngrams, **kwargs):
    super().__init__(**kwargs)
    self.ngrams = ngrams
Example #18
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def get_parameter_choices():
    return {"ngrams": [1, 2, 3]}
Example #19
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def transform(self, X: dt.Frame):
    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for ind, text1 in enumerate(text1_arr):
        try:
            text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
            text2 = text2_arr[ind]
            text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
            output.append(len(text1.intersection(text2)))
        except:
            output.append(-1)
    return np.array(output)
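
The core of the transform above is just an n-gram set intersection; in isolation:

import nltk

a = set(nltk.ngrams("the cat sat on the mat".lower().split(), 2))
b = set(nltk.ngrams("the cat sat on a hat".lower().split(), 2))
print(len(a & b))  # 3: ('the', 'cat'), ('cat', 'sat'), ('sat', 'on')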
Example #20
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def __init__(self, ngrams, **kwargs):
    super().__init__(**kwargs)
    self.ngrams = ngrams
Example #21
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def get_parameter_choices():
    return {"ngrams": [1, 2, 3]}
Example #22
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def __init__(self, ngrams, **kwargs):
    super().__init__(**kwargs)
    self.ngrams = ngrams
Example #23
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def get_parameter_choices():
    return {"ngrams": [1, 2, 3]}
Example #24
Source File: text_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

def transform(self, X: dt.Frame):
    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for ind, text1 in enumerate(text1_arr):
        try:
            text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
            text2 = text2_arr[ind]
            text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
            output.append((2 * len(text1.intersection(text2))) / (len(text1) + len(text2)))
        except:
            output.append(-1)
    return np.array(output)
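
This variant normalizes the overlap into a Dice coefficient, 2 * |A ∩ B| / (|A| + |B|); with two example sentences:

import nltk

a = set(nltk.ngrams("the cat sat on the mat".lower().split(), 2))
b = set(nltk.ngrams("the cat sat on a hat".lower().split(), 2))
print(2 * len(a & b) / (len(a) + len(b)))  # 2 * 3 / (5 + 5) = 0.6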
Example #25
Source File: sf_shingling.py From screaming-frog-shingling with MIT License

def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
    split_text = text.split()
    if len(split_text) < shingle_length:
        raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

    self.minhash = []
    self.shingles = ngrams(split_text, shingle_length)

    for hash_seed in generate_random_seeds(minhash_size, random_seed):
        min_value = float('inf')
        for shingle in ngrams(split_text, shingle_length):
            value = mmh3.hash(' '.join(shingle), hash_seed)
            min_value = min(min_value, value)
        self.minhash.append(min_value)
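
The "shingles" hashed above are simply word-level n-grams of length shingle_length, for example:

from nltk import ngrams

print(list(ngrams("a rose is a rose is a rose".split(), 5))[:2])
# [('a', 'rose', 'is', 'a', 'rose'), ('rose', 'is', 'a', 'rose', 'is')]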
Example #26
Source File: test_ngram.py From numpy-ml with GNU General Public License v3.0

def log_prob(self, words, N):
    assert N in self.counts, "You do not have counts for {}-grams".format(N)

    if N > len(words):
        err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
        raise ValueError(err)

    total_prob = 0
    for ngram in nltk.ngrams(words, N):
        total_prob += self._log_ngram_prob(ngram)
    return total_prob
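
The explicit length check above matters because nltk.ngrams simply yields nothing when the sequence is shorter than N, which would otherwise make log_prob silently return 0:

import nltk

print(list(nltk.ngrams("the cat sat".split(), 4)))  # [] -- hence the guard above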
Example #27
Source File: test_ngram.py From numpy-ml with GNU General Public License v3.0

def log_prob(self, words, N):
    assert N in self.counts, "You do not have counts for {}-grams".format(N)

    if N > len(words):
        err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
        raise ValueError(err)

    total_prob = 0
    for ngram in nltk.ngrams(words, N):
        total_prob += self._log_ngram_prob(ngram)
    return total_prob
Example #28
Source File: ngrams.py From textkit with MIT License

def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep))
Example #29
Source File: ngrams.py From textkit with MIT License

def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
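
The LookupError branch corresponds to missing punkt tokenizer data; outside of textkit's own download command, the same models can be fetched directly:

import nltk

nltk.download('punkt', quiet=True)  # provides the models nltk.word_tokenize relies on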
Example #30
Source File: alacarte.py From ALaCarte with MIT License

def read_ngrams(self, tokens):
    '''reads tokens and updates context vectors
    Args:
        tokens: list of strings
    Returns:
        None
    '''
    import nltk

    # gets location of target n-grams in document
    target_vocab = self.target_vocab
    max_n = self.max_n
    ngrams = dict()
    for n in range(1, max_n + 1):
        ngrams[n] = list(filter(lambda entry: entry[1] in target_vocab,
                                enumerate(nltk.ngrams(tokens, n))))

    for n in range(1, max_n + 1):
        if ngrams[n]:
            # gets word embedding for each token
            w2v = self.w2v
            zero_vector = self.zero_vector
            wnd = self.wnd
            start = max(0, ngrams[n][0][0] - wnd)
            vectors = [None] * start + [w2v.get(token, zero_vector) if token else zero_vector
                                        for token in tokens[start:ngrams[n][-1][0] + n + wnd]]

            c2v = self.c2v
            target_counts = self.target_counts
            # computes context vector around each target n-gram
            for i, ngram in ngrams[n]:
                c2v[ngram] += sum(vectors[max(0, i - wnd):i], zero_vector) + sum(vectors[i + n:i + n + wnd], zero_vector)
                target_counts[ngram] += 1
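
The first loop above only records (position, n-gram) pairs for n-grams in the target vocabulary; the same idea in isolation:

import nltk

tokens = "the big red dog ran".split()
target_vocab = {('big', 'red'), ('dog', 'ran')}
print([(i, g) for i, g in enumerate(nltk.ngrams(tokens, 2)) if g in target_vocab])
# [(1, ('big', 'red')), (3, ('dog', 'ran'))]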