Python nltk.ngrams() Examples

The following are 30 code examples showing how to use nltk.ngrams(). They are extracted from open source projects; each example is labeled with its original project, author, source file, and license.

You may also want to check out all available functions and classes of the nltk module.
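
For orientation, nltk.ngrams() takes any sequence of tokens and a size n, and returns a generator of n-length tuples. A minimal sketch:

import nltk

tokens = "the quick brown fox jumps".split()
print(list(nltk.ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox'), ('fox', 'jumps')]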

Example 1
Project: SARC   Author: NLPrinceton   File: eval.py    License: MIT License
def parse():
  parser = argparse.ArgumentParser()
  parser.add_argument('dataset', help='pol or main', type=str)
  parser.add_argument('-n', '--n', default=1, help='Number of grams', type=int)
  parser.add_argument('--min_count', default=1, help='Min count', type=int)
  parser.add_argument('--embedding', default=CCGLOVE,
                      help='embedding file', type=str)
  parser.add_argument('--weights', default=None,
                      help='weights to use for ngrams (e.g. sif, None)', type=str)
  parser.add_argument('-norm', '--normalize', action='store_true',
                      help='Normalize vectors')
  parser.add_argument('-l', '--lower', action='store_true',
                      help='Whether or not to lowercase text')
  parser.add_argument('-e', '--embed', action='store_true',
                      help='Use embeddings instead of bong')
  return parser.parse_args() 
Example 2
Project: ALaCarte   Author: NLPrinceton   File: compute.py    License: MIT License
def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
  '''sliding window around n-grams in a document
  Args:
    strdoc: list of tokens (as strings)
    intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
    vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
    n: n in n-gram
    wndo2: half the window size
    unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
  Returns:
    generator over (n-gram, context window) pairs, where each context window is a generator of ints
  '''

  wndo2pn = wndo2+n
  unk = unkgram is not None
  for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
    if ngram in vocabulary:
      yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
    elif unk:
      yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn]) 
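
A hypothetical call of ngram_context, assuming the function above is in scope along with its imports (nltk and itertools.chain); the toy document and bigram vocabulary here are made up for illustration:

strdoc = "the cat sat on the mat".split()
intdoc = list(range(len(strdoc)))        # token indices stand in for word ids
vocab = {("the", "cat"), ("the", "mat")}
for ngram, window in ngram_context(strdoc, intdoc, vocab, n=2, wndo2=2):
    print(ngram, list(window))
# ('the', 'cat') [2, 3]
# ('the', 'mat') [2, 3]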
Example 3
Project: ALaCarte   Author: NLPrinceton   File: ngram.py    License: MIT License
def alabong(A, word_embeddings, lists, coocs, counts):
  n = len(lists)
  def represent(documents):
    output = []
    docs = tokenize(doc.lower() for doc in documents)
    for k, kgramlist, kgramcooc, kgramcount in zip(range(1, n+1), lists, coocs, counts):
      kgrams = [list(nltk.ngrams(doc, k)) for doc in docs]
      vocab = {kgram for doc in kgrams for kgram in doc}
      where = np.array([i for i, kgram in enumerate(kgramlist) if kgram in vocab and kgramcount[i]])
      bong = docs2bofs(kgrams, vocabulary=kgramlist, format='csc')
      output.append(np.zeros((len(documents), word_embeddings.shape[1]), dtype=FLOAT))
      for offset in range(0, where.shape[0], MAXSLICE):
        indices = where[offset:offset+MAXSLICE]
        if k > 1:
          vecs = normalize(A.predict(kgramcooc[indices].dot(word_embeddings)/kgramcount[indices,None])) / k
        else:
          vecs = normalize(word_embeddings[indices])
        output[-1] += bong[:,indices].dot(vecs)
    return np.hstack(output)
  return represent, None, True 
Example 4
Project: lexpredict-contraxsuite   Author: LexPredict   File: ngrams.py    License: GNU Affero General Public License v3.0
def get_word_skipgram_distribution(input_buffer, n=2, k=2, encoding="utf-8",
                                   tokenize_method=nltk.word_tokenize):
    """
    Get distribution of skipgrams with given n and k values from input_buffer.
    :param input_buffer:
    :param n:
    :param k:
    :param encoding:
    :param tokenize_method:
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    ngrams = nltk.ngrams(tokenize_method(input_buffer), n=n)
    return nltk.util.skipgrams(ngrams, n, k) 
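
For context, nltk.util.skipgrams(sequence, n, k) yields n-grams that may skip up to k intervening tokens; a minimal sketch on a plain token list:

from nltk.util import skipgrams

tokens = "insurgents killed in ongoing fighting".split()
print(list(skipgrams(tokens, 2, 2))[:4])
# [('insurgents', 'killed'), ('insurgents', 'in'), ('insurgents', 'ongoing'), ('killed', 'in')]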
Example 5
Project: coling2018_fake-news-challenge   Author: UKPLab   File: data_helpers.py    License: Apache License 2.0
def extract_ngrams(text, stemmer, N):
    '''
    Parameter Arguments:
    text: 'New York is a city. It has a huge population.'
    N: length of the n-grams, e.g. 1, 2

    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
    ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    ngram_items = list(ngrams(sent2stokens(text, stemmer), N))
    for ngram in ngram_items:
        ngram_str = ' '.join(ngram)
        ngrams_list.append(ngram_str)
    return ngrams_list 
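
sent2stokens is a project helper (not shown) that presumably tokenizes and stems the text; a rough standalone equivalent of the whole pipeline, assuming a Porter stemmer:

import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
tokens = [stemmer.stem(t) for t in nltk.word_tokenize("New York is a city.")]
print([' '.join(g) for g in nltk.ngrams(tokens, 2)])
# ['new york', 'york is', 'is a', 'a citi', 'citi .']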
Example 6
Project: starsem2018-entity-linking   Author: UKPLab   File: mention_extraction.py    License: Apache License 2.0
def parse(self, tagged_text, ngram_len=-1):
        ngrams = []
        if len(tagged_text) == 0:
            return ngrams
        if tagged_text[0]['pos'] in self._exclude_if_first:
            tagged_text = tagged_text[1:]
        if ngram_len == -1:
            for l in range(len(tagged_text), 0, -1):
                ngrams += list(nltk.ngrams(tagged_text, l))
        else:
            ngrams += list(nltk.ngrams(tagged_text, ngram_len))
            ngrams += [n[:-1] for n in ngrams if len(n) > 1 and n[-1]['pos'] in {"NN", "NNS"}]
            ngrams += [n[1:] for n in ngrams if len(n) > 1 and n[0]['pos'] in {"NN", "NNS"}]
        ngrams = [n for n in ngrams
                  if len({el[i] for el in n for i in {'pos', 'ner'}} & self._exclude_pos) == 0
                  and (len(n) == 1 or (n[0]['pos'] not in self._exclude_prefix
                           and n[0]['word'].lower() not in utils.stop_words_en
                           and n[-1]['pos'] not in self._exclude_suffix
                           and n[-1]['word'].lower() not in utils.stop_words_en)
                       )
                  and not(len(n) == 1 and (n[0]['pos'] in self._exclude_alone or n[0]['word'].lower() in utils.stop_words_en))]
        return ngrams 
Example 7
Project: allennlp-semparse   Author: allenai   File: atis_world.py    License: Apache License 2.0
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
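
Note that nltk.bigrams(seq) is simply shorthand for nltk.ngrams(seq, 2); both appear above. A quick sketch:

from nltk import bigrams, ngrams

tokens = ["show", "flights", "from", "st", ".", "louis"]
print(list(bigrams(tokens))[:2])    # [('show', 'flights'), ('flights', 'from')]
print(list(ngrams(tokens, 3))[:1])  # [('show', 'flights', 'from')]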
Example 8
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    License: Apache License 2.0
def get_time_range_start_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    late_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "late"
    }

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(" ".join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict 
Example 9
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    License: Apache License 2.0
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict 
Example 10
Project: flambe   Author: asappresearch   File: word.py    License: MIT License
def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """ Initialize the NGramsTokenizer

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            [description], by default 1
        exclude_stopwords: bool
            [description], by default False
        stop_words: Optional[List]
            [description], by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True) 
Example 11
Project: flambe   Author: asappresearch   File: word.py    License: MIT License
def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        if self.exclude_stopwords and self.stop_words:
            example = ' '.join([word for word in word_tokenize(example)
                                if word not in self.stop_words])

        if isinstance(self.ngrams, list):
            ret: List[str] = []
            for i in self.ngrams:
                ret.extend(self._tokenize(example, i))
            return ret
        else:
            return NGramsTokenizer._tokenize(example, self.ngrams) 
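
The _tokenize helper is not shown in this listing; based on how it is called, a plausible reading is that it joins each n-gram of word tokens back into a string. This is a guess, not the project's code:

import nltk
from nltk import word_tokenize

def _tokenize(example: str, n: int):
    # hypothetical reconstruction: one string per n-gram
    return [' '.join(g) for g in nltk.ngrams(word_tokenize(example), n)]

print(_tokenize("the quick brown fox", 2))
# ['the quick', 'quick brown', 'brown fox']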
Example 12
Project: acl2017-interactive_summarizer   Author: UKPLab   File: data_helpers.py    License: Apache License 2.0
def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']
    N: length of the n-grams, e.g. 1, 2

    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
    ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        sent = re.sub(r'[-](,?\s)', r'\1', sent)  # drop trailing hyphens, e.g. 'magister-'
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for ngram in ngram_items:
            ngrams_list.append(' '.join(ngram))
    return ngrams_list
Example 13
Project: acl2017-interactive_summarizer   Author: UKPLab   File: data_helpers.py    License: Apache License 2.0
def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']

    return: a list of noun phrases, events, or named entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'),
    ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        elif nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        elif nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        elif nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list 
Example 14
Project: acl2017-interactive_summarizer   Author: UKPLab   File: feedback_graph.py    License: Apache License 2.0
def add_sentences(self, sentences):
        """
        @type sentences: list[Sentence]
        """
        counter = self.counter
        G = self.G
        for sent in sentences:
            counter.update(ngrams(sent.tokens, self.N))
            G.add_nodes_from(sent.tokens)

        updated_edges = []
        for v in counter.elements():
            s = v[0]
            t = v[1]
            c = counter[v]
            updated_edges.append((s, t, c))

        G.add_weighted_edges_from(updated_edges) 
Example 15
Project: jakaton_feminicidios   Author: iorch   File: lang_model_2.py    License: MIT License
def __init__(self, order, alpha, sentences):
        self.order = order
        self.alpha = alpha
        if order > 1:
            self.backoff = LangModel(order - 1, alpha, sentences)
            self.lexicon = None
        else:
            self.backoff = None
            self.n = 0
        self.ngramFD = nltk.FreqDist()
        lexicon = set()
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            wordNGrams = nltk.ngrams(words, order)
            for wordNGram in wordNGrams:
                self.ngramFD[wordNGram] += 1
                # self.ngramFD.inc(wordNGram)
                if order == 1:
                    lexicon.add(wordNGram)
                    self.n += 1
        self.v = len(lexicon) 
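
The counting step above can be exercised on its own; a minimal sketch of building a bigram frequency distribution with nltk.FreqDist:

import nltk

words = nltk.word_tokenize("to be or not to be")
fd = nltk.FreqDist(nltk.ngrams(words, 2))
print(fd.most_common(2))
# [(('to', 'be'), 2), (('be', 'or'), 1)]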
Example 16
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    License: MIT License
def n_gram_analysis_simple(infile, gram, stop):
    ngram = dict()
    f = open(infile, "r")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            # stopword filtering was disabled in the original:
            # if stop and w in stops: continue
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1
    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print(len(p))
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print(sen, cnt)
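
The same counting can be written more compactly with collections.Counter; a sketch (the input file name is hypothetical):

from collections import Counter
import nltk

with open("headlines.txt") as f:  # hypothetical input file
    counts = Counter(g for line in f for g in nltk.ngrams(line.split(), 2))
print(len(counts))
for gram, cnt in counts.most_common(10):
    print(' '.join(gram), cnt)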
Example 17
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 18
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 19
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def transform(self, X: dt.Frame):
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
                text2 = text2_arr[ind]
                text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
                output.append(len(text1.intersection(text2)))
            except Exception:
                output.append(-1)
        return np.array(output) 
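
The core of this transform is an n-gram set intersection between two texts; a standalone sketch:

import nltk

def ngram_overlap(a, b, n):
    sa = set(nltk.ngrams(str(a).lower().split(), n))
    sb = set(nltk.ngrams(str(b).lower().split(), n))
    return len(sa & sb)

print(ngram_overlap("the quick brown fox", "a quick brown dog", 2))  # 1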
Example 20
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 21
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 22
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 23
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 24
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    License: Apache License 2.0
def transform(self, X: dt.Frame):
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
                text2 = text2_arr[ind]
                text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
                output.append((2 * len(text1.intersection(text2))) / (len(text1) + len(text2)))
            except Exception:
                output.append(-1)
        return np.array(output) 
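
This transform differs from Example 19 only in the similarity measure: it normalizes the intersection into a Dice coefficient, 2 * |A ∩ B| / (|A| + |B|). A worked instance on two hand-picked bigram sets:

sa = {("quick", "brown"), ("brown", "fox")}
sb = {("quick", "brown"), ("brown", "dog")}
print(2 * len(sa & sb) / (len(sa) + len(sb)))  # 0.5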
Example 25
Project: screaming-frog-shingling   Author: jroakes   File: sf_shingling.py    License: MIT License
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError('input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value) 
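
Here ngrams() supplies the word shingles that feed the MinHash. A minimal sketch of one hash pass, mirroring the inner loop above (mmh3.hash hashes a string under a given seed; the seed 42 is arbitrary):

import mmh3
from nltk import ngrams

split_text = "the quick brown fox jumps over the lazy dog".split()
print(min(mmh3.hash(' '.join(s), 42) for s in ngrams(split_text, 5)))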
Example 26
Project: numpy-ml   Author: ddbourgin   File: test_ngram.py    License: GNU General Public License v3.0
def log_prob(self, words, N):
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in nltk.ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob 
Example 27
Project: numpy-ml   Author: ddbourgin   File: test_ngram.py    License: GNU General Public License v3.0
def log_prob(self, words, N):
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in nltk.ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob 
Example 28
Project: textkit   Author: learntextvis   File: ngrams.py    License: MIT License
def words2ngrams(sep, num, tokens):
    '''Convert word tokens into n-grams. n-grams are n-length sequences of word tokens.
    Punctuation is considered a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep)) 
Example 29
Project: textkit   Author: learntextvis   File: ngrams.py    License: MIT License
def text2ngrams(sep, num, text):
    '''Tokenize plain text into n-grams. n-grams are n-length sequences of word tokens.
    Punctuation is considered a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err) 
Example 30
Project: ALaCarte   Author: NLPrinceton   File: alacarte.py    License: MIT License
def read_ngrams(self, tokens):
    '''reads tokens and updates context vectors
    Args:
      tokens: list of strings
    Returns:
      None
    '''

    import nltk

    # gets location of target n-grams in document
    target_vocab = self.target_vocab
    max_n = self.max_n
    ngrams = dict()
    for n in range(1, max_n + 1):
      ngrams[n] = list(filter(lambda entry: entry[1] in target_vocab, enumerate(nltk.ngrams(tokens, n))))

    for n in range(1, max_n + 1):
      if ngrams[n]:

        # gets word embedding for each token
        w2v = self.w2v
        zero_vector = self.zero_vector
        wnd = self.wnd
        start = max(0, ngrams[n][0][0] - wnd)
        vectors = [None] * start + [w2v.get(token, zero_vector) if token else zero_vector for token in
                                    tokens[start:ngrams[n][-1][0] + n + wnd]]
        c2v = self.c2v
        target_counts = self.target_counts

        # computes context vector around each target n-gram
        for i, ngram in ngrams[n]:
          c2v[ngram] += sum(vectors[max(0, i - wnd):i], zero_vector) + sum(vectors[i + n:i + n + wnd],
                                                                           zero_vector)
          target_counts[ngram] += 1
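
The context vector computed in the last loop is just the sum of the embeddings in a window around the n-gram, excluding the n-gram itself; a toy instance with made-up vectors:

import numpy as np

vectors = [np.ones(3) * j for j in range(6)]  # stand-ins for word embeddings
i, n, wnd = 2, 1, 2                           # unigram at position 2, window of 2
context = sum(vectors[max(0, i - wnd):i]) + sum(vectors[i + n:i + n + wnd])
print(context)  # [8. 8. 8.]  (sums positions 0, 1, 3, 4)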