Python nltk.ngrams() Examples

The following code examples show how to use nltk.ngrams(). They are drawn from open source Python projects.
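
Before the project examples, here is a minimal usage sketch of the function itself: nltk.ngrams(sequence, n) lazily yields tuples of n consecutive items, so callers usually wrap it in list() or iterate over it.

import nltk

tokens = "the quick brown fox".split()
print(list(nltk.ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]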

Example 1
Project: SARC   Author: NLPrinceton   File: eval.py    MIT License 6 votes
def parse():
  parser = argparse.ArgumentParser()
  parser.add_argument('dataset', help='pol or main', type=str)
  parser.add_argument('-n', '--n', default=1, help='Number of grams', type=int)
  parser.add_argument('--min_count', default=1, help='Min count', type=int)
  parser.add_argument('--embedding', default=CCGLOVE,
                      help='embedding file', type=str)
  parser.add_argument('--weights', default=None,
                      help='weights to use for ngrams (e.g. sif, None)', type=str)
  parser.add_argument('-norm', '--normalize', action='store_true',
                      help='Normalize vectors')
  parser.add_argument('-l', '--lower', action='store_true',
                      help='Whether or not to lowercase text')
  parser.add_argument('-e', '--embed', action='store_true',
                      help='Use embeddings instead of bong')
  return parser.parse_args() 
Example 2
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 6 votes
def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']
    N: Length of the n-grams e.g. 1, 2
    
    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'), 
    ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        sent = re.sub('[-](,?\s)','\\1', sent) #case where magister- has to be handled
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for i, ngram in enumerate(ngram_items):
            ngram_str = ' '.join(ngram)
            ngrams_list.append(ngram_str)
    return ngrams_list 
Example 3
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 6 votes
def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']
    
    return: a list of noun phrases, events, named_entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'), 
    ('it', 'has'), ('has','a'), ('a', 'huge'), ('huge', 'population') , ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        if nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        if nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        if nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list 
Example 4
Project: ijcai2019-relis   Author: UKPLab   File: ner_rewarder.py    MIT License 6 votes
def js_reward(self, dataset, topic, docs, summaries):
        ner = parse_ner_chunk_distribution(dataset, topic)
        ner_dist = [ner[key] for key in ner.keys()]
        self.sentences = []
        for doc in docs:
            self.sentences.extend(doc[1])

        rewards = []
        longest_chunk = max([len(chunk) for chunk in ner.keys()])

        for summary in summaries:
            text = ''
            for i in summary:
                text += self.sentences[i]
            token_summary = sent2tokens(text, LANGUAGE)
            sum_ner = {word:0 for word in ner.keys()}
            for length in range(1, longest_chunk+1):
                ngram_summary = ngrams(token_summary, length)
                for token in ngram_summary:
                    if token in ner:
                        sum_ner[token] = sum_ner[token]+1
            js = jsd(ner_dist, [sum_ner[key] for key in ner.keys()])
            rewards.append(js)
        return rewards 
Example 5
Project: ALaCarte   Author: NLPrinceton   File: compute.py    MIT License 6 votes
def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
  '''sliding window around n-grams in a document
  Args:
    strdoc: list of tokens (as strings)
    intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
    vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
    n: n in n-gram
    wndo2: half the window size
    unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
  Returns:
    (n-gram, int generator) generator over (n-gram, context window pairs)
  '''

  wndo2pn = wndo2+n
  unk = not unkgram is None
  for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
    if ngram in vocabulary:
      yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
    elif unk:
      yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn]) 
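
A minimal usage sketch for ngram_context above (toy document and vocabulary; the function is assumed importable from its module):

strdoc = "the quick brown fox jumps over the lazy dog".split()
word2idx = {w: i for i, w in enumerate(sorted(set(strdoc)))}   # toy index mapping
intdoc = [word2idx[w] for w in strdoc]
vocabulary = {("quick", "brown"), ("lazy", "dog")}

for ngram, window in ngram_context(strdoc, intdoc, vocabulary, n=2, wndo2=2):
    print(ngram, list(window))    # each window is a generator of token indices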
Example 6
Project: ALaCarte   Author: NLPrinceton   File: ngram.py    MIT License 6 votes
def alabong(A, word_embeddings, lists, coocs, counts):
  n = len(lists)
  def represent(documents):
    output = []
    docs = tokenize(doc.lower() for doc in documents)
    for k, kgramlist, kgramcooc, kgramcount in zip(range(1, n+1), lists, coocs, counts):
      kgrams = [list(nltk.ngrams(doc, k)) for doc in docs]
      vocab = {kgram for doc in kgrams for kgram in doc}
      where = np.array([i for i, kgram in enumerate(kgramlist) if kgram in vocab and kgramcount[i]])
      bong = docs2bofs(kgrams, vocabulary=kgramlist, format='csc')
      output.append(np.zeros((len(documents), word_embeddings.shape[1]), dtype=FLOAT))
      for offset in range(0, where.shape[0], MAXSLICE):
        indices = where[offset:offset+MAXSLICE]
        if k > 1:
          vecs = normalize(A.predict(kgramcooc[indices].dot(word_embeddings)/kgramcount[indices,None])) / k
        else:
          vecs = normalize(word_embeddings[indices])
        output[-1] += bong[:,indices].dot(vecs)
    return np.hstack(output)
  return represent, None, True 
Example 7
Project: lexpredict-contraxsuite   Author: LexPredict   File: ngrams.py    GNU Affero General Public License v3.0 6 votes
def get_word_skipgram_distribution(input_buffer, n=2, k=2, encoding="utf-8",
                                   tokenize_method=nltk.word_tokenize):
    """
    Get distribution of skipgrams with given n and k values from input_buffer.
    :param input_buffer:
    :param n:
    :param k:
    :param encoding:
    :param tokenize_method:
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    ngrams = nltk.ngrams(tokenize_method(input_buffer), n=n)
    return nltk.util.skipgrams(ngrams, n, k) 
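
A usage sketch, assuming the function above is importable. As written it returns a lazy generator of skipgrams over the word bigrams rather than a counted distribution, so a caller would typically wrap it in a Counter:

from collections import Counter

text = "the quick brown fox jumps over the lazy dog"
dist = Counter(get_word_skipgram_distribution(text, n=2, k=2))
for skipgram, count in dist.most_common(3):
    print(skipgram, count)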
Example 8
Project: PyOpenDial   Author: KAIST-AILab   File: postprocessing.py    MIT License 6 votes
def ngram_replaser(info, reply, n=3):
    if info is None:
        return reply

    org_reply = reply

    info = re.split(r' *[\?\.\!][\'"\)\]]* *', info.strip().lower())
    reply = re.split(r' *[\?\.\!][\'"\)\]]* *', reply.strip().lower())

    info = sum([list(ngrams(i.split(), n=n)) for i in info if i], [])
    reply = sum([list(ngrams(r.split(), n=n)) for r in reply if r], [])

    phrases = []

    for i in info:
        for r in reply:
            if i == r:
                phrases.append(' '.join(r))

    replasments = equal_phrases(phrases)

    for o, r in zip(phrases, replasments):
        org_reply = org_reply.replace(o, r)

    return org_reply 
Example 9
Project: infodens   Author: ahmad-taie   File: bag_of_ngrams_features.py    GNU General Public License v3.0 6 votes
def hashNgram(self, listOfSentences, n, numberOfFeatures, finNgram=None):
        hasher = FeatureHasher(n_features=numberOfFeatures)

        def sentToNgram(listOfSentences):
            for sent in listOfSentences:
                sentDic = {}
                sentNgrams = Counter(ngrams(sent, n))
                for ngramElement in sentNgrams:
                    if finNgram:
                        if ngramElement in finNgram:
                            sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                    else:
                        sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                yield sentDic

        return hasher.transform(sentToNgram(listOfSentences)).tolil() 
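
For readers unfamiliar with the hashing trick used above, here is a small self-contained sketch of the same idea with scikit-learn's FeatureHasher and plain nltk.ngrams (the sentence data is made up):

from collections import Counter
from nltk import ngrams
from sklearn.feature_extraction import FeatureHasher

sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]
hasher = FeatureHasher(n_features=32)
dicts = ({str(g): c for g, c in Counter(ngrams(sent, 2)).items()} for sent in sentences)
X = hasher.transform(dicts).tolil()   # sparse (n_sentences x 32) matrix
print(X.shape)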
Example 10
Project: allennlp-semparse   Author: allenai   File: atis_world.py    Apache License 2.0 6 votes
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
Example 11
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    Apache License 2.0 6 votes
def get_time_range_start_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    late_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "late"
    }

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(" ".join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict 
Example 12
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    Apache License 2.0 6 votes
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict 
Example 13
Project: flambe   Author: asappresearch   File: word.py    MIT License 6 votes
def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """ Initialize the NGramsTokenizer

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            [description], by default 1
        exclude_stopwords: bool
            [description], by default False
        stop_words: Optional[List]
            [description], by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True) 
Example 14
Project: flambe   Author: asappresearch   File: word.py    MIT License 6 votes
def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        if self.exclude_stopwords and self.stop_words:
            example = ' '.join([word for word in word_tokenize(example)
                                if word not in self.stop_words])

        if isinstance(self.ngrams, List):
            ret: List[str] = []
            for i in self.ngrams:
                ret.extend(self._tokenize(example, i))
            return ret
        else:
            return NGramsTokenizer._tokenize(example, self.ngrams) 
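
Putting Example 13, Example 14, and the static _tokenize helper shown later in Example 65 together, the expected behaviour is roughly the following (a hypothetical usage sketch, not taken from flambe's documentation):

tokenizer = NGramsTokenizer(ngrams=[1, 2])
print(tokenizer.tokenize("the cat sat"))
# ['the', 'cat', 'sat', 'the cat', 'cat sat']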
Example 15
Project: dl-with-constraints   Author: dair-iitd   File: atis_tables.py    MIT License 6 votes
def get_time_range_start_from_utterance(utterance: str, # pylint: disable=unused-argument
                                        tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    late_indices = {index for index, token in enumerate(tokenized_utterance)
                    if token.text == 'late'}

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(' '.join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict 
Example 16
Project: dl-with-constraints   Author: dair-iitd   File: atis_tables.py    MIT License 6 votes
def get_time_range_end_from_utterance(utterance: str, # pylint: disable=unused-argument
                                      tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    early_indices = {index for index, token in enumerate(tokenized_utterance)
                     if token.text == 'early'}

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(' '.join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict 
Example 17
Project: chequeabot   Author: chequeado   File: feature_extractors.py    MIT License 6 votes
def automatic_feature_extractor(spacy_tag, pos_ngrams=False):
    features = {}

    for tagged_word in spacy_tag:
        #pos, lemma, text, tag, dep ,is_punct, like_num, tense
        if tagged_word['is_punct'] and tagged_word['lemma'].encode('utf8') not in "%¿?":
            continue

        features[tagged_word['pos']] = True
        features[tagged_word['lemma']] = True
        features[tagged_word['dep']] = True
        features[tagged_word['tense']] = True

        if is_int(tagged_word['lemma']):
            number_of_digits = len(str(tagged_word['lemma'].encode('utf8')))
            features['%s_digits' %number_of_digits] = True

    if pos_ngrams:        
        ctags_chain = [e['pos'] for e in spacy_tag]
        ngs = ngrams(ctags_chain, 3)
        for ng in ngs:
            features[ng] = True
   
    return features 
Example 18
Project: AutomaticEssayGrading   Author: SahilC   File: Features.py    MIT License 5 votes
def lexical_diversity(self,sentence):
        sents = " ".join(nltk.word_tokenize(sentence))

        unigrams = [ grams for grams in ngrams(sents.split(), 1)]
        bigrams = [ grams for grams in ngrams(sents.split(), 2)]
        trigram = [ grams for grams in ngrams(sents.split(), 3)]

#        self.unigrams_count = len([(item[0], unigrams.count(item)) for item in sorted(set(unigrams))])
        self.bigrams_count = len([(item, bigrams.count(item)) for item in sorted(set(bigrams))])
#        self.trigrams_count = len([(item, trigram.count(item)) for item in sorted(set(trigram))]) 
Example 19
Project: AutomaticEssayGrading   Author: SahilC   File: Features.py    MIT License 5 votes
def lexical_diversity(self,sentence):
        sents = " ".join(nltk.word_tokenize(sentence))

        unigrams = [ grams for grams in ngrams(sents.split(), 1)]
        bigrams = [ grams for grams in ngrams(sents.split(), 2)]
        trigram = [ grams for grams in ngrams(sents.split(), 3)]

#        self.unigrams_count = len([(item[0], unigrams.count(item)) for item in sorted(set(unigrams))])
        self.bigrams_count = len([(item, bigrams.count(item)) for item in sorted(set(bigrams))])
#        self.trigrams_count = len([(item, trigram.count(item)) for item in sorted(set(trigram))]) 
Example 20
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 5 votes
def extract_ngrams(sentences, stoplist, stemmer, language, n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        n (int): the number of words for ngrams, defaults to 2
    """
    concepts = []
    for i, sentence in enumerate(sentences):

        # for each ngram of words
        tokens = sent2stokens_wostop(sentence, stoplist, language)
        for j in range(len(tokens)-(n-1)):

            # initialize ngram container
            ngram = []

            # for each token of the ngram
            for k in range(j, j+n):
                ngram.append(tokens[k].lower())

            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue

            # do not consider ngrams composed of only stopwords
            #stops = [t for t in ngram if t in stoplist]
            #if len(stops) == len(ngram):
                #continue

            # stem the ngram
            ngram = [stemmer.stem(t) for t in ngram]

            # add the ngram to the concepts
            concepts.append(' '.join(ngram))
    return concepts 
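
The inner sliding-window loop above is equivalent to iterating over nltk.ngrams directly. A hedged sketch of the same extraction, assuming sent2stokens_wostop returns a plain token list as in the other helpers of this file:

import re
from nltk import ngrams

def extract_ngrams_nltk(sentences, stoplist, stemmer, language, n=2):
    """Equivalent of extract_ngrams above, using nltk.ngrams for the sliding window."""
    concepts = []
    for sentence in sentences:
        # sent2stokens_wostop is the stopword-removing tokenizer used elsewhere in this file (assumed)
        tokens = [t.lower() for t in sent2stokens_wostop(sentence, stoplist, language)]
        for ngram in ngrams(tokens, n):
            if any(not re.search('[a-zA-Z0-9]', t) for t in ngram):
                continue  # drop ngrams containing punctuation-only tokens
            concepts.append(' '.join(stemmer.stem(t) for t in ngram))
    return concepts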
Example 21
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 5 votes
def prune_ngrams(ngrams, stoplist, N=2):
    pruned_list = []
    for ngram in ngrams:
        items = ngram.split(' ')
        i = 0
        for item in items:
            if item in stoplist: i += 1
        if i < N:
            pruned_list.append(ngram)
    return pruned_list 
Example 22
Project: ijcai2019-relis   Author: UKPLab   File: ner_rewarder.py    MIT License 5 votes
def grouped_reward(self, dataset, topic, docs, summaries):
        ner, tags = parse_ner_chunk_grouped(dataset, topic)
        self.sentences = []
        for doc in docs:
            self.sentences.extend(doc[1])

        rewards = []
        longest_chunk_per_tag = [max([0]+[len(chunk) for chunk in ner[group].keys()]) for group in tags]
        for summary in summaries:
            text = ''
            for i in summary:
                text += self.sentences[i]
            token_summary = sent2tokens(text, LANGUAGE)
            reward = []
            for i, tag in enumerate(tags):
                sum_ner = {word:0 for word in ner[tag].keys()}
                for length in range(1, longest_chunk_per_tag[i]+1):
                    ngram_summary = ngrams(token_summary, length)
                    for token in ngram_summary:
                        if token in ner[tag]:
                            sum_ner[token] = sum_ner[token]+1
                if len(sum_ner) == 0:
                    tfidf = 0
                else:
                    tfidf = sum([sum_ner[entity] / float(len(token_summary)) * math.log(1/float(ner[tag][entity]))
                             for entity in sum_ner])/float(len(sum_ner))
                reward.append(tfidf)
            rewards.append(reward)
        return rewards, tags 
Example 23
Project: ijcai2019-relis   Author: UKPLab   File: redundancy_rewarder.py    MIT License 5 votes
def __call__(self,summary_list,ns):
        summ_list = []
        for sum_idxs in summary_list:
            summary = []
            for idx in sum_idxs:
                summary.append(self.sentences[idx])
            summ_list.append(' '.join(summary))
        rewards = []
        for summ in summ_list:
            reward = []
            for n in ns:
                summ_ngram = list(ngrams(sent2tokens(summ, LANGUAGE), n))
                reward.append(len(set(summ_ngram))/float(len(summ_ngram)))
            rewards.append(reward)
        return rewards 
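
The per-summary score above is the distinct-n metric (unique n-grams divided by total n-grams); a tiny standalone illustration:

from nltk import ngrams

tokens = "the cat sat on the cat".split()
bigrams = list(ngrams(tokens, 2))
print(len(set(bigrams)) / float(len(bigrams)))   # 0.8: 4 distinct bigrams out of 5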
Example 24
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    MIT License 5 votes
def n_gram_analysis_simple(infile, gram, stop):
    ngram = dict()
    f = open(infile, "r")
    #f2 = codecs.open(outfile, "w+", "utf-8")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            # if stop:
            #     if w not in stops:
            #         if w in ngram:
            #             ngram[w] += 1
            #         else:
            #             ngram[w] = 1
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1
    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print len(p)
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print sen, cnt 
Example 25
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 26
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 27
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def transform(self, X: dt.Frame):
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
                text2 = text2_arr[ind]
                text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
                output.append(len(text1.intersection(text2)))
            except:
                output.append(-1)
        return np.array(output) 
Example 28
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 29
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 30
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 31
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 32
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def transform(self, X: dt.Frame):
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
                text2 = text2_arr[ind]
                text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
                output.append((2 * len(text1.intersection(text2))) / (len(text1) + len(text2)))
            except:
                output.append(-1)
        return np.array(output) 
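
The transformer above computes the Sørensen–Dice coefficient between the two texts' n-gram sets; the core computation in isolation (toy strings, plain Python):

import nltk

a = set(nltk.ngrams("the quick brown fox".lower().split(), 2))
b = set(nltk.ngrams("the quick red fox".lower().split(), 2))
dice = (2 * len(a.intersection(b))) / (len(a) + len(b))
print(dice)   # 0.333...: 1 shared bigram, 3 bigrams in each set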
Example 33
Project: ewe_ebooks   Author: jaymcgrath   File: bookstopher.py    MIT License 5 votes
def __init__(self, body, author='Anonymous'):

        # accumulators
        hashtags = []

        # Now process cleaned up text with NLTK
        words = []
        bigrams = []
        trigrams = []
        quadgrams = []
        sentences = []


        words = word_tokenize(body)

        sentences.extend(sent_tokenize(body))

        # Strip whitespace from each sentence
        sentences = [sentence.strip() for sentence in sentences]

        bigrams = ngrams(body, 2)
        trigrams = ngrams(body, 3)
        quadgrams = ngrams(body, 4)

        self.body = body
        self.words = words
        self.bigrams = bigrams
        self.trigrams = trigrams
        self.quadgrams = quadgrams
        self.sentences = sentences
        self.hashtags = hashtags
        self.author = author

        #TODO: Create "hashtags" from arbitrary number of rarest words 
Example 34
Project: screaming-frog-shingling   Author: jroakes   File: sf_shingling.py    MIT License 5 votes
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value) 
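
The minhash signature built above is typically used to estimate Jaccard similarity between two texts by comparing signatures position by position. A sketch of that comparison, assuming two instances of the class above (the class name ShingledText is an assumption):

doc_a = ShingledText("the quick brown fox jumps over the lazy dog and then runs far away")
doc_b = ShingledText("the quick brown fox jumps over the lazy cat and then runs far away")
matches = sum(1 for x, y in zip(doc_a.minhash, doc_b.minhash) if x == y)
print(matches / float(len(doc_a.minhash)))   # approximates the Jaccard similarity of the two shingle sets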
Example 35
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 5 votes
def log_prob(self, words, N):
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in nltk.ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob 
Example 36
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 5 votes
def log_prob(self, words, N):
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in nltk.ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob 
Example 37
Project: textkit   Author: learntextvis   File: ngrams.py    MIT License 5 votes
def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep)) 
Example 38
Project: textkit   Author: learntextvis   File: ngrams.py    MIT License 5 votes
def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err) 
Example 39
Project: ngrambot   Author: jmcgover   File: ngram.py    GNU General Public License v3.0 5 votes
def build_ngrams(tokens, low, high):
    LOGGER.debug("Building ngrams from %d to %d" % (low, high))
    assert low <= high
    assert low > 0
    grams = {}
    for n in range(low, high + 1):
        grams[n] = [g for g in ngrams(tokens, n)]
    return grams 
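
A quick usage sketch, assuming build_ngrams above is importable (LOGGER lives in the same module):

tokens = "the cat sat on the mat".split()
grams = build_ngrams(tokens, 1, 3)
print(len(grams[1]), len(grams[2]), len(grams[3]))   # 6 unigrams, 5 bigrams, 4 trigrams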
Example 40
Project: ngrambot   Author: jmcgover   File: ngram.py    GNU General Public License v3.0 5 votes
def build_pos_ngrams(tagged, low, high):
    LOGGER.debug("Building POS ngrams from %d to %d" % (low, high))
    assert low <= high
    assert low > 0
    pos_tokens = []
    pos_words = defaultdict(list)
    for word, pos in tagged:
        pos_tokens.append(pos)
        pos_words[pos].append(word)
    grams = {}
    for n in range(low, high + 1):
        grams[n] = [g for g in ngrams(pos_tokens, n)]
    return grams, pos_words 
Example 41
Project: ALaCarte   Author: NLPrinceton   File: alacarte.py    MIT License 5 votes
def read_ngrams(self, tokens):
    '''reads tokens and updates context vectors
    Args:
      tokens: list of strings
    Returns:
      None
    '''

    import nltk

    # gets location of target n-grams in document
    target_vocab = self.target_vocab
    max_n = self.max_n
    ngrams = dict()
    for n in range(1, max_n + 1):
      ngrams[n] = list(filter(lambda entry: entry[1] in target_vocab, enumerate(nltk.ngrams(tokens, n))))

    for n in range(1, max_n + 1):
      if ngrams[n]:

        # gets word embedding for each token
        w2v = self.w2v
        zero_vector = self.zero_vector
        wnd = self.wnd
        start = max(0, ngrams[n][0][0] - wnd)
        vectors = [None] * start + [w2v.get(token, zero_vector) if token else zero_vector for token in
                                    tokens[start:ngrams[n][-1][0] + n + wnd]]
        c2v = self.c2v
        target_counts = self.target_counts

        # computes context vector around each target n-gram
        for i, ngram in ngrams[n]:
          c2v[ngram] += sum(vectors[max(0, i - wnd):i], zero_vector) + sum(vectors[i + n:i + n + wnd],
                                                                           zero_vector)
          target_counts[ngram] += 1 
Example 42
Project: ALaCarte   Author: NLPrinceton   File: cooc.py    MIT License 5 votes
def ngram_vocab(n):
  ngrams = lambda docs: {ngram for doc in tokenize(doc.lower() for doc in docs) for ngram in nltk.ngrams(doc, n)}
  return sorted(set.union(*(ngrams(sst_fine(partition)[0]) for partition in ['train', 'test'])))
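  # NOTE: the return above makes the remaining lines of this function unreachable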
  vocabulary = set.union(*(ngrams(task()[0]) for task in TASKMAP['cross-validation'].values()))
  for task in TASKMAP['train-test split'].values():
    for partition in ['train', 'test']:
      try:
        vocabulary = vocabulary.union(ngrams(task(partition)[0]))
      except FileNotFoundError:
        pass
  return sorted(vocabulary) 
Example 43
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 5 votes
def build_vocabulary(self,
                         get_textset,
                         chunk_size: int,
                         total: int) -> List[str]:
        """

        :param get_textset: query that returns text data - strings, possibly multiline
        :param chunk_size: chunk size - count of string objects to read at once
        :param total: total records to read
        :return: sorted list of unique ngrams
        """
        term_by_doc = {}  # type:Dict[str, int]
        start = 0
        while start < total:
            end = start + chunk_size
            end = min(end, total)
            texts_set = get_textset(start, end)  # type: List[str]
            ngrams = self.get_ngrams(texts_set)
            for ngram in ngrams:
                if ngram in term_by_doc:
                    term_by_doc[ngram] = term_by_doc[ngram] + 1
                else:
                    term_by_doc[ngram] = 1
            start = end
            self.task.push()

        # filter by min_df / max_df
        un_count = self.units_count if self.search_similar_text_units else self.documents_count
        up_margin = math.floor(self.max_df * un_count)
        lw_margin = self.min_df if type(self.min_df) is int else math.ceil(self.min_df * un_count)
        key_list = []  # type: List[str]
        for key in term_by_doc:
            count = term_by_doc[key]
            if count < lw_margin or count > up_margin:
                continue
            key_list.append(key)
        key_list.sort()
        return key_list 
Example 44
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 5 votes
def build_matrices(self,
                       vocabulary: List[str],
                       get_textset,
                       chunk_size: int,
                       total: int) -> List[csr_matrix]:
        """
        Calculate terms (ngrams) - document matrices for all documents or
        text units, reading their text from DB by chunks
        :param vocabulary: list of unique terms, sorted
        :param get_textset: query that returns text data - strings, possibly multiline
        :param chunk_size: chunk size - count of string objects to read at once
        :param total: total records to read
        :return: term distribution matrices
        """
        ngram_range = (1, 3,) if self.term_type == self.TERM_TYPE_WORD_3GRAM \
            else (1, 1,) if self.term_type == self.TERM_TYPE_WORDS \
            else (self.char_ngrams_length, self.char_ngrams_length,)
        analyzer = 'char' if self.term_type == self.TERM_TYPE_CHAR_NGRAM else 'word'
        model = TfidfVectorizer(
            ngram_range=ngram_range,
            analyzer=analyzer,
            stop_words='english',
            vocabulary=vocabulary,
            use_idf=self.use_idf)

        dtm_chunked = []
        start = 0
        while start < total:
            end = start + chunk_size
            end = min(end, total)
            texts_set = get_textset(start, end)  # type:List[str]
            dtm_chunked.append(model.fit_transform(texts_set))
            start = end
            self.task.push()

        return dtm_chunked 
Example 45
Project: lexpredict-contraxsuite   Author: LexPredict   File: ngrams.py    GNU Affero General Public License v3.0 5 votes
def get_character_ngram_distribution(input_buffer, n=1, encoding="utf-8"):
    """
    Get distribution of character ngrams from input_buffer.
    :param input_buffer: input buffer
    :param n: n value, number of consecutive items
    :param encoding: default encoding
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    # Convert to character ngrams
    ngrams = list(nltk.ngrams(input_buffer, n=n))
    return dict([(g, ngrams.count(g)) for g in set(ngrams)]) 
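
A usage sketch, assuming the function above is importable; with n=2 the distribution is over character bigrams:

print(get_character_ngram_distribution("banana", n=2))
# {('b', 'a'): 1, ('a', 'n'): 2, ('n', 'a'): 2}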
Example 46
Project: LobbyTrack   Author: regardscitoyens   File: lobbyTrack.py    GNU Affero General Public License v3.0 5 votes
def getNGrams(raw_string, gram_nb):
    xgrams = ngrams(raw_string.split(), gram_nb)
    return xgrams 
Example 47
Project: duorc   Author: duorc   File: question_parser_lucene2.py    MIT License 5 votes
def get_continuous_chunks(text, all_possible_ngrams, stop, split_into_commas=False):
    text = text.lower()
    text = unicodedata.normalize('NFKD', unicode(text, "utf-8")).encode('ascii', 'ignore')
    toks = nltk.word_tokenize(text)
    return_chunks = []
    return_chunks_tokenized = []
    if split_into_commas and "," in text:
        for chunk in text.split(","):
            return_chunks.append(chunk)
            chunk_toks = nltk.word_tokenize(chunk)
            return_chunks_tokenized.append(chunk_toks)
    if all_possible_ngrams:
        ngrams = set([])
        ngrams.update([x for x in toks if x not in stop and not isint(x)])
        ngrams.update([' '.join(list(x)) for x in nltk.bigrams(toks) if len(set(x) - stop) > 0])
        for ngram_counter in range(3, 6):
            ngrams.update([' '.join(list(x)) for x in nltk.ngrams(toks, ngram_counter) if len(set(x) - stop) > 0])
        return_chunks = ngrams
        return_chunks_tokenized = [x.split(" ") for x in ngrams]
    else:
        postoks = nltk.pos_tag(toks)
        tree = chunker.parse(postoks)
        # print tree
        super_list = [w for w, t in tree.leaves()]

        for subtree in tree.subtrees():
            # print subtree
            if subtree == tree:
                continue
            chunk_list = [x[0].strip() for x in subtree.leaves()]
            chunk = ' '.join(chunk_list).strip()
            if len(chunk) <= 1:
                continue
            if chunk not in return_chunks:
                return_chunks.append(chunk)
                return_chunks_tokenized.append(chunk_list)
            # values.add(chunk)

    return return_chunks, return_chunks_tokenized, toks 
Example 48
Project: atap   Author: foxbook   File: ngrams.py    Apache License 2.0 5 votes
def ngrams2(text, n=2):
    for sent in sent_tokenize(text):
        sent = word_tokenize(sent)
        for ngram in nltk_ngrams(sent, n):
            yield ngram 
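
A usage sketch, assuming the imports used above (sent_tokenize, word_tokenize, and nltk_ngrams from NLTK):

for ngram in ngrams2("The cat sat. The dog barked.", n=2):
    print(ngram)
# ('The', 'cat'), ('cat', 'sat'), ('sat', '.'), ('The', 'dog'), ('dog', 'barked'), ('barked', '.')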
Example 49
Project: text-shingles   Author: steven-s   File: shingles.py    MIT License 5 votes
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value) 
Example 50
Project: cso-classifier   Author: angelosalatino   File: syntacticmodule.py    Apache License 2.0 5 votes
def get_ngrams(self, concept):
        """ Function that returns n-grams of concept in reverse order (3,2, and 1)
        """
        for n in range(3, 0, -1):
            pos = 0
            for ng in ngrams(word_tokenize(concept, preserve_line=True), n):
                yield {"position": pos, "size": n, "ngram": ng}
                pos += 1 
Example 51
Project: semanticRetrievalMRS   Author: easonnie   File: document_analysis.py    MIT License 5 votes
def get_ngrams(terms, poss=None, n=1, included_tags=None, as_strings=True):
    """Returns a list of all ngrams from length 1 to n.
    """
    ngrams = [(s, e + 1)
              for s in range(len(terms))
              for e in range(s, min(s + n, len(terms)))]

    if poss is not None and included_tags is not None:  # We do filtering according to pos.
        # ngrampos = [(s, e + 1)
        #             for s in range(len(poss))
        #             for e in range(s, min(s + n, len(poss)))]

        filtered_ngram = []
        for (s, e) in ngrams:
            if any([poss[i] in included_tags for i in range(s, e)]):
                filtered_ngram.append((s, e))

        ngrams = filtered_ngram

    # Concatenate into strings
    if as_strings:
        ngrams = ['{}'.format(' '.join(terms[s:e])) for (s, e) in ngrams]

    return ngrams


# Open class words	Closed class words	Other
# ADJ	            ADP	                PUNCT
# ADV	            AUX	                SYM
# INTJ	            CCONJ	            X
# NOUN	            DET
# PROPN	            NUM
# VERB	            PART
#                   PRON
#                   SCONJ 
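
For instance, with n=2 the get_ngrams helper above returns all unigrams and bigrams as strings (a small illustrative call):

print(get_ngrams("the cat sat".split(), n=2))
# ['the', 'the cat', 'cat', 'cat sat', 'sat']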
Example 52
Project: DiPS   Author: malllabiisc   File: submodular_funcs.py    Apache License 2.0 5 votes
def ngram_toks(sents, n=1):
    ntoks =[]
    for sent in sents:
        ntok = list(ngrams(sent.split(), n))
        newtoks = [tok for tok in ntok]
        ntoks+= newtoks
    return ntoks 
Example 53
Project: DiPS   Author: malllabiisc   File: distinct_metric.py    Apache License 2.0 5 votes
def ngram_toks(sents, n=1):
    ntoks =[]
    for sent in sents:
        ntok = list(ngrams(sent.split(), n))
        newtoks = [tok for tok in ntok]
        ntoks+= newtoks
    return ntoks 
Example 54
Project: infodens   Author: ahmad-taie   File: bag_of_ngrams_features.py    GNU General Public License v3.0 5 votes
def ngramArgumentCheck(self, args, ngramType):

        proc_train = ""
        proc_test = ""
        parser = argparse.ArgumentParser(description='Bag of ngrams args')
        parser.add_argument("-train", help="Path for file to build ngram vector from.",
                            type=str, default="")
        if ngramType != "plain":
            parser.add_argument("-proc_train", help="Path for POS/Lemma tagged train sentences.",
                                type=str, default="")
            parser.add_argument("-proc_test", help="Path for POS/lemma tagged test sentences.",
                                type=str, default="")
        parser.add_argument("-ngram", help="Order of ngram.",
                            type=int, default=1)
        parser.add_argument("-cutoff", help="Min. Cutoff for ngram.",
                            type=int, default=1)
        parser.add_argument("-hash_size", help="Size of output vector from hashing.",
                            type=int, default=None)  # Default is no hashing

        argsOut = parser.parse_args(args.split())
        if ngramType != "plain":
            proc_train = argsOut.proc_train
            proc_test = argsOut.proc_test

        return argsOut.ngram, argsOut.cutoff, argsOut.hash_size,\
               argsOut.train, proc_train, proc_test 
Example 55
Project: infodens   Author: ahmad-taie   File: bag_of_ngrams_features.py    GNU General Public License v3.0 5 votes
def extractNgram(self, listOfSentences, n, numberOfFeatures, finNgram):

        ngramFeatures = sparse.lil_matrix((len(listOfSentences), numberOfFeatures))
        for i in range(len(listOfSentences)):
            ngramsVocab = Counter(ngrams(listOfSentences[i], n))
            lenSent = len(ngramsVocab)

            for ngramEntry in ngramsVocab:
                ## Keys
                ngramIndex = finNgram.get(ngramEntry, -1)
                if ngramIndex >= 0:
                    ngramFeatures[i, ngramIndex] = round((float(ngramsVocab[ngramEntry]) / lenSent), 2)

        return ngramFeatures 
Example 56
Project: infodens   Author: ahmad-taie   File: surprisal_features.py    GNU General Public License v3.0 5 votes
def ngramArgCheck(self, argString):

        parser = argparse.ArgumentParser(description='Ngram quantile surprisal args')
        parser.add_argument("-ngram", help="Order of ngrams.",
                            type=int, default=1)
        parser.add_argument("-cutoff", help="cuttoff frequency",
                            type=int, default=1)
        parser.add_argument("-n_quantiles", help="number of quantiles",
                            type=int, default=4)
        argsOut = parser.parse_args(argString.split())

        return argsOut.ngram, argsOut.cutoff, argsOut.n_quantiles 
Example 57
Project: infodens   Author: ahmad-taie   File: surprisal_features.py    GNU General Public License v3.0 5 votes
def getQuantiles(self, listOfSentences, n, quantile, finNgram):

        ngramFeatures = sparse.lil_matrix((len(listOfSentences), quantile + 1))

        for i in range(len(listOfSentences)):
            ngramsVocab = Counter(ngrams(listOfSentences[i], n))
            lenSent = 0
            for ngramEntry in ngramsVocab:
                ## Keys
                ngramIndex = finNgram.get(ngramEntry, -1)
                if ngramIndex >= 0:
                    ngramIndex -= 1
                    toAdd = ngramsVocab[ngramEntry]
                    ngramFeatures[i, ngramIndex] += toAdd
                    lenSent += toAdd
                else:
                    # OOV ngram (below cut-off)
                    toAdd = ngramsVocab[ngramEntry]
                    ngramFeatures[i, -1] += toAdd
                    lenSent += toAdd

            if lenSent:
                for j in range(0, quantile+1):
                    ngramFeatures[i, j] /= lenSent

        return ngramFeatures 
Example 58
Project: infodens   Author: ahmad-taie   File: preprocess_services.py    GNU General Public License v3.0 5 votes
def buildNgrams(self, n, freq, tokens, indexing=True):
        """Build and return ngrams from given tokens."""
        ngramsDict = defaultdict(int)
        for sent in tokens:
            ngramsList = list(nltk.ngrams(sent, n))
            for anNgram in ngramsList:
                ngramsDict[anNgram] += 1

        return self.ngramMinFreq(ngramsDict, freq, indexing) 
Example 59
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_baseline_features.py    MIT License 5 votes
def get_ngram_list(tknzr, text, n):
    tokens = tknzr.tokenize(text)
    tokens = [t for t in tokens if not t.startswith('#')]
    tokens = [t for t in tokens if not t.startswith('@')]
    ngram_list = [gram for gram in ngrams(tokens, n)]
    return ngram_list 
Example 60
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_ml_features.py    MIT License 5 votes
def get_ngrams(tokens, n, syntactic_data=False):
    if len(n) < 1:
        return {}
    if not syntactic_data:
        filtered = []
        stopwords = data_proc.get_stopwords_list()
        for t in tokens:
            if t not in stopwords and t.isalnum():
                filtered.append(t)
        tokens = filtered
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = str(i) + '-gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 16 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features) 
Example 61
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_statistical_features.py    MIT License 5 votes
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features) 
Example 62
Project: tensorflow-nlp-examples   Author: Hironsan   File: preprocessing.py    MIT License 5 votes
def generate_ngrams(self, words, n=7):
        res = []
        seqlen = len(words)
        for i in range(1, n + 1):
            for ngram in ngrams(range(seqlen), i):
                l, r = ngram[0], ngram[-1] + 1
                res.append((l, r))
        return res 
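
The method enumerates every candidate token span up to length n; for a three-token input and n=2 the spans are (a hypothetical instance name is used):

spans = preprocessor.generate_ngrams(["New", "York", "City"], n=2)
print(spans)   # [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3)]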
Example 63
Project: ambientsearch   Author: bmilde   File: keyword_extract.py    Apache License 2.0 5 votes
def getKeywordsDruid(self, tokens):

        keywords = defaultdict(int)
        keywords_pos = defaultdict(list)

        if len(self.keyword_dict) == 0:
            print 'Warning, no Druid cache found. Wont be able to detect keywords.'
            return []

        # Automatically tokenize strings if necessary
        if type(tokens) is str or type(tokens) is unicode:
            tokens = nltk.word_tokenize(tokens)

        #Unigram to fourgram
        for x in xrange(1,5):
            seq = nltk.ngrams(tokens, x)
            for i,gram in enumerate(seq):
                search_gram = u' '.join(gram).lower()
                #We score and rank keywords heuristically here and modify the druid score a bit: common words get penalized more, multiwords get a better score
                if len(search_gram) > 2 and search_gram in self.keyword_dict:
                    gram_factor = 1.0
                    if search_gram in self.common_words or search_gram[:-1] in self.common_words:
                        penality_factor = 0.1
                    else:
                        penality_factor = 1.0
                    keywords[search_gram] += self.keyword_dict[search_gram]*(x*gram_factor)*penality_factor

                    for pos in xrange(i,x+i):
                        keywords_pos[pos] += [search_gram]

        #Print keywords_pos
        keywords = self.mergeKeywords(keywords, keywords_pos)
        keywords_sorted = sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
        # Normalize scores to be in the range of 0.0 - 1.0
        keywords_sorted_normalized = [(item[0],self.normalize_keywordscore(item[1])) for item in keywords_sorted]
        print 'keywords_sorted_normalized:',keywords_sorted_normalized

        return keywords_sorted_normalized

    #Build a dictionary of DRUID keywords. Input is basically a filelist with multiwordness scores for 1-4 grams produced from the algorithm. Numbers and stopwords are filtered, the rest is taken as is. 
Example 64
Project: Transferable-E2E-ABSA   Author: hsqmlzno1   File: utils.py    MIT License 5 votes
def set_wid(dataset, vocab, win=1):
    """
    set wid field for the dataset
    :param dataset: dataset
    :param vocab: vocabulary
    :param win: context window size, for window-based input, should be an odd number
    :return: dataset with field wid
    """
    n_records = len(dataset)
    for i in range(n_records):
        words = dataset[i]['words']
        lm_labels = []
        # set labels for the auxiliary language modeling task
        for w in words:
            lm_labels.append(vocab[w])
        dataset[i]['lm_labels'] = list(lm_labels)
        n_padded_words = win // 2
        pad_left = ['PADDING' for _ in range(n_padded_words)]
        pad_right = ['PADDING' for _ in range(n_padded_words)]
        padded_words = pad_left + words + pad_right
        # the window-based input
        win_input = list(ngrams(padded_words, win))
        assert len(win_input) == len(words)
        n_grams = []
        for t in win_input:
            n_grams.append(t)
        wids = [[vocab[w] for w in ngram] for ngram in n_grams]
        dataset[i]['wids'] = list(wids)
    return dataset 
Example 65
Project: flambe   Author: asappresearch   File: word.py    MIT License 5 votes
def _tokenize(example: str, n: int) -> List[str]:
        """Tokenize an input example using ngrams.

        """
        return list(" ".join(x) if len(x) > 1 else x[0] for x in ngrams(word_tokenize(example), n)) 
Example 66
Project: dl-with-constraints   Author: dair-iitd   File: atis_world.py    MIT License 5 votes
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in ATIS_TRIGGER_DICT.get(' '.join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index,
                                                  index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == 'st':
            natural_language_key = f'st. {trigram[2]}'.lower()
        else:
            natural_language_key = ' '.join(trigram).lower()
        for string in ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index,
                                                  index + 1,
                                                  index + 2])
    return string_linking_scores 
Example 67
Project: algodb   Author: xkxx   File: stringmatching_ngrams.py    MIT License 5 votes
def string_match(stop_words, algo_names, corpus,  file_names):
    idx = 0
    maxN = max(len(name) for name in algo_names)

    detected = {}
    algo_total_frequency = {}
    for doc in corpus:
        tokens = re.split(r'\s+', doc)
        tokens = remove_stopwords(tokens, stop_words)

        algo_doc_frequency = {}
        for n in range(1, maxN+1):
            for ngram in ngrams(tokens, n):
                if ngram in algo_names:
                    name = " ".join(ngram)
                    if name not in algo_doc_frequency:
                        algo_doc_frequency[name] = 0
                    algo_doc_frequency[name] += 1
        
        for name, freq in algo_doc_frequency.items():
            if name not in algo_total_frequency:
                algo_total_frequency[name] = 0
            algo_total_frequency[name] += 1
        detected[file_names[idx]] = algo_doc_frequency
        idx += 1
    tf_idf(detected, algo_total_frequency)
    return detected 
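
A self-contained sketch of the matching loop above, assuming algo_names holds tuples of lowercase tokens (the toy data below is not the project's):

from collections import Counter
from nltk import ngrams

algo_names = {('quick', 'sort'), ('binary', 'search', 'tree')}  # toy name set
tokens = 'we used quick sort and a binary search tree here'.split()
max_n = max(len(name) for name in algo_names)

doc_frequency = Counter()
for n in range(1, max_n + 1):
    for gram in ngrams(tokens, n):
        if gram in algo_names:
            doc_frequency[' '.join(gram)] += 1
print(doc_frequency)
# Counter({'quick sort': 1, 'binary search tree': 1})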
Example 68
Project: eXposeDeepNeuralNetwork   Author: joshsaxe   File: features.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    # character 2-, 3-, 4- and 5-grams of the input string
    grams = list(ngrams(string, 2)) + list(ngrams(string, 3)) + list(ngrams(string, 4)) + list(ngrams(string, 5))
    SIZE = 1024
    vec = zeros((SIZE,))
    for t in grams:
        vec[hash(t)%SIZE]+=1
    return log(vec+1.0) 
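
A hedged sketch of the same feature hashing: character 2- to 5-grams are hashed into a fixed-size count vector and log-scaled (toy input string; note Python's hash() is salted per process, so bucket indices differ between runs):

from nltk import ngrams
from numpy import log, zeros

def char_ngram_features(string, size=1024):
    grams = []
    for n in range(2, 6):
        grams += list(ngrams(string, n))  # ngrams() over a str yields character tuples
    vec = zeros((size,))
    for g in grams:
        vec[hash(g) % size] += 1
    return log(vec + 1.0)

features = char_ngram_features('example.com/login')  # toy input
print(features.shape, int(features.sum() > 0))
# (1024,) 1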
Example 69
Project: deepbond   Author: mtreviso   File: error_analysis.py    MIT License 4 votes vote down vote up
def ngram_importance(self, n=[1, 2, 3], k=10):
        # top-k argmax over w1..wn of P(yn=1 | w1..wn)
        # P(y | w1..wn) = p(w1..wn | yn=1) * p(yn) / p(w1 .. wn)
        # ---
        # p(w1 .. wn) = f(w1 .. wn) / t(w1 .. wn)
        # p(yn) = f(yn) / t(y)
        # p(w1..wn | yn=1) = f(w1 .. wn ^ yn=1) / f(yn=1)
        remove_period = lambda v: [x for x in v if x != '.']
        text = remove_period(self.unrolled_text)

        y = 1
        fy_g = FreqDist(self.golds)
        fy_p = FreqDist(self.preds)

        for n_ in n:
            grams = list(ngrams(text, n_, pad_left=True))
            fwy_g = FreqDist(list(zip(grams, self.golds)))
            fwy_p = FreqDist(list(zip(grams, self.preds)))
            pwy_g = lambda w, y: fwy_g[(w, y)] / fy_g[y]
            pwy_p = lambda w, y: fwy_p[(w, y)] / fy_p[y]

            logger.info('Top %d %d-gram before period: ' % (k, n_))
            vg = [(w, pwy_g(w, y)) for w in set(grams) if pwy_g(w, y) > 0]
            vg = sorted(vg, reverse=True, key=lambda x: x[1])[:k]

            vp = [(w, pwy_p(w, y)) for w in set(grams) if pwy_p(w, y) > 0]
            vp = sorted(vp, reverse=True, key=lambda x: x[1])[:k]

            logger.info('%32s | %32s |' % ('Gold', 'Pred'))
            logger.info('-' * 33 + '+-' + '-' * 33 + '+')
            for i in range(min(len(vp), len(vg))):
                wg = ' '.join(vg[i][0])
                wp = ' '.join(vp[i][0])
                logger.info('%32s | %32s |' % (wg, wp))
            logger.info('-' * 33 + '+-' + '-' * 33 + '+')

        # index, values = [], []
        # for w, p in sorted(vp, reverse=True, key=lambda x: x[1])[:k]:
        # index.append()
        # values.append(p)
        # logger.debug(' '.join(w))
        # df = pd.Series(values, index=index)
        # df.plot(kind='bar', logy=True)
        # plt.xlabel('Word')
        # plt.ylabel('Probability')
        # plt.title('P(y | x1 ... xn)')
        # plt.show() 
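
The core estimate P(ngram | y=1) can be reproduced with FreqDist over zipped (ngram, label) pairs; a hedged sketch with a toy token/label sequence (illustrative only, not the project's data):

from nltk import FreqDist, ngrams

tokens = ['we', 'ate', 'then', 'we', 'left', 'then']
labels = [0, 0, 1, 0, 0, 1]  # 1 = a sentence boundary follows this token

grams = list(ngrams(tokens, 2, pad_left=True))  # one bigram per token position
fy = FreqDist(labels)
fwy = FreqDist(zip(grams, labels))
pwy = {g: fwy[(g, 1)] / fy[1] for g in set(grams) if fwy[(g, 1)] > 0}
print(sorted(pwy.items(), key=lambda x: -x[1]))
# e.g. [(('ate', 'then'), 0.5), (('left', 'then'), 0.5)]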
Example 70
Project: twirpbot   Author: cozpii   File: get_most_common_unigrams_in_tweets.py    GNU General Public License v3.0 4 votes vote down vote up
def get_most_common_unigrams_in_tweets(twitter, name, maxTweets):
    stopwords_english = stopwords.words('english')
    id_of_earliest_tweet = None
    count = 0
    list_of_tweets = []
    all_unigrams = []
    all_unigrams_without_noise = []
    all_stemmed_unigrams = []
    stemmer = PorterStemmer()
    new_statuses = []

    while count < maxTweets:
        if id_of_earliest_tweet is None:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', include_rts='false', exclude_replies='false')

        else:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', max_id=id_of_earliest_tweet - 1, include_rts='false', exclude_replies='false')

        new_statuses += statuses

        for tweet in statuses:
            count += 1
            list_of_tweets.append(tweet['id'])

        if(len(statuses)):
            id_of_earliest_tweet = sorted(list_of_tweets)[0]
        else:
            break

    for tweet in new_statuses:
        unigrams = word_tokenize(tweet['full_text'].lower())
        all_unigrams += unigrams

    for word in all_unigrams:
        stemmed_unigram = stemmer.stem(word)
        all_stemmed_unigrams.append(stemmed_unigram)

    for word in all_stemmed_unigrams:
        if word in stopwords_english or len(word) < 4 or 'http' in word or 'www' in word or '//' in word:
            continue
        else:
            all_unigrams_without_noise.append(word)

    all_unigrams_list = list(ngrams(all_unigrams_without_noise,1))
    frequencies = Counter(all_unigrams_list)
    for token, count in frequencies.most_common(10):
        # print(token, count)
        return token[0]  # returns only the single most common unigram 
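
Why the code indexes token[0]: ngrams(tokens, 1) yields 1-tuples, so the Counter keys are tuples, and the early return reports only the single most common unigram. A small illustrative check (toy tokens):

from collections import Counter
from nltk import ngrams

clean_tokens = ['python', 'code', 'python', 'tweet']
frequencies = Counter(ngrams(clean_tokens, 1))
print(frequencies.most_common(1))          # [(('python',), 2)]
print(frequencies.most_common(1)[0][0][0])  # 'python'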
Example 71
Project: twirpbot   Author: cozpii   File: get_most_common_unigrams_in_retweets.py    GNU General Public License v3.0 4 votes vote down vote up
def get_most_common_unigrams_in_retweets(twitter, name, maxTweets):
    stopwords_english = stopwords.words('english')
    id_of_earliest_tweet = None
    count = 0
    list_of_tweets = []
    all_unigrams = []
    all_unigrams_without_noise = []
    all_stemmed_unigrams = []
    stemmer = PorterStemmer()
    retweets = []

    while count < maxTweets:
        if id_of_earliest_tweet is None:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', include_rts='true', exclude_replies='true')

        else:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', max_id=id_of_earliest_tweet - 1, include_rts='true', exclude_replies='true')

        for tweet in statuses:
            if tweet['full_text'].strip().split()[0] == 'RT':
                retweets.append(tweet['full_text'])
            count += 1
            list_of_tweets.append(tweet['id'])

        if(len(statuses)):
            id_of_earliest_tweet = sorted(list_of_tweets)[0]
        else:
            break

    for tweet in retweets:
        unigrams = word_tokenize(tweet.lower())
        all_unigrams += unigrams

    for word in all_unigrams:
        stemmed_unigram = stemmer.stem(word)
        all_stemmed_unigrams.append(stemmed_unigram)

    for word in all_stemmed_unigrams:
        if word in stopwords_english or len(word) < 4 or 'http' in word or 'www' in word or '//' in word:
            continue
        else:
            all_unigrams_without_noise.append(word)

    all_unigrams_list = list(ngrams(all_unigrams_without_noise, 1))
    frequencies = Counter(all_unigrams_list)
    for token, count in frequencies.most_common(10):
        # print(token, count)
        return token[0]  # returns only the single most common unigram 
Example 72
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 4 votes vote down vote up
def extract_ngrams_count(sentences, stemmer, language, stoplist, N=2):
    '''
    Extract n-grams and count how many times each n-gram appears.
    :param sentences: list of sentences, each sentence a string
    :param stemmer: stemmer used by sent2stokens to normalize tokens
    :param language: language passed to the tokenizer
    :param stoplist: stopwords; n-grams made up only of stopwords are skipped
    :param N: length of the n-grams, e.g. 1, 2
    :return: a dictionary mapping each n-gram string to its count

    example input : ['This is a foo bar sentence']
    example output: {'this is' : 1, 'is a' : 1, 'a foo' : 1, ...}
    '''
    #TODO: I am not sure whether we should remove all stopwords or not; maybe try both settings
    ngrams_count_dic= {}
    for i, sentence in enumerate(sentences):

        # for each ngram of words
        #sent = re.sub('[-](,?\s)','\\1', sentence) #case where magister- has to be handled
        #tokens = sent2stokens_wostop(sentence,stemmer,stoplist,language)
        tokens = sent2stokens(sentence,stemmer,language)
        for j in range(len(tokens)-(N-1)):
            # initialize ngram container
            ngram = []

            # for each token of the ngram
            for k in range(j, j+N):
                ngram.append(tokens[k].lower())

            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue

            # do not consider ngrams composed of only stopwords
            stops = [t for t in ngram if t in stoplist]
            if len(stops) == len(ngram):
                continue

            # stem the ngram
            #ngram = [stemmer.stem(t) for t in ngram]
            ngram = ' '.join(ngram)
            #print('ngram: '+repr(ngram))

            # add check whether this n-gram has already been contained in the n-grams list
            if ngram in ngrams_count_dic:
                ngrams_count_dic[ngram] = ngrams_count_dic[ngram] + 1
            else:
                ngrams_count_dic[ngram] = 1
    return ngrams_count_dic 
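
A self-contained sketch of the same counting that skips the stemming and stopword handling (the simplified tokenization is an assumption, not the project's sent2stokens; requires the nltk 'punkt' data):

import re
from collections import Counter
from nltk import ngrams, word_tokenize

def count_ngrams(sentences, N=2):
    counts = Counter()
    for sentence in sentences:
        tokens = [t.lower() for t in word_tokenize(sentence)]
        for gram in ngrams(tokens, N):
            # skip ngrams that contain a punctuation-only token, as the example does
            if any(not re.search('[a-zA-Z0-9]', t) for t in gram):
                continue
            counts[' '.join(gram)] += 1
    return dict(counts)

print(count_ngrams(['This is a foo bar sentence.']))
# {'this is': 1, 'is a': 1, 'a foo': 1, 'foo bar': 1, 'bar sentence': 1}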
Example 73
Project: clickbait   Author: bhargaviparanjape   File: utility.py    MIT License 4 votes vote down vote up
def naive_bayes(analysis):	
	tags = []
	words = []
	deps_cc = []
	for sen in analysis["sentences"]:
		tags += sen['pos']
		words += sen['tokens']
		deps_cc += sen["deps_cc"]
	norm = normalize_title(tags, words)

	f1 = []	
	current = list(nltk.ngrams(norm.split(), 1)) + list(nltk.ngrams(norm.split(), 2)) + list(nltk.ngrams(norm.split(),3))
	ngram_list = [' '.join(list(g)) for g in current]
	for pos in common_grams:
		if pos in ngram_list:
			f1.append(1)
		else:
			f1.append(0)
	f1 = numpy.array(f1).reshape(1, len(f1))

	#pos ngrams
	f2 = []
	current_pos = list(nltk.ngrams(tags, 1)) + list(nltk.ngrams(tags, 2)) + list(nltk.ngrams(tags,3))
	ngram_list = [' '.join(list(g)) for g in current_pos]
	for pos in common_pos_grams:
		if pos in ngram_list:
			f2.append(1)
		else:
			f2.append(0)
	f2 = numpy.array(f2).reshape(1, len(f2))
	# print f2.shape


	# syntactic ngrams
	f3 = []
	current_sngrams = list(syntactic_n_gram(deps_cc, 1)) + list(syntactic_n_gram(deps_cc, 2)) + list(syntactic_n_gram(deps_cc, 3))
	ngram_list = [' '.join(list(g)) for g in current_sngrams]
	for pos in common_sn_grams:
		if pos in ngram_list:
			f3.append(1)
		else:
			f3.append(0)
	f3 = numpy.array(f3).reshape(1, len(f3))

	return [clf1.predict(f1)[0], clf2.predict(f2)[0], clf3.predict(f3)[0]] 
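
A hedged sketch of the binary ngram-presence features built above (common_grams here is a toy list, not the classifier's learned vocabulary):

import numpy
import nltk

common_grams = ['you', 'will not', 'you will not believe']  # toy vocabulary
title_tokens = 'you will not believe this'.split()

current = list(nltk.ngrams(title_tokens, 1)) + list(nltk.ngrams(title_tokens, 2)) + list(nltk.ngrams(title_tokens, 3))
ngram_list = [' '.join(g) for g in current]
f1 = numpy.array([1 if g in ngram_list else 0 for g in common_grams]).reshape(1, -1)
print(f1)  # [[1 1 0]]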
Example 74
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 4 votes vote down vote up
def train(self, corpus_fp, vocab=None, encoding=None):
        N = self.N
        H = self.hyperparameters
        models, counts = {}, {}
        grams = {n: [] for n in range(1, N + 1)}
        gg = {n: [] for n in range(1, N + 1)}
        filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

        n_words = 0
        tokens = set([])

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                words = tokenize_words(line, filter_punc, filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams
                for n in range(1, N + 1):
                    grams[n].append(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )

                    gg[n].extend(
                        list(
                            nltk.ngrams(
                                words,
                                n,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol="<bol>",
                                right_pad_symbol="<eol>",
                            )
                        )
                    )

        for n in range(1, N + 1):
            counts[n] = nltk.FreqDist(gg[n])
            models[n] = nltk.lm.MLE(order=n)
            models[n].fit(grams[n], tokens)

        self.counts = counts
        self.n_words = n_words
        self._models = models
        self.n_tokens = len(vocab) if vocab is not None else len(tokens) 
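
A quick illustration of the padded ngram streams that get counted and fed to the models (toy sentence, illustrative only):

import nltk

words = ['the', 'cat', 'sat']
padded_bigrams = list(
    nltk.ngrams(words, 2, pad_left=True, pad_right=True,
                left_pad_symbol="<bol>", right_pad_symbol="<eol>")
)
print(padded_bigrams)
# [('<bol>', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', '<eol>')]
print(nltk.FreqDist(padded_bigrams)[('the', 'cat')])
# 1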
Example 75
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 4 votes vote down vote up
def train(self, corpus_fp, vocab=None, encoding=None):
        N = self.N
        H = self.hyperparameters
        models, counts = {}, {}
        grams = {n: [] for n in range(1, N + 1)}
        gg = {n: [] for n in range(1, N + 1)}
        filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

        n_words = 0
        tokens = set()

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                words = tokenize_words(line, filter_punc, filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams
                for n in range(1, N + 1):
                    grams[n].append(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )

                    gg[n].extend(
                        list(
                            nltk.ngrams(
                                words,
                                n,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol="<bol>",
                                right_pad_symbol="<eol>",
                            )
                        )
                    )

        for n in range(1, N + 1):
            counts[n] = nltk.FreqDist(gg[n])
            models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)
            models[n].fit(grams[n], tokens)

        self.counts = counts
        self._models = models
        self.n_words = n_words
        self.n_tokens = len(vocab) if vocab is not None else len(tokens) 
Example 76
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def __init__(self,
                 task: ExtendedTask,  # task object to log messages and report progress
                 should_delete: bool = True,  # delete existing DocumentSimilarity entries
                 project_id: Optional[int] = None,  # optional project filter
                 search_similar_documents: bool = True,  # we either search for similar documents...
                 search_similar_text_units: bool = False,  # ... or TextUnit-s
                 # min "correlation" to consider 2 documents (or TextUnit-s) similar
                 similarity_threshold: int = 75,
                 # should we use Inverse Document Frequency to obtain (sometimes) more precise results?
                 use_idf: bool = False,
                 # process text as character ngrams, just words or word ngrams
                 term_type: str = 'WORDS',
                 # character count in character ngrams (if term_type is 'CHAR_NGRAMS')
                 char_ngrams_length: int = 6,
                 ignore_case: bool = True):
        self.task = task
        self.project_id = project_id
        self.should_delete = should_delete
        self.similarity_threshold = similarity_threshold
        self.search_similar_documents = search_similar_documents
        self.search_similar_text_units = search_similar_text_units
        self.use_idf = use_idf
        self.term_type = term_type
        self.char_ngrams_length = char_ngrams_length
        self.ignore_case = ignore_case

        # min term frequency, integer value is for absolute occurrence count,
        # float value is for relative occurrence (per total entries count)
        # Used while building vocabulary
        self.min_df = 2
        # same as min_df, but for upper limit
        self.max_df = 0.5
        # documents count, used when search_similar_documents is True
        self.documents_count = 0
        # text units count, used when search_similar_text_units is True
        self.units_count = 0
        # buffer to accumulate storing DocumentSimilarity items for bulk insert operation
        self.docsim_store_buffer = []  # type:List[DocumentSimilarity]
        # buffer to accumulate storing TextUnitSimilarity items for bulk insert operation
        self.unsim_store_buffer = []  # type:List[TextUnitSimilarity]
        # flush buffer when it reaches the limit
        self.store_buf_flush_count = 1000
        # used for logging time spent for each stage of the task's calculations
        self.timings = []  # type:List[Tuple[str, datetime.datetime]]
        # used when search_similar_documents is True - all documents or
        # just the documents of the specified project
        self.doc_query = Document.objects.all() if not self.project_id \
            else Document.objects.filter(project_id=self.project_id)
        # used when search_similar_text_units is True - all text units or
        # just the text units of the specified project
        self.text_unit_query = None
        if search_similar_text_units:
            filters = dict(unit_type='paragraph', textunittext__text__regex=self.unit_text_regex)
            if self.project_id:
                filters['document__project_id'] = project_id
            self.text_unit_query = TextUnit.objects.filter(**filters) 
Example 77
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def get_ngrams(self, texts: List[str]) -> Iterable[str]:
        """
        Make 1- to 3-word tuples out of the word list if self.term_type is 'WORD_3GRAMS',
        return the words themselves if self.term_type == 'WORDS',
        else return character ngrams ('CHAR_NGRAMS').
        :param texts: list of text strings
        :return: list of terms, e.g. ['word_1', 'word_1 word_2', 'word_1 word_2 word_3', 'word_4', ...]
        """
        all_ngrams = []

        if self.term_type == self.TERM_TYPE_CHAR_NGRAM:
            for text in texts:
                word_set = set()  # type: Set[str]
                wrd = ''
                for c in text:
                    wrd += c
                    if len(wrd) > self.char_ngrams_length:
                        wrd = wrd[1:]
                    word_set.add(wrd)
                for ngram in word_set:
                    all_ngrams.append(ngram)
            return all_ngrams

        if self.term_type == self.TERM_TYPE_WORDS:
            all_words = []
            for text in texts:
                word_set = set()  # type: Set[str]
                for wrd in self.reg_wordsplit.split(text):
                    if wrd:
                        word_set.add(wrd)
                for wrd in word_set:
                    all_words.append(wrd)
            return all_words

        # if self.term_type == self.TERM_TYPE_WORD_3GRAM:
        # We produce ngrams for a whole batch of documents at a time, so when the
        # vocabulary is built afterwards we want to avoid associating terms that
        # are not really associated.
        for text in texts:
            allterms = self.reg_wordsplit.split(text)
            ngram_set = set()  # type:Set[str]
            # create 1-grams, 2-grams and 3-grams and zip() them all.
            for g in zip(ngrams(allterms, 1), ngrams(allterms, 2), ngrams(allterms, 3)):
                for w in map(lambda wrd: ' '.join(wrd), g):
                    ngram_set.add(w)
            for w in ngram_set:
                all_ngrams.append(w)
        return all_ngrams 
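
What the zip() of 1-, 2- and 3-grams produces: one (unigram, bigram, trigram) triple per trigram position, so the stream stops with the trigrams (toy word list, illustrative only):

from nltk import ngrams

allterms = ['term', 'of', 'the', 'lease']
for g in zip(ngrams(allterms, 1), ngrams(allterms, 2), ngrams(allterms, 3)):
    print([' '.join(w) for w in g])
# ['term', 'term of', 'term of the']
# ['of', 'of the', 'of the lease']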
Example 78
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    Apache License 2.0 4 votes vote down vote up
def get_date_from_utterance(tokenized_utterance: List[Token], year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """

    dates = []

    utterance = " ".join([token.text for token in tokenized_utterance])
    year_result = re.findall(r"199[0-4]", utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = " ".join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print("invalid month day")

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print("invalid month day")

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = " ".join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print("invalid month day")
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print("invalid month day")
    return dates 
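
A hedged, self-contained sketch of the trigram pass above, with toy lookup tables standing in for the module's MONTH_NUMBERS and DAY_NUMBERS:

from datetime import datetime
from nltk import ngrams

MONTHS = {'september': 9}        # toy stand-in for MONTH_NUMBERS
DAYS = {'twenty first': 21}      # toy stand-in for DAY_NUMBERS

tokens = ['show', 'flights', 'on', 'september', 'twenty', 'first']
dates = []
for month, tens, digit in ngrams(tokens, 3):
    day = ' '.join([tens, digit])
    if month in MONTHS and day in DAYS:
        dates.append(datetime(1993, MONTHS[month], DAYS[day]))
print(dates)
# [datetime.datetime(1993, 9, 21, 0, 0)]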
Example 79
Project: tensorflow-nlp-examples   Author: Hironsan   File: preprocessing.py    MIT License 4 votes vote down vote up
def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.
            y : iterable, label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        mentions = []
        mentions_char = []
        left_contexts = []
        right_contexts = []
        outputs = []

        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
        ngram_indices = []
        # slice the current sentence (not the whole batch of documents) when building spans
        for sent, sent_char_ids in zip(word_ids, char_ids):
            ngrams = self.generate_ngrams(sent, n=4)
            ngram_indices.append(ngrams)
            for l, r in ngrams:
                mentions.append(sent[l:r])
                mentions_char.append(sent_char_ids[l:r])
                left_contexts.append(sent[:l])
                right_contexts.append(sent[r:])

        if y is not None:
            for ngram, labels in zip(ngram_indices, y):
                d = {(begin_offset, end_offset + 1): t for t, begin_offset, end_offset in get_entities(labels)}
                for l, r in ngram:
                    if (l, r) in d:
                        outputs.append(self._label_vocab[d[(l, r)]])
                    else:
                        outputs.append(self._label_vocab)

        outputs = np.array(outputs)
        inputs = [np.array(left_contexts), np.array(mentions), np.array(mentions_char), np.array(right_contexts)]

        if y is not None:
            return inputs, outputs
        else:
            return inputs 
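
generate_ngrams() is not shown in this example; purely as a hypothetical sketch, a span generator of this kind could enumerate all (start, end) index pairs for spans up to length n:

from typing import List, Tuple

def generate_span_indices(sent: List[int], n: int = 4) -> List[Tuple[int, int]]:
    # hypothetical helper, not the project's implementation
    spans = []
    for length in range(1, n + 1):
        for start in range(len(sent) - length + 1):
            spans.append((start, start + length))
    return spans

print(generate_span_indices([11, 12, 13], n=2))
# [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3)]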
Example 80
Project: dl-with-constraints   Author: dair-iitd   File: atis_tables.py    MIT License 4 votes vote down vote up
def get_date_from_utterance(tokenized_utterance: List[Token],
                            year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """

    dates = []

    utterance = ' '.join([token.text for token in tokenized_utterance])
    year_result = re.findall(r'199[0-4]', utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print('invalid month day')
    return dates