Python nltk.ngrams() Examples

The following code examples show how to use nltk.ngrams(). They are drawn from open source Python projects.
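
Before the project examples, here is a minimal usage sketch of the function itself: nltk.ngrams(sequence, n) lazily yields tuples of n consecutive items, so callers usually wrap it in list() or iterate over it.

import nltk

tokens = "the quick brown fox".split()
print(list(nltk.ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]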

Example 1
Project: SARC   Author: NLPrinceton   File: eval.py    MIT License 6 votes
def parse():
  parser = argparse.ArgumentParser()
  parser.add_argument('dataset', help='pol or main', type=str)
  parser.add_argument('-n', '--n', default=1, help='Number of grams', type=int)
  parser.add_argument('--min_count', default=1, help='Min count', type=int)
  parser.add_argument('--embedding', default=CCGLOVE,
                      help='embedding file', type=str)
  parser.add_argument('--weights', default=None,
                      help='weights to use for ngrams (e.g. sif, None)', type=str)
  parser.add_argument('-norm', '--normalize', action='store_true',
                      help='Normalize vectors')
  parser.add_argument('-l', '--lower', action='store_true',
                      help='Whether or not to lowercase text')
  parser.add_argument('-e', '--embed', action='store_true',
                      help='Use embeddings instead of bong')
  return parser.parse_args() 
Example 2
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 6 votes
def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']
    N: Length of the n-grams e.g. 1, 2
    
    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'), 
    ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        sent = re.sub('[-](,?\s)','\\1', sent) #case where magister- has to be handled
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for i, ngram in enumerate(ngram_items):
            ngram_str = ' '.join(ngram)
            ngrams_list.append(ngram_str)
    return ngrams_list 
Example 3
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 6 votes
def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences
             ['New York is a city.', 'It has a huge population.']
    
    return: a list of noun phrases, events, named_entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'), 
    ('it', 'has'), ('has','a'), ('a', 'huge'), ('huge', 'population') , ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        if nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        if nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        if nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list 
Example 4
Project: ijcai2019-relis   Author: UKPLab   File: ner_rewarder.py    MIT License 6 votes
def js_reward(self, dataset, topic, docs, summaries):
        ner = parse_ner_chunk_distribution(dataset, topic)
        ner_dist = [ner[key] for key in ner.keys()]
        self.sentences = []
        for doc in docs:
            self.sentences.extend(doc[1])

        rewards = []
        longest_chunk = max([len(chunk) for chunk in ner.keys()])

        for summary in summaries:
            text = ''
            for i in summary:
                text += self.sentences[i]
            token_summary = sent2tokens(text, LANGUAGE)
            sum_ner = {word:0 for word in ner.keys()}
            for length in range(1, longest_chunk+1):
                ngram_summary = ngrams(token_summary, length)
                for token in ngram_summary:
                    if token in ner:
                        sum_ner[token] = sum_ner[token]+1
            js = jsd(ner_dist, [sum_ner[key] for key in ner.keys()])
            rewards.append(js)
        return rewards 
Example 5
Project: ALaCarte   Author: NLPrinceton   File: compute.py    MIT License 6 votes
def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
  '''sliding window around n-grams in a document
  Args:
    strdoc: list of tokens (as strings)
    intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
    vocabulary: n-gram vocabulary (set of n-grams or dict with n-grams as keys)
    n: n in n-gram
    wndo2: half the window size
    unkgram: map n-grams not in vocabulary to this n-gram; if None does not yield such n-grams
  Returns:
    (n-gram, int generator) generator over (n-gram, context window pairs)
  '''

  wndo2pn = wndo2+n
  unk = not unkgram is None
  for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
    if ngram in vocabulary:
      yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
    elif unk:
      yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn]) 
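
A minimal usage sketch for ngram_context above (toy document and vocabulary; the function is assumed importable from its module):

strdoc = "the quick brown fox jumps over the lazy dog".split()
word2idx = {w: i for i, w in enumerate(sorted(set(strdoc)))}   # toy index mapping
intdoc = [word2idx[w] for w in strdoc]
vocabulary = {("quick", "brown"), ("lazy", "dog")}

for ngram, window in ngram_context(strdoc, intdoc, vocabulary, n=2, wndo2=2):
    print(ngram, list(window))    # each window is a generator of token indices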
Example 6
Project: ALaCarte   Author: NLPrinceton   File: ngram.py    MIT License 6 votes
def alabong(A, word_embeddings, lists, coocs, counts):
  n = len(lists)
  def represent(documents):
    output = []
    docs = tokenize(doc.lower() for doc in documents)
    for k, kgramlist, kgramcooc, kgramcount in zip(range(1, n+1), lists, coocs, counts):
      kgrams = [list(nltk.ngrams(doc, k)) for doc in docs]
      vocab = {kgram for doc in kgrams for kgram in doc}
      where = np.array([i for i, kgram in enumerate(kgramlist) if kgram in vocab and kgramcount[i]])
      bong = docs2bofs(kgrams, vocabulary=kgramlist, format='csc')
      output.append(np.zeros((len(documents), word_embeddings.shape[1]), dtype=FLOAT))
      for offset in range(0, where.shape[0], MAXSLICE):
        indices = where[offset:offset+MAXSLICE]
        if k > 1:
          vecs = normalize(A.predict(kgramcooc[indices].dot(word_embeddings)/kgramcount[indices,None])) / k
        else:
          vecs = normalize(word_embeddings[indices])
        output[-1] += bong[:,indices].dot(vecs)
    return np.hstack(output)
  return represent, None, True 
Example 7
Project: lexpredict-contraxsuite   Author: LexPredict   File: ngrams.py    GNU Affero General Public License v3.0 6 votes
def get_word_skipgram_distribution(input_buffer, n=2, k=2, encoding="utf-8",
                                   tokenize_method=nltk.word_tokenize):
    """
    Get distribution of skipgrams with given n and k values from input_buffer.
    :param input_buffer:
    :param n:
    :param k:
    :param encoding:
    :param tokenize_method:
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    ngrams = nltk.ngrams(tokenize_method(input_buffer), n=n)
    return nltk.util.skipgrams(ngrams, n, k) 
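
A usage sketch, assuming the function above is importable. As written it returns a lazy generator of skipgrams over the word bigrams rather than a counted distribution, so a caller would typically wrap it in a Counter:

from collections import Counter

text = "the quick brown fox jumps over the lazy dog"
dist = Counter(get_word_skipgram_distribution(text, n=2, k=2))
for skipgram, count in dist.most_common(3):
    print(skipgram, count)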
Example 8
Project: PyOpenDial   Author: KAIST-AILab   File: postprocessing.py    MIT License 6 votes
def ngram_replaser(info, reply, n=3):
    if info is None:
        return reply

    org_reply = reply

    info = re.split(r' *[\?\.\!][\'"\)\]]* *', info.strip().lower())
    reply = re.split(r' *[\?\.\!][\'"\)\]]* *', reply.strip().lower())

    info = sum([list(ngrams(i.split(), n=n)) for i in info if i], [])
    reply = sum([list(ngrams(r.split(), n=n)) for r in reply if r], [])

    phrases = []

    for i in info:
        for r in reply:
            if i == r:
                phrases.append(' '.join(r))

    replasments = equal_phrases(phrases)

    for o, r in zip(phrases, replasments):
        org_reply = org_reply.replace(o, r)

    return org_reply 
Example 9
Project: infodens   Author: ahmad-taie   File: bag_of_ngrams_features.py    GNU General Public License v3.0 6 votes
def hashNgram(self, listOfSentences, n, numberOfFeatures, finNgram=None):
        hasher = FeatureHasher(n_features=numberOfFeatures)

        def sentToNgram(listOfSentences):
            for sent in listOfSentences:
                sentDic = {}
                sentNgrams = Counter(ngrams(sent, n))
                for ngramElement in sentNgrams:
                    if finNgram:
                        if ngramElement in finNgram:
                            sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                    else:
                        sentDic[str(ngramElement)] = sentNgrams[ngramElement]
                yield sentDic

        return hasher.transform(sentToNgram(listOfSentences)).tolil() 
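
For readers unfamiliar with the hashing trick used above, here is a small self-contained sketch of the same idea with scikit-learn's FeatureHasher and plain nltk.ngrams (the sentence data is made up):

from collections import Counter
from nltk import ngrams
from sklearn.feature_extraction import FeatureHasher

sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]
hasher = FeatureHasher(n_features=32)
dicts = ({str(g): c for g, c in Counter(ngrams(sent, 2)).items()} for sent in sentences)
X = hasher.transform(dicts).tolil()   # sparse (n_sentences x 32) matrix
print(X.shape)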
Example 10
Project: allennlp-semparse   Author: allenai   File: atis_world.py    Apache License 2.0 6 votes
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
Example 11
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    Apache License 2.0 6 votes
def get_time_range_start_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    late_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "late"
    }

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(" ".join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict 
Example 12
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    Apache License 2.0 6 votes
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict 
Example 13
Project: flambe   Author: asappresearch   File: word.py    MIT License 6 votes
def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """ Initialize the NGramsTokenizer

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            [description], by default 1
        exclude_stopwords: bool
            [description], by default False
        stop_words: Optional[List]
            [description], by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True) 
Example 14
Project: flambe   Author: asappresearch   File: word.py    MIT License 6 votes
def tokenize(self, example: str) -> List[str]:
        """Tokenize an input example.

        Parameters
        ----------
        example : str
            The input example, as a string.

        Returns
        -------
        List[str]
            The output word tokens, as a list of strings

        """
        if self.exclude_stopwords and self.stop_words:
            example = ' '.join([word for word in word_tokenize(example)
                                if word not in self.stop_words])

        if isinstance(self.ngrams, List):
            ret: List[str] = []
            for i in self.ngrams:
                ret.extend(self._tokenize(example, i))
            return ret
        else:
            return NGramsTokenizer._tokenize(example, self.ngrams) 
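
Putting Example 13, Example 14, and the static _tokenize helper shown later in Example 65 together, the expected behaviour is roughly the following (a hypothetical usage sketch, not taken from flambe's documentation):

tokenizer = NGramsTokenizer(ngrams=[1, 2])
print(tokenizer.tokenize("the cat sat"))
# ['the', 'cat', 'sat', 'the cat', 'cat sat']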
Example 15
Project: dl-with-constraints   Author: dair-iitd   File: atis_tables.py    MIT License 6 votes
def get_time_range_start_from_utterance(utterance: str, # pylint: disable=unused-argument
                                        tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    late_indices = {index for index, token in enumerate(tokenized_utterance)
                    if token.text == 'late'}

    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(' '.join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_start_linking_dict 
Example 16
Project: dl-with-constraints   Author: dair-iitd   File: atis_tables.py    MIT License 6 votes
def get_time_range_end_from_utterance(utterance: str, # pylint: disable=unused-argument
                                      tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    early_indices = {index for index, token in enumerate(tokenized_utterance)
                     if token.text == 'early'}

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(' '.join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict 
Example 17
Project: chequeabot   Author: chequeado   File: feature_extractors.py    MIT License 6 votes
def automatic_feature_extractor(spacy_tag, pos_ngrams=False):
    features = {}

    for tagged_word in spacy_tag:
        #pos, lemma, text, tag, dep ,is_punct, like_num, tense
        if tagged_word['is_punct'] and tagged_word['lemma'].encode('utf8') not in "%¿?":
            continue

        features[tagged_word['pos']] = True
        features[tagged_word['lemma']] = True
        features[tagged_word['dep']] = True
        features[tagged_word['tense']] = True

        if is_int(tagged_word['lemma']):
            number_of_digits = len(str(tagged_word['lemma'].encode('utf8')))
            features['%s_digits' %number_of_digits] = True

    if pos_ngrams:        
        ctags_chain = [e['pos'] for e in spacy_tag]
        ngs = ngrams(ctags_chain, 3)
        for ng in ngs:
            features[ng] = True
   
    return features 
Example 18
Project: AutomaticEssayGrading   Author: SahilC   File: Features.py    MIT License 5 votes
def lexical_diversity(self,sentence):
        sents = " ".join(nltk.word_tokenize(sentence))

        unigrams = [ grams for grams in ngrams(sents.split(), 1)]
        bigrams = [ grams for grams in ngrams(sents.split(), 2)]
        trigram = [ grams for grams in ngrams(sents.split(), 3)]

#        self.unigrams_count = len([(item[0], unigrams.count(item)) for item in sorted(set(unigrams))])
        self.bigrams_count = len([(item, bigrams.count(item)) for item in sorted(set(bigrams))])
#        self.trigrams_count = len([(item, trigram.count(item)) for item in sorted(set(trigram))]) 
Example 19
Project: AutomaticEssayGrading   Author: SahilC   File: Features.py    MIT License 5 votes
def lexical_diversity(self,sentence):
        sents = " ".join(nltk.word_tokenize(sentence))

        unigrams = [ grams for grams in ngrams(sents.split(), 1)]
        bigrams = [ grams for grams in ngrams(sents.split(), 2)]
        trigram = [ grams for grams in ngrams(sents.split(), 3)]

#        self.unigrams_count = len([(item[0], unigrams.count(item)) for item in sorted(set(unigrams))])
        self.bigrams_count = len([(item, bigrams.count(item)) for item in sorted(set(bigrams))])
#        self.trigrams_count = len([(item, trigram.count(item)) for item in sorted(set(trigram))]) 
Example 20
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 5 votes
def extract_ngrams(sentences, stoplist, stemmer, language, n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        n (int): the number of words for ngrams, defaults to 2
    """
    concepts = []
    for i, sentence in enumerate(sentences):

        # for each ngram of words
        tokens = sent2stokens_wostop(sentence, stoplist, language)
        for j in range(len(tokens)-(n-1)):

            # initialize ngram container
            ngram = []

            # for each token of the ngram
            for k in range(j, j+n):
                ngram.append(tokens[k].lower())

            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue

            # do not consider ngrams composed of only stopwords
            #stops = [t for t in ngram if t in stoplist]
            #if len(stops) == len(ngram):
                #continue

            # stem the ngram
            ngram = [stemmer.stem(t) for t in ngram]

            # add the ngram to the concepts
            concepts.append(' '.join(ngram))
    return concepts 
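
The inner sliding-window loop above is equivalent to iterating over nltk.ngrams directly. A hedged sketch of the same extraction, assuming sent2stokens_wostop returns a plain token list as in the other helpers of this file:

import re
from nltk import ngrams

def extract_ngrams_nltk(sentences, stoplist, stemmer, language, n=2):
    """Equivalent of extract_ngrams above, using nltk.ngrams for the sliding window."""
    concepts = []
    for sentence in sentences:
        # sent2stokens_wostop is the stopword-removing tokenizer used elsewhere in this file (assumed)
        tokens = [t.lower() for t in sent2stokens_wostop(sentence, stoplist, language)]
        for ngram in ngrams(tokens, n):
            if any(not re.search('[a-zA-Z0-9]', t) for t in ngram):
                continue  # drop ngrams containing punctuation-only tokens
            concepts.append(' '.join(stemmer.stem(t) for t in ngram))
    return concepts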
Example 21
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 5 votes
def prune_ngrams(ngrams, stoplist, N=2):
    pruned_list = []
    for ngram in ngrams:
        items = ngram.split(' ')
        i = 0
        for item in items:
            if item in stoplist: i += 1
        if i < N:
            pruned_list.append(ngram)
    return pruned_list 
Example 22
Project: ijcai2019-relis   Author: UKPLab   File: ner_rewarder.py    MIT License 5 votes
def grouped_reward(self, dataset, topic, docs, summaries):
        ner, tags = parse_ner_chunk_grouped(dataset, topic)
        self.sentences = []
        for doc in docs:
            self.sentences.extend(doc[1])

        rewards = []
        longest_chunk_per_tag = [max([0]+[len(chunk) for chunk in ner[group].keys()]) for group in tags]
        for summary in summaries:
            text = ''
            for i in summary:
                text += self.sentences[i]
            token_summary = sent2tokens(text, LANGUAGE)
            reward = []
            for i, tag in enumerate(tags):
                sum_ner = {word:0 for word in ner[tag].keys()}
                for length in range(1, longest_chunk_per_tag[i]+1):
                    ngram_summary = ngrams(token_summary, length)
                    for token in ngram_summary:
                        if token in ner[tag]:
                            sum_ner[token] = sum_ner[token]+1
                if len(sum_ner) == 0:
                    tfidf = 0
                else:
                    tfidf = sum([sum_ner[entity] / float(len(token_summary)) * math.log(1/float(ner[tag][entity]))
                             for entity in sum_ner])/float(len(sum_ner))
                reward.append(tfidf)
            rewards.append(reward)
        return rewards, tags 
Example 23
Project: ijcai2019-relis   Author: UKPLab   File: redundancy_rewarder.py    MIT License 5 votes
def __call__(self,summary_list,ns):
        summ_list = []
        for sum_idxs in summary_list:
            summary = []
            for idx in sum_idxs:
                summary.append(self.sentences[idx])
            summ_list.append(' '.join(summary))
        rewards = []
        for summ in summ_list:
            reward = []
            for n in ns:
                summ_ngram = list(ngrams(sent2tokens(summ, LANGUAGE), n))
                reward.append(len(set(summ_ngram))/float(len(summ_ngram)))
            rewards.append(reward)
        return rewards 
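
The per-summary score above is the distinct-n metric (unique n-grams divided by total n-grams); a tiny standalone illustration:

from nltk import ngrams

tokens = "the cat sat on the cat".split()
bigrams = list(ngrams(tokens, 2))
print(len(set(bigrams)) / float(len(bigrams)))   # 0.8: 4 distinct bigrams out of 5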
Example 24
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    MIT License 5 votes
def n_gram_analysis_simple(infile, gram, stop):
    ngram = dict()
    f = open(infile, "r")
    #f2 = codecs.open(outfile, "w+", "utf-8")
    for l in f:
        x = nltk.ngrams(l.split(), gram)
        for w in x:
            # if stop:
            #     if w not in stops:
            #         if w in ngram:
            #             ngram[w] += 1
            #         else:
            #             ngram[w] = 1
            if w in ngram:
                ngram[w] += 1
            else:
                ngram[w] = 1
    p = list(ngram.items())
    p.sort(key=lambda x: -x[1])
    print len(p)
    for x in p[:10]:
        sen = ' '.join(x[0])
        cnt = int(x[1])
        if cnt == 0:
            cnt = 1
        print sen, cnt 
Example 25
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 26
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 27
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def transform(self, X: dt.Frame):
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
                text2 = text2_arr[ind]
                text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
                output.append(len(text1.intersection(text2)))
            except:
                output.append(-1)
        return np.array(output) 
Example 28
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 29
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 30
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def __init__(self, ngrams, **kwargs):
        super().__init__(**kwargs)
        self.ngrams = ngrams 
Example 31
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def get_parameter_choices():
        return {"ngrams": [1, 2, 3]} 
Example 32
Project: driverlessai-recipes   Author: h2oai   File: text_similarity_transformers.py    Apache License 2.0 5 votes
def transform(self, X: dt.Frame):
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for ind, text1 in enumerate(text1_arr):
            try:
                text1 = set(nltk.ngrams(str(text1).lower().split(), self.ngrams))
                text2 = text2_arr[ind]
                text2 = set(nltk.ngrams(str(text2).lower().split(), self.ngrams))
                output.append((2 * len(text1.intersection(text2))) / (len(text1) + len(text2)))
            except:
                output.append(-1)
        return np.array(output) 
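
The transformer above computes the Sørensen–Dice coefficient between the two texts' n-gram sets; the core computation in isolation (toy strings, plain Python):

import nltk

a = set(nltk.ngrams("the quick brown fox".lower().split(), 2))
b = set(nltk.ngrams("the quick red fox".lower().split(), 2))
dice = (2 * len(a.intersection(b))) / (len(a) + len(b))
print(dice)   # 0.333...: 1 shared bigram, 3 bigrams in each set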
Example 33
Project: ewe_ebooks   Author: jaymcgrath   File: bookstopher.py    MIT License 5 votes
def __init__(self, body, author='Anonymous'):

        # accumulators
        hashtags = []

        # Now process cleaned up text with NLTK
        words = []
        bigrams = []
        trigrams = []
        quadgrams = []
        sentences = []


        words = word_tokenize(body)

        sentences.extend(sent_tokenize(body))

        # Strip whitespace from each sentence
        sentences = [sentence.strip() for sentence in sentences]

        bigrams = ngrams(body, 2)
        trigrams = ngrams(body, 3)
        quadgrams = ngrams(body, 4)

        self.body = body
        self.words = words
        self.bigrams = bigrams
        self.trigrams = trigrams
        self.quadgrams = quadgrams
        self.sentences = sentences
        self.hashtags = hashtags
        self.author = author

        #TODO: Create "hashtags" from arbitrary number of rarest words 
Example 34
Project: screaming-frog-shingling   Author: jroakes   File: sf_shingling.py    MIT License 5 votes
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value) 
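
The minhash signature built above is typically used to estimate Jaccard similarity between two texts by comparing signatures position by position. A sketch of that comparison, assuming two instances of the class above (the class name ShingledText is an assumption):

doc_a = ShingledText("the quick brown fox jumps over the lazy dog and then runs far away")
doc_b = ShingledText("the quick brown fox jumps over the lazy cat and then runs far away")
matches = sum(1 for x, y in zip(doc_a.minhash, doc_b.minhash) if x == y)
print(matches / float(len(doc_a.minhash)))   # approximates the Jaccard similarity of the two shingle sets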
Example 35
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 5 votes
def log_prob(self, words, N):
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in nltk.ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob 
Example 36
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 5 votes
def log_prob(self, words, N):
        assert N in self.counts, "You do not have counts for {}-grams".format(N)

        if N > len(words):
            err = "Not enough words for a gram-size of {}: {}".format(N, len(words))
            raise ValueError(err)

        total_prob = 0
        for ngram in nltk.ngrams(words, N):
            total_prob += self._log_ngram_prob(ngram)
        return total_prob 
Example 37
Project: textkit   Author: learntextvis   File: ngrams.py    MIT License 5 votes
def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''

    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep)) 
Example 38
Project: textkit   Author: learntextvis   File: ngrams.py    MIT License 5 votes
def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    try:
        tokens = nltk.word_tokenize(content)
        ngrams = list(nltk.ngrams(tokens, num))
        write_csv(ngrams, str(sep))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err) 
Example 39
Project: ngrambot   Author: jmcgover   File: ngram.py    GNU General Public License v3.0 5 votes
def build_ngrams(tokens, low, high):
    LOGGER.debug("Building ngrams from %d to %d" % (low, high))
    assert low <= high
    assert low > 0
    grams = {}
    for n in range(low, high + 1):
        grams[n] = [g for g in ngrams(tokens, n)]
    return grams 
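
A quick usage sketch, assuming build_ngrams above is importable (LOGGER lives in the same module):

tokens = "the cat sat on the mat".split()
grams = build_ngrams(tokens, 1, 3)
print(len(grams[1]), len(grams[2]), len(grams[3]))   # 6 unigrams, 5 bigrams, 4 trigrams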
Example 40
Project: ngrambot   Author: jmcgover   File: ngram.py    GNU General Public License v3.0 5 votes
def build_pos_ngrams(tagged, low, high):
    LOGGER.debug("Building POS ngrams from %d to %d" % (low, high))
    assert low <= high
    assert low > 0
    pos_tokens = []
    pos_words = defaultdict(list)
    for word, pos in tagged:
        pos_tokens.append(pos)
        pos_words[pos].append(word)
    grams = {}
    for n in range(low, high + 1):
        grams[n] = [g for g in ngrams(pos_tokens, n)]
    return grams, pos_words 
Example 41
Project: ALaCarte   Author: NLPrinceton   File: alacarte.py    MIT License 5 votes
def read_ngrams(self, tokens):
    '''reads tokens and updates context vectors
    Args:
      tokens: list of strings
    Returns:
      None
    '''

    import nltk

    # gets location of target n-grams in document
    target_vocab = self.target_vocab
    max_n = self.max_n
    ngrams = dict()
    for n in range(1, max_n + 1):
      ngrams[n] = list(filter(lambda entry: entry[1] in target_vocab, enumerate(nltk.ngrams(tokens, n))))

    for n in range(1, max_n + 1):
      if ngrams[n]:

        # gets word embedding for each token
        w2v = self.w2v
        zero_vector = self.zero_vector
        wnd = self.wnd
        start = max(0, ngrams[n][0][0] - wnd)
        vectors = [None] * start + [w2v.get(token, zero_vector) if token else zero_vector for token in
                                    tokens[start:ngrams[n][-1][0] + n + wnd]]
        c2v = self.c2v
        target_counts = self.target_counts

        # computes context vector around each target n-gram
        for i, ngram in ngrams[n]:
          c2v[ngram] += sum(vectors[max(0, i - wnd):i], zero_vector) + sum(vectors[i + n:i + n + wnd],
                                                                           zero_vector)
          target_counts[ngram] += 1 
Example 42
Project: ALaCarte   Author: NLPrinceton   File: cooc.py    MIT License 5 votes
def ngram_vocab(n):
  ngrams = lambda docs: {ngram for doc in tokenize(doc.lower() for doc in docs) for ngram in nltk.ngrams(doc, n)}
  return sorted(set.union(*(ngrams(sst_fine(partition)[0]) for partition in ['train', 'test'])))
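  # NOTE: the return above makes the remaining lines of this function unreachable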
  vocabulary = set.union(*(ngrams(task()[0]) for task in TASKMAP['cross-validation'].values()))
  for task in TASKMAP['train-test split'].values():
    for partition in ['train', 'test']:
      try:
        vocabulary = vocabulary.union(ngrams(task(partition)[0]))
      except FileNotFoundError:
        pass
  return sorted(vocabulary) 
Example 43
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 5 votes
def build_vocabulary(self,
                         get_textset,
                         chunk_size: int,
                         total: int) -> List[str]:
        """

        :param get_textset: query that returns text data - strings, possibly multiline
        :param chunk_size: chunk size - count of string objects to read at once
        :param total: total records to read
        :return: sorted list of unique ngrams
        """
        term_by_doc = {}  # type:Dict[str, int]
        start = 0
        while start < total:
            end = start + chunk_size
            end = min(end, total)
            texts_set = get_textset(start, end)  # type: List[str]
            ngrams = self.get_ngrams(texts_set)
            for ngram in ngrams:
                if ngram in term_by_doc:
                    term_by_doc[ngram] = term_by_doc[ngram] + 1
                else:
                    term_by_doc[ngram] = 1
            start = end
            self.task.push()

        # filter by min_df / max_df
        un_count = self.units_count if self.search_similar_text_units else self.documents_count
        up_margin = math.floor(self.max_df * un_count)
        lw_margin = self.min_df if type(self.min_df) is int else math.ceil(self.min_df * un_count)
        key_list = []  # type: List[str]
        for key in term_by_doc:
            count = term_by_doc[key]
            if count < lw_margin or count > up_margin:
                continue
            key_list.append(key)
        key_list.sort()
        return key_list 
Example 44
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 5 votes
def build_matrices(self,
                       vocabulary: List[str],
                       get_textset,
                       chunk_size: int,
                       total: int) -> List[csr_matrix]:
        """
        Calculate terms (ngrams) - document matrices for all documents or
        text units, reading their text from DB by chunks
        :param vocabulary: list of unique terms, sorted
        :param get_textset: query that returns text data - strings, possibly multiline
        :param chunk_size: chunk size - count of string objects to read at once
        :param total: total records to read
        :return: term distribution matrices
        """
        ngram_range = (1, 3,) if self.term_type == self.TERM_TYPE_WORD_3GRAM \
            else (1, 1,) if self.term_type == self.TERM_TYPE_WORDS \
            else (self.char_ngrams_length, self.char_ngrams_length,)
        analyzer = 'char' if self.term_type == self.TERM_TYPE_CHAR_NGRAM else 'word'
        model = TfidfVectorizer(
            ngram_range=ngram_range,
            analyzer=analyzer,
            stop_words='english',
            vocabulary=vocabulary,
            use_idf=self.use_idf)

        dtm_chunked = []
        start = 0
        while start < total:
            end = start + chunk_size
            end = min(end, total)
            texts_set = get_textset(start, end)  # type:List[str]
            dtm_chunked.append(model.fit_transform(texts_set))
            start = end
            self.task.push()

        return dtm_chunked 
Example 45
Project: lexpredict-contraxsuite   Author: LexPredict   File: ngrams.py    GNU Affero General Public License v3.0 5 votes
def get_character_ngram_distribution(input_buffer, n=1, encoding="utf-8"):
    """
    Get distribution of character ngrams from input_buffer.
    :param input_buffer: input buffer
    :param n: n value, number of consecutive items
    :param encoding: default encoding
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)

    # Convert to character ngrams
    ngrams = list(nltk.ngrams(input_buffer, n=n))
    return dict([(g, ngrams.count(g)) for g in set(ngrams)]) 
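
A usage sketch, assuming the function above is importable; with n=2 the distribution is over character bigrams:

print(get_character_ngram_distribution("banana", n=2))
# {('b', 'a'): 1, ('a', 'n'): 2, ('n', 'a'): 2}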
Example 46
Project: LobbyTrack   Author: regardscitoyens   File: lobbyTrack.py    GNU Affero General Public License v3.0 5 votes
def getNGrams(raw_string, gram_nb):
    xgrams = ngrams(raw_string.split(), gram_nb)
    return xgrams 
Example 47
Project: duorc   Author: duorc   File: question_parser_lucene2.py    MIT License 5 votes
def get_continuous_chunks(text, all_possible_ngrams, stop, split_into_commas=False):
    text = text.lower()
    text = unicodedata.normalize('NFKD', unicode(text, "utf-8")).encode('ascii', 'ignore')
    toks = nltk.word_tokenize(text)
    return_chunks = []
    return_chunks_tokenized = []
    if split_into_commas and "," in text:
        for chunk in text.split(","):
            return_chunks.append(chunk)
            chunk_toks = nltk.word_tokenize(chunk)
            return_chunks_tokenized.append(chunk_toks)
    if all_possible_ngrams:
        ngrams = set([])
        ngrams.update([x for x in toks if x not in stop and not isint(x)])
        ngrams.update([' '.join(list(x)) for x in nltk.bigrams(toks) if len(set(x) - stop) > 0])
        for ngram_counter in range(3, 6):
            ngrams.update([' '.join(list(x)) for x in nltk.ngrams(toks, ngram_counter) if len(set(x) - stop) > 0])
        return_chunks = ngrams
        return_chunks_tokenized = [x.split(" ") for x in ngrams]
    else:
        postoks = nltk.pos_tag(toks)
        tree = chunker.parse(postoks)
        # print tree
        super_list = [w for w, t in tree.leaves()]

        for subtree in tree.subtrees():
            # print subtree
            if subtree == tree:
                continue
            chunk_list = [x[0].strip() for x in subtree.leaves()]
            chunk = ' '.join(chunk_list).strip()
            if len(chunk) <= 1:
                continue
            if chunk not in return_chunks:
                return_chunks.append(chunk)
                return_chunks_tokenized.append(chunk_list)
            # values.add(chunk)

    return return_chunks, return_chunks_tokenized, toks 
Example 48
Project: atap   Author: foxbook   File: ngrams.py    Apache License 2.0 5 votes
def ngrams2(text, n=2):
    for sent in sent_tokenize(text):
        sent = word_tokenize(sent)
        for ngram in nltk_ngrams(sent, n):
            yield ngram 
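
A usage sketch, assuming the imports used above (sent_tokenize, word_tokenize, and nltk_ngrams from NLTK):

for ngram in ngrams2("The cat sat. The dog barked.", n=2):
    print(ngram)
# ('The', 'cat'), ('cat', 'sat'), ('sat', '.'), ('The', 'dog'), ('dog', 'barked'), ('barked', '.')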
Example 49
Project: text-shingles   Author: steven-s   File: shingles.py    MIT License 5 votes
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
        split_text = text.split()
        if len(split_text) < shingle_length:
            raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))

        self.minhash = []
        self.shingles = ngrams(split_text, shingle_length)

        for hash_seed in generate_random_seeds(minhash_size, random_seed):
            min_value = float('inf')
            for shingle in ngrams(split_text, shingle_length):
                value = mmh3.hash(' '.join(shingle), hash_seed)
                min_value = min(min_value, value)
            self.minhash.append(min_value) 
Example 50
Project: cso-classifier   Author: angelosalatino   File: syntacticmodule.py    Apache License 2.0 5 votes
def get_ngrams(self, concept):
        """ Function that returns n-grams of concept in reverse order (3,2, and 1)
        """
        for n in range(3, 0, -1):
            pos = 0
            for ng in ngrams(word_tokenize(concept, preserve_line=True), n):
                yield {"position": pos, "size": n, "ngram": ng}
                pos += 1 
Example 51
Project: semanticRetrievalMRS   Author: easonnie   File: document_analysis.py    MIT License 5 votes
def get_ngrams(terms, poss=None, n=1, included_tags=None, as_strings=True):
    """Returns a list of all ngrams from length 1 to n.
    """
    ngrams = [(s, e + 1)
              for s in range(len(terms))
              for e in range(s, min(s + n, len(terms)))]

    if poss is not None and included_tags is not None:  # We do filtering according to pos.
        # ngrampos = [(s, e + 1)
        #             for s in range(len(poss))
        #             for e in range(s, min(s + n, len(poss)))]

        filtered_ngram = []
        for (s, e) in ngrams:
            if any([poss[i] in included_tags for i in range(s, e)]):
                filtered_ngram.append((s, e))

        ngrams = filtered_ngram

    # Concatenate into strings
    if as_strings:
        ngrams = ['{}'.format(' '.join(terms[s:e])) for (s, e) in ngrams]

    return ngrams


# Open class words	Closed class words	Other
# ADJ	            ADP	                PUNCT
# ADV	            AUX	                SYM
# INTJ	            CCONJ	            X
# NOUN	            DET
# PROPN	            NUM
# VERB	            PART
#                   PRON
#                   SCONJ 
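
For instance, with n=2 the get_ngrams helper above returns all unigrams and bigrams as strings (a small illustrative call):

print(get_ngrams("the cat sat".split(), n=2))
# ['the', 'the cat', 'cat', 'cat sat', 'sat']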
Example 52
Project: DiPS   Author: malllabiisc   File: submodular_funcs.py    Apache License 2.0 5 votes
def ngram_toks(sents, n=1):
    ntoks =[]
    for sent in sents:
        ntok = list(ngrams(sent.split(), n))
        newtoks = [tok for tok in ntok]
        ntoks+= newtoks
    return ntoks 
Example 53
Project: DiPS   Author: malllabiisc   File: distinct_metric.py    Apache License 2.0 5 votes
def ngram_toks(sents, n=1):
    ntoks =[]
    for sent in sents:
        ntok = list(ngrams(sent.split(), n))
        newtoks = [tok for tok in ntok]
        ntoks+= newtoks
    return ntoks 
Example 54
Project: infodens   Author: ahmad-taie   File: bag_of_ngrams_features.py    GNU General Public License v3.0 5 votes
def ngramArgumentCheck(self, args, ngramType):

        proc_train = ""
        proc_test = ""
        parser = argparse.ArgumentParser(description='Bag of ngrams args')
        parser.add_argument("-train", help="Path for file to build ngram vector from.",
                            type=str, default="")
        if ngramType != "plain":
            parser.add_argument("-proc_train", help="Path for POS/Lemma tagged train sentences.",
                                type=str, default="")
            parser.add_argument("-proc_test", help="Path for POS/lemma tagged test sentences.",
                                type=str, default="")
        parser.add_argument("-ngram", help="Order of ngram.",
                            type=int, default=1)
        parser.add_argument("-cutoff", help="Min. Cutoff for ngram.",
                            type=int, default=1)
        parser.add_argument("-hash_size", help="Size of output vector from hashing.",
                            type=int, default=None)  # Default is no hashing

        argsOut = parser.parse_args(args.split())
        if ngramType != "plain":
            proc_train = argsOut.proc_train
            proc_test = argsOut.proc_test

        return argsOut.ngram, argsOut.cutoff, argsOut.hash_size,\
               argsOut.train, proc_train, proc_test 
Example 55
Project: infodens   Author: ahmad-taie   File: bag_of_ngrams_features.py    GNU General Public License v3.0 5 votes
def extractNgram(self, listOfSentences, n, numberOfFeatures, finNgram):

        ngramFeatures = sparse.lil_matrix((len(listOfSentences), numberOfFeatures))
        for i in range(len(listOfSentences)):
            ngramsVocab = Counter(ngrams(listOfSentences[i], n))
            lenSent = len(ngramsVocab)

            for ngramEntry in ngramsVocab:
                ## Keys
                ngramIndex = finNgram.get(ngramEntry, -1)
                if ngramIndex >= 0:
                    ngramFeatures[i, ngramIndex] = round((float(ngramsVocab[ngramEntry]) / lenSent), 2)

        return ngramFeatures 
Example 56
Project: infodens   Author: ahmad-taie   File: surprisal_features.py    GNU General Public License v3.0 5 votes
def ngramArgCheck(self, argString):

        parser = argparse.ArgumentParser(description='Ngram quantile surprisal args')
        parser.add_argument("-ngram", help="Order of ngrams.",
                            type=int, default=1)
        parser.add_argument("-cutoff", help="cuttoff frequency",
                            type=int, default=1)
        parser.add_argument("-n_quantiles", help="number of quantiles",
                            type=int, default=4)
        argsOut = parser.parse_args(argString.split())

        return argsOut.ngram, argsOut.cutoff, argsOut.n_quantiles 
Example 57
Project: infodens   Author: ahmad-taie   File: surprisal_features.py    GNU General Public License v3.0 5 votes
def getQuantiles(self, listOfSentences, n, quantile, finNgram):

        ngramFeatures = sparse.lil_matrix((len(listOfSentences), quantile + 1))

        for i in range(len(listOfSentences)):
            ngramsVocab = Counter(ngrams(listOfSentences[i], n))
            lenSent = 0
            for ngramEntry in ngramsVocab:
                ## Keys
                ngramIndex = finNgram.get(ngramEntry, -1)
                if ngramIndex >= 0:
                    ngramIndex -= 1
                    toAdd = ngramsVocab[ngramEntry]
                    ngramFeatures[i, ngramIndex] += toAdd
                    lenSent += toAdd
                else:
                    # OOV ngram (below cut-off)
                    toAdd = ngramsVocab[ngramEntry]
                    ngramFeatures[i, -1] += toAdd
                    lenSent += toAdd

            if lenSent:
                for j in range(0, quantile+1):
                    ngramFeatures[i, j] /= lenSent

        return ngramFeatures 
Example 58
Project: infodens   Author: ahmad-taie   File: preprocess_services.py    GNU General Public License v3.0 5 votes
def buildNgrams(self, n, freq, tokens, indexing=True):
        """Build and return ngrams from given tokens."""
        ngramsDict = defaultdict(int)
        for sent in tokens:
            ngramsList = list(nltk.ngrams(sent, n))
            for anNgram in ngramsList:
                ngramsDict[anNgram] += 1

        return self.ngramMinFreq(ngramsDict, freq, indexing) 
Example 59
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_baseline_features.py    MIT License 5 votes
def get_ngram_list(tknzr, text, n):
    tokens = tknzr.tokenize(text)
    tokens = [t for t in tokens if not t.startswith('#')]
    tokens = [t for t in tokens if not t.startswith('@')]
    ngram_list = [gram for gram in ngrams(tokens, n)]
    return ngram_list 
Example 60
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_ml_features.py    MIT License 5 votes
def get_ngrams(tokens, n, syntactic_data=False):
    if len(n) < 1:
        return {}
    if not syntactic_data:
        filtered = []
        stopwords = data_proc.get_stopwords_list()
        for t in tokens:
            if t not in stopwords and t.isalnum():
                filtered.append(t)
        tokens = filtered
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = str(i) + '-gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 16 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features) 
Example 61
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_statistical_features.py    MIT License 5 votes
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    if len(n) < 1:
        return {}
    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [t.lower() for t in tokens if not t.startswith('@') and not t.startswith('#')
                      and t not in string.punctuation]
    ngram_tokens = []
    for i in n:
        for gram in ngrams(tokens, i):
            string_token = 'gram '
            for j in range(i):
                string_token += gram[j] + ' '
            ngram_tokens.append(string_token)
    ngram_features = {i: ngram_tokens.count(i) for i in set(ngram_tokens)}
    return ngram_features


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features) 
Example 62
Project: tensorflow-nlp-examples   Author: Hironsan   File: preprocessing.py    MIT License 5 votes
def generate_ngrams(self, words, n=7):
        res = []
        seqlen = len(words)
        for i in range(1, n + 1):
            for ngram in ngrams(range(seqlen), i):
                l, r = ngram[0], ngram[-1] + 1
                res.append((l, r))
        return res 
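
The method enumerates every candidate token span up to length n; for a three-token input and n=2 the spans are (a hypothetical instance name is used):

spans = preprocessor.generate_ngrams(["New", "York", "City"], n=2)
print(spans)   # [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3)]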
Example 63
Project: ambientsearch   Author: bmilde   File: keyword_extract.py    Apache License 2.0 5 votes
def getKeywordsDruid(self, tokens):

        keywords = defaultdict(int)
        keywords_pos = defaultdict(list)

        if len(self.keyword_dict) == 0:
            print 'Warning, no Druid cache found. Wont be able to detect keywords.'
            return []

        # Automatically tokenize strings if necessary
        if type(tokens) is str or type(tokens) is unicode:
            tokens = nltk.word_tokenize(tokens)

        #Unigram to fourgram
        for x in xrange(1,5):
            seq = nltk.ngrams(tokens, x)
            for i,gram in enumerate(seq):
                search_gram = u' '.join(gram).lower()
                #We score and rank keywords heuristically here and modify the druid score a bit: common words get penalized more, multiwords get a better score
                if len(search_gram) > 2 and search_gram in self.keyword_dict:
                    gram_factor = 1.0
                    if search_gram in self.common_words or search_gram[:-1] in self.common_words:
                        penality_factor = 0.1
                    else:
                        penality_factor = 1.0
                    keywords[search_gram] += self.keyword_dict[search_gram]*(x*gram_factor)*penality_factor

                    for pos in xrange(i,x+i):
                        keywords_pos[pos] += [search_gram]

        #Print keywords_pos
        keywords = self.mergeKeywords(keywords, keywords_pos)
        keywords_sorted = sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
        # Normalize scores to be in the range of 0.0 - 1.0
        keywords_sorted_normalized = [(item[0],self.normalize_keywordscore(item[1])) for item in keywords_sorted]
        print 'keywords_sorted_normalized:',keywords_sorted_normalized

        return keywords_sorted_normalized

    #Build a dictionary of DRUID keywords. Input is basically a filelist with multiwordness scores for 1-4 grams produced from the algorithm. Numbers and stopwords are filtered, the rest is taken as is. 
Example 64
Project: Transferable-E2E-ABSA   Author: hsqmlzno1   File: utils.py    MIT License 5 votes
def set_wid(dataset, vocab, win=1):
    """
    set wid field for the dataset
    :param dataset: dataset
    :param vocab: vocabulary
    :param win: context window size, for window-based input, should be an odd number
    :return: dataset with field wid
    """
    n_records = len(dataset)
    for i in range(n_records):
        words = dataset[i]['words']
        lm_labels = []
        # set labels for the auxiliary language modeling task
        for w in words:
            lm_labels.append(vocab[w])
        dataset[i]['lm_labels'] = list(lm_labels)
        n_padded_words = win // 2
        pad_left = ['PADDING' for _ in range(n_padded_words)]
        pad_right = ['PADDING' for _ in range(n_padded_words)]
        padded_words = pad_left + words + pad_right
        # the window-based input
        win_input = list(ngrams(padded_words, win))
        assert len(win_input) == len(words)
        n_grams = []
        for t in win_input:
            n_grams.append(t)
        wids = [[vocab[w] for w in ngram] for ngram in n_grams]
        dataset[i]['wids'] = list(wids)
    return dataset 
Example 65
Project: flambe   Author: asappresearch   File: word.py    MIT License 5 votes
def _tokenize(example: str, n: int) -> List[str]:
        """Tokenize an input example using ngrams.

        """
        return list(" ".join(x) if len(x) > 1 else x[0] for x in ngrams(word_tokenize(example), n)) 
Example 66
Project: dl-with-constraints   Author: dair-iitd   File: atis_world.py    MIT License 5 votes
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in ATIS_TRIGGER_DICT.get(' '.join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index,
                                                  index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == 'st':
            natural_language_key = f'st. {trigram[2]}'.lower()
        else:
            natural_language_key = ' '.join(trigram).lower()
        for string in ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index,
                                                  index + 1,
                                                  index + 2])
    return string_linking_scores 
Example 67
Project: algodb   Author: xkxx   File: stringmatching_ngrams.py    MIT License 5 votes
def string_match(stop_words, algo_names, corpus,  file_names):
    idx = 0
    maxN = max(len(name) for name in algo_names)

    detected = {}
    algo_total_frequency = {}
    for doc in corpus:
        tokens = re.split(r'\s+', doc)
        tokens = remove_stopwords(tokens, stop_words)

        algo_doc_frequency = {}
        for n in range(1, maxN+1):
            for ngram in ngrams(tokens, n):
                if ngram in algo_names:
                    name = " ".join(ngram)
                    if name not in algo_doc_frequency:
                        algo_doc_frequency[name] = 0
                    algo_doc_frequency[name] += 1
        
        for name, freq in algo_doc_frequency.items():
            if name not in algo_total_frequency:
                algo_total_frequency[name] = 0
            algo_total_frequency[name] += 1
        detected[file_names[idx]] = algo_doc_frequency
        idx += 1
    tf_idf(detected, algo_total_frequency)
    return detected 
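
A self-contained sketch of the matching loop above, assuming algo_names holds tuples of lowercase tokens (the toy data below is not the project's):

from collections import Counter
from nltk import ngrams

algo_names = {('quick', 'sort'), ('binary', 'search', 'tree')}  # toy name set
tokens = 'we used quick sort and a binary search tree here'.split()
max_n = max(len(name) for name in algo_names)

doc_frequency = Counter()
for n in range(1, max_n + 1):
    for gram in ngrams(tokens, n):
        if gram in algo_names:
            doc_frequency[' '.join(gram)] += 1
print(doc_frequency)
# Counter({'quick sort': 1, 'binary search tree': 1})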
Example 68
Project: eXposeDeepNeuralNetwork   Author: joshsaxe   File: features.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)
    # character 2-, 3-, 4- and 5-grams of the input string
    grams = list(ngrams(string, 2)) + list(ngrams(string, 3)) + list(ngrams(string, 4)) + list(ngrams(string, 5))
    SIZE = 1024
    vec = zeros((SIZE,))
    for t in grams:
        vec[hash(t)%SIZE]+=1
    return log(vec+1.0) 
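
A hedged sketch of the same feature hashing: character 2- to 5-grams are hashed into a fixed-size count vector and log-scaled (toy input string; note Python's hash() is salted per process, so bucket indices differ between runs):

from nltk import ngrams
from numpy import log, zeros

def char_ngram_features(string, size=1024):
    grams = []
    for n in range(2, 6):
        grams += list(ngrams(string, n))  # ngrams() over a str yields character tuples
    vec = zeros((size,))
    for g in grams:
        vec[hash(g) % size] += 1
    return log(vec + 1.0)

features = char_ngram_features('example.com/login')  # toy input
print(features.shape, int(features.sum() > 0))
# (1024,) 1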
Example 69
Project: deepbond   Author: mtreviso   File: error_analysis.py    MIT License 4 votes vote down vote up
def ngram_importance(self, n=[1, 2, 3], k=10):
        # top-k argmax over w1..wn of P(yn=1 | w1..wn)
        # P(y | w1..wn) = p(w1..wn | yn=1) * p(yn) / p(w1 .. wn)
        # ---
        # p(w1 .. wn) = f(w1 .. wn) / t(w1 .. wn)
        # p(yn) = f(yn) / t(y)
        # p(w1..wn | yn=1) = f(w1 .. wn ^ yn=1) / f(yn=1)
        remove_period = lambda v: [x for x in v if x != '.']
        text = remove_period(self.unrolled_text)

        y = 1
        fy_g = FreqDist(self.golds)
        fy_p = FreqDist(self.preds)

        for n_ in n:
            grams = list(ngrams(text, n_, pad_left=True))
            fwy_g = FreqDist(list(zip(grams, self.golds)))
            fwy_p = FreqDist(list(zip(grams, self.preds)))
            pwy_g = lambda w, y: fwy_g[(w, y)] / fy_g[y]
            pwy_p = lambda w, y: fwy_p[(w, y)] / fy_p[y]

            logger.info('Top %d %d-gram before period: ' % (k, n_))
            vg = [(w, pwy_g(w, y)) for w in set(grams) if pwy_g(w, y) > 0]
            vg = sorted(vg, reverse=True, key=lambda x: x[1])[:k]

            vp = [(w, pwy_p(w, y)) for w in set(grams) if pwy_p(w, y) > 0]
            vp = sorted(vp, reverse=True, key=lambda x: x[1])[:k]

            logger.info('%32s | %32s |' % ('Gold', 'Pred'))
            logger.info('-' * 33 + '+-' + '-' * 33 + '+')
            for i in range(min(len(vp), len(vg))):
                wg = ' '.join(vg[i][0])
                wp = ' '.join(vp[i][0])
                logger.info('%32s | %32s |' % (wg, wp))
            logger.info('-' * 33 + '+-' + '-' * 33 + '+')

        # index, values = [], []
        # for w, p in sorted(vp, reverse=True, key=lambda x: x[1])[:k]:
        # index.append()
        # values.append(p)
        # logger.debug(' '.join(w))
        # df = pd.Series(values, index=index)
        # df.plot(kind='bar', logy=True)
        # plt.xlabel('Word')
        # plt.ylabel('Probability')
        # plt.title('P(y | x1 ... xn)')
        # plt.show() 
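
The core estimate P(ngram | y=1) can be reproduced with FreqDist over zipped (ngram, label) pairs; a hedged sketch with a toy token/label sequence (illustrative only, not the project's data):

from nltk import FreqDist, ngrams

tokens = ['we', 'ate', 'then', 'we', 'left', 'then']
labels = [0, 0, 1, 0, 0, 1]  # 1 = a sentence boundary follows this token

grams = list(ngrams(tokens, 2, pad_left=True))  # one bigram per token position
fy = FreqDist(labels)
fwy = FreqDist(zip(grams, labels))
pwy = {g: fwy[(g, 1)] / fy[1] for g in set(grams) if fwy[(g, 1)] > 0}
print(sorted(pwy.items(), key=lambda x: -x[1]))
# e.g. [(('ate', 'then'), 0.5), (('left', 'then'), 0.5)]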
Example 70
Project: twirpbot   Author: cozpii   File: get_most_common_unigrams_in_tweets.py    GNU General Public License v3.0 4 votes vote down vote up
def get_most_common_unigrams_in_tweets(twitter, name, maxTweets):
    stopwords_english = stopwords.words('english')
    id_of_earliest_tweet = None
    count = 0
    list_of_tweets = []
    all_unigrams = []
    all_unigrams_without_noise = []
    all_stemmed_unigrams = []
    stemmer = PorterStemmer()
    new_statuses = []

    while count < maxTweets:
        if id_of_earliest_tweet is None:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', include_rts='false', exclude_replies='false')

        else:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', max_id=id_of_earliest_tweet - 1, include_rts='false', exclude_replies='false')

        new_statuses += statuses

        for tweet in statuses:
            count += 1
            list_of_tweets.append(tweet['id'])

        if(len(statuses)):
            id_of_earliest_tweet = sorted(list_of_tweets)[0]
        else:
            break

    for tweet in new_statuses:
        unigrams = word_tokenize(tweet['full_text'].lower())
        all_unigrams += unigrams

    for word in all_unigrams:
        stemmed_unigram = stemmer.stem(word)
        all_stemmed_unigrams.append(stemmed_unigram)

    for word in all_stemmed_unigrams:
        if word in stopwords_english or len(word) < 4 or 'http' in word or 'www' in word or '//' in word:
            continue
        else:
            all_unigrams_without_noise.append(word)

    all_unigrams_list = list(ngrams(all_unigrams_without_noise,1))
    frequencies = Counter(all_unigrams_list)
    for token, count in frequencies.most_common(10):
        # print(token, count)
        return token[0]  # returns only the single most common unigram 
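
Why the code indexes token[0]: ngrams(tokens, 1) yields 1-tuples, so the Counter keys are tuples, and the early return reports only the single most common unigram. A small illustrative check (toy tokens):

from collections import Counter
from nltk import ngrams

clean_tokens = ['python', 'code', 'python', 'tweet']
frequencies = Counter(ngrams(clean_tokens, 1))
print(frequencies.most_common(1))          # [(('python',), 2)]
print(frequencies.most_common(1)[0][0][0])  # 'python'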
Example 71
Project: twirpbot   Author: cozpii   File: get_most_common_unigrams_in_retweets.py    GNU General Public License v3.0 4 votes vote down vote up
def get_most_common_unigrams_in_retweets(twitter, name, maxTweets):
    stopwords_english = stopwords.words('english')
    id_of_earliest_tweet = None
    count = 0
    list_of_tweets = []
    all_unigrams = []
    all_unigrams_without_noise = []
    all_stemmed_unigrams = []
    stemmer = PorterStemmer()
    retweets = []

    while count < maxTweets:
        if id_of_earliest_tweet is None:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', include_rts='true', exclude_replies='true')

        else:
            statuses = twitter.get_user_timeline(screen_name=name, count=200, tweet_mode='extended', max_id=id_of_earliest_tweet - 1, include_rts='true', exclude_replies='true')

        for tweet in statuses:
            if tweet['full_text'].strip().split()[0] == 'RT':
                retweets.append(tweet['full_text'])
            count += 1
            list_of_tweets.append(tweet['id'])

        if(len(statuses)):
            id_of_earliest_tweet = sorted(list_of_tweets)[0]
        else:
            break

    for tweet in retweets:
        unigrams = word_tokenize(tweet.lower())
        all_unigrams += unigrams

    for word in all_unigrams:
        stemmed_unigram = stemmer.stem(word)
        all_stemmed_unigrams.append(stemmed_unigram)

    for word in all_stemmed_unigrams:
        if word in stopwords_english or len(word) < 4 or 'http' in word or 'www' in word or '//' in word:
            continue
        else:
            all_unigrams_without_noise.append(word)

    all_unigrams_list = list(ngrams(all_unigrams_without_noise, 1))
    frequencies = Counter(all_unigrams_list)
    for token, count in frequencies.most_common(10):
        # print(token, count)
        return token[0]  # returns only the single most common unigram 
Example 72
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 4 votes vote down vote up
def extract_ngrams_count(sentences, stemmer, language, stoplist, N=2):
    '''
    Extract n-grams and count how many times each n-gram appears.
    :param sentences: list of sentences, each sentence a string
    :param stemmer: stemmer used by sent2stokens to normalize tokens
    :param language: language passed to the tokenizer
    :param stoplist: stopwords; n-grams made up only of stopwords are skipped
    :param N: length of the n-grams, e.g. 1, 2
    :return: a dictionary mapping each n-gram string to its count

    example input : ['This is a foo bar sentence']
    example output: {'this is' : 1, 'is a' : 1, 'a foo' : 1, ...}
    '''
    #TODO: I am not sure whether we should remove all stopwords or not; maybe try both settings
    ngrams_count_dic= {}
    for i, sentence in enumerate(sentences):

        # for each ngram of words
        #sent = re.sub('[-](,?\s)','\\1', sentence) #case where magister- has to be handled
        #tokens = sent2stokens_wostop(sentence,stemmer,stoplist,language)
        tokens = sent2stokens(sentence,stemmer,language)
        for j in range(len(tokens)-(N-1)):
            # initialize ngram container
            ngram = []

            # for each token of the ngram
            for k in range(j, j+N):
                ngram.append(tokens[k].lower())

            # do not consider ngrams containing punctuation marks
            marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
            if len(marks) > 0:
                continue

            # do not consider ngrams composed of only stopwords
            stops = [t for t in ngram if t in stoplist]
            if len(stops) == len(ngram):
                continue

            # stem the ngram
            #ngram = [stemmer.stem(t) for t in ngram]
            ngram = ' '.join(ngram)
            #print('ngram: '+repr(ngram))

            # add check whether this n-gram has already been contained in the n-grams list
            if ngram in ngrams_count_dic:
                ngrams_count_dic[ngram] = ngrams_count_dic[ngram] + 1
            else:
                ngrams_count_dic[ngram] = 1
    return ngrams_count_dic 
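
A self-contained sketch of the same counting that skips the stemming and stopword handling (the simplified tokenization is an assumption, not the project's sent2stokens; requires the nltk 'punkt' data):

import re
from collections import Counter
from nltk import ngrams, word_tokenize

def count_ngrams(sentences, N=2):
    counts = Counter()
    for sentence in sentences:
        tokens = [t.lower() for t in word_tokenize(sentence)]
        for gram in ngrams(tokens, N):
            # skip ngrams that contain a punctuation-only token, as the example does
            if any(not re.search('[a-zA-Z0-9]', t) for t in gram):
                continue
            counts[' '.join(gram)] += 1
    return dict(counts)

print(count_ngrams(['This is a foo bar sentence.']))
# {'this is': 1, 'is a': 1, 'a foo': 1, 'foo bar': 1, 'bar sentence': 1}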
Example 73
Project: clickbait   Author: bhargaviparanjape   File: utility.py    MIT License 4 votes vote down vote up
def naive_bayes(analysis):	
	tags = []
	words = []
	deps_cc = []
	for sen in analysis["sentences"]:
		tags += sen['pos']
		words += sen['tokens']
		deps_cc += sen["deps_cc"]
	norm = normalize_title(tags, words)

	f1 = []	
	current = list(nltk.ngrams(norm.split(), 1)) + list(nltk.ngrams(norm.split(), 2)) + list(nltk.ngrams(norm.split(),3))
	ngram_list = [' '.join(list(g)) for g in current]
	for pos in common_grams:
		if pos in ngram_list:
			f1.append(1)
		else:
			f1.append(0)
	f1 = numpy.array(f1).reshape(1, len(f1))

	#pos ngrams
	f2 = []
	current_pos = list(nltk.ngrams(tags, 1)) + list(nltk.ngrams(tags, 2)) + list(nltk.ngrams(tags,3))
	ngram_list = [' '.join(list(g)) for g in current_pos]
	for pos in common_pos_grams:
		if pos in ngram_list:
			f2.append(1)
		else:
			f2.append(0)
	f2 = numpy.array(f2).reshape(1, len(f2))
	# print f2.shape


	# syntactic ngrams
	f3 = []
	current_sngrams = list(syntactic_n_gram(deps_cc, 1)) + list(syntactic_n_gram(deps_cc, 2)) + list(syntactic_n_gram(deps_cc, 3))
	ngram_list = [' '.join(list(g)) for g in current_sngrams]
	for pos in common_sn_grams:
		if pos in ngram_list:
			f3.append(1)
		else:
			f3.append(0)
	f3 = numpy.array(f3).reshape(1, len(f3))

	return [clf1.predict(f1)[0], clf2.predict(f2)[0], clf3.predict(f3)[0]] 
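
A hedged sketch of the binary ngram-presence features built above (common_grams here is a toy list, not the classifier's learned vocabulary):

import numpy
import nltk

common_grams = ['you', 'will not', 'you will not believe']  # toy vocabulary
title_tokens = 'you will not believe this'.split()

current = list(nltk.ngrams(title_tokens, 1)) + list(nltk.ngrams(title_tokens, 2)) + list(nltk.ngrams(title_tokens, 3))
ngram_list = [' '.join(g) for g in current]
f1 = numpy.array([1 if g in ngram_list else 0 for g in common_grams]).reshape(1, -1)
print(f1)  # [[1 1 0]]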
Example 74
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 4 votes vote down vote up
def train(self, corpus_fp, vocab=None, encoding=None):
        N = self.N
        H = self.hyperparameters
        models, counts = {}, {}
        grams = {n: [] for n in range(1, N + 1)}
        gg = {n: [] for n in range(1, N + 1)}
        filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

        n_words = 0
        tokens = set([])

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                words = tokenize_words(line, filter_punc, filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams
                for n in range(1, N + 1):
                    grams[n].append(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )

                    gg[n].extend(
                        list(
                            nltk.ngrams(
                                words,
                                n,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol="<bol>",
                                right_pad_symbol="<eol>",
                            )
                        )
                    )

        for n in range(1, N + 1):
            counts[n] = nltk.FreqDist(gg[n])
            models[n] = nltk.lm.MLE(order=n)
            models[n].fit(grams[n], tokens)

        self.counts = counts
        self.n_words = n_words
        self._models = models
        self.n_tokens = len(vocab) if vocab is not None else len(tokens) 
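
A quick illustration of the padded ngram streams that get counted and fed to the models (toy sentence, illustrative only):

import nltk

words = ['the', 'cat', 'sat']
padded_bigrams = list(
    nltk.ngrams(words, 2, pad_left=True, pad_right=True,
                left_pad_symbol="<bol>", right_pad_symbol="<eol>")
)
print(padded_bigrams)
# [('<bol>', 'the'), ('the', 'cat'), ('cat', 'sat'), ('sat', '<eol>')]
print(nltk.FreqDist(padded_bigrams)[('the', 'cat')])
# 1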
Example 75
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 4 votes vote down vote up
def train(self, corpus_fp, vocab=None, encoding=None):
        N = self.N
        H = self.hyperparameters
        models, counts = {}, {}
        grams = {n: [] for n in range(1, N + 1)}
        gg = {n: [] for n in range(1, N + 1)}
        filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

        n_words = 0
        tokens = set()

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                words = tokenize_words(line, filter_punc, filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams
                for n in range(1, N + 1):
                    grams[n].append(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )

                    gg[n].extend(
                        list(
                            nltk.ngrams(
                                words,
                                n,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol="<bol>",
                                right_pad_symbol="<eol>",
                            )
                        )
                    )

        for n in range(1, N + 1):
            counts[n] = nltk.FreqDist(gg[n])
            models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)
            models[n].fit(grams[n], tokens)

        self.counts = counts
        self._models = models
        self.n_words = n_words
        self.n_tokens = len(vocab) if vocab is not None else len(tokens) 
Example 76
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def __init__(self,
                 task: ExtendedTask,  # task object to log messages and report progress
                 should_delete: bool = True,  # delete existing DocumentSimilarity entries
                 project_id: Optional[int] = None,  # optional project filter
                 search_similar_documents: bool = True,  # we either search for similar documents...
                 search_similar_text_units: bool = False,  # ... or TextUnit-s
                 # min "correlation" to consider 2 documents (or TextUnit-s) similar
                 similarity_threshold: int = 75,
                 # should we use Inverse Document Frequency to obtain (sometimes) more precise results?
                 use_idf: bool = False,
                 # process text as character ngrams, just words or word ngrams
                 term_type: str = 'WORDS',
                 # character count in character ngrams (if term_type is 'CHAR_NGRAMS')
                 char_ngrams_length: int = 6,
                 ignore_case: bool = True):
        self.task = task
        self.project_id = project_id
        self.should_delete = should_delete
        self.similarity_threshold = similarity_threshold
        self.search_similar_documents = search_similar_documents
        self.search_similar_text_units = search_similar_text_units
        self.use_idf = use_idf
        self.term_type = term_type
        self.char_ngrams_length = char_ngrams_length
        self.ignore_case = ignore_case

        # min term frequency, integer value is for absolute occurrence count,
        # float value is for relative occurrence (per total entries count)
        # Used while building vocabulary
        self.min_df = 2
        # same as min_df, but for upper limit
        self.max_df = 0.5
        # documents count, used when search_similar_documents is True
        self.documents_count = 0
        # text units count, used when search_similar_text_units is True
        self.units_count = 0
        # buffer to accumulate storing DocumentSimilarity items for bulk insert operation
        self.docsim_store_buffer = []  # type:List[DocumentSimilarity]
        # buffer to accumulate storing TextUnitSimilarity items for bulk insert operation
        self.unsim_store_buffer = []  # type:List[TextUnitSimilarity]
        # flush buffer when it reaches the limit
        self.store_buf_flush_count = 1000
        # used for logging time spent for each stage of the task's calculations
        self.timings = []  # type:List[Tuple[str, datetime.datetime]]
        # used when search_similar_documents is True - all documents or
        # just the documents of the specified project
        self.doc_query = Document.objects.all() if not self.project_id \
            else Document.objects.filter(project_id=self.project_id)
        # used when search_similar_text_units is True - all text units or
        # just the text units of the specified project
        self.text_unit_query = None
        if search_similar_text_units:
            filters = dict(unit_type='paragraph', textunittext__text__regex=self.unit_text_regex)
            if self.project_id:
                filters['document__project_id'] = project_id
            self.text_unit_query = TextUnit.objects.filter(**filters) 
Example 77
Project: lexpredict-contraxsuite   Author: LexPredict   File: chunk_similarity_task.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def get_ngrams(self, texts: List[str]) -> Iterable[str]:
        """
        Make 1- to 3-word tuples out of the word list if self.term_type is 'WORD_3GRAMS',
        return the words themselves if self.term_type == 'WORDS',
        else return character ngrams ('CHAR_NGRAMS').
        :param texts: list of text strings
        :return: list of terms, e.g. ['word_1', 'word_1 word_2', 'word_1 word_2 word_3', 'word_4', ...]
        """
        all_ngrams = []

        if self.term_type == self.TERM_TYPE_CHAR_NGRAM:
            for text in texts:
                word_set = set()  # type: Set[str]
                wrd = ''
                for c in text:
                    wrd += c
                    if len(wrd) > self.char_ngrams_length:
                        wrd = wrd[1:]
                    word_set.add(wrd)
                for ngram in word_set:
                    all_ngrams.append(ngram)
            return all_ngrams

        if self.term_type == self.TERM_TYPE_WORDS:
            all_words = []
            for text in texts:
                word_set = set()  # type: Set[str]
                for wrd in self.reg_wordsplit.split(text):
                    if wrd:
                        word_set.add(wrd)
                for wrd in word_set:
                    all_words.append(wrd)
            return all_words

        # if self.term_type == self.TERM_TYPE_WORD_3GRAM:
        # We produce ngrams for a whole batch of documents at a time, so when the
        # vocabulary is built afterwards we want to avoid associating terms that
        # are not really associated.
        for text in texts:
            allterms = self.reg_wordsplit.split(text)
            ngram_set = set()  # type:Set[str]
            # create 1-grams, 2-grams and 3-grams and zip() them all.
            for g in zip(ngrams(allterms, 1), ngrams(allterms, 2), ngrams(allterms, 3)):
                for w in map(lambda wrd: ' '.join(wrd), g):
                    ngram_set.add(w)
            for w in ngram_set:
                all_ngrams.append(w)
        return all_ngrams 
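
What the zip() of 1-, 2- and 3-grams produces: one (unigram, bigram, trigram) triple per trigram position, so the stream stops with the trigrams (toy word list, illustrative only):

from nltk import ngrams

allterms = ['term', 'of', 'the', 'lease']
for g in zip(ngrams(allterms, 1), ngrams(allterms, 2), ngrams(allterms, 3)):
    print([' '.join(w) for w in g])
# ['term', 'term of', 'term of the']
# ['of', 'of the', 'of the lease']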
Example 78
Project: allennlp-semparse   Author: allenai   File: atis_tables.py    Apache License 2.0 4 votes vote down vote up
def get_date_from_utterance(tokenized_utterance: List[Token], year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """

    dates = []

    utterance = " ".join([token.text for token in tokenized_utterance])
    year_result = re.findall(r"199[0-4]", utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = " ".join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print("invalid month day")

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print("invalid month day")

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = " ".join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print("invalid month day")
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print("invalid month day")
    return dates 
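
A hedged, self-contained sketch of the trigram pass above, with toy lookup tables standing in for the module's MONTH_NUMBERS and DAY_NUMBERS:

from datetime import datetime
from nltk import ngrams

MONTHS = {'september': 9}        # toy stand-in for MONTH_NUMBERS
DAYS = {'twenty first': 21}      # toy stand-in for DAY_NUMBERS

tokens = ['show', 'flights', 'on', 'september', 'twenty', 'first']
dates = []
for month, tens, digit in ngrams(tokens, 3):
    day = ' '.join([tens, digit])
    if month in MONTHS and day in DAYS:
        dates.append(datetime(1993, MONTHS[month], DAYS[day]))
print(dates)
# [datetime.datetime(1993, 9, 21, 0, 0)]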
Example 79
Project: tensorflow-nlp-examples   Author: Hironsan   File: preprocessing.py    MIT License 4 votes vote down vote up
def transform(self, X, y=None):
        """Transform documents to document ids.

        Uses the vocabulary learned by fit.

        Args:
            X : iterable
            an iterable which yields either str, unicode or file objects.
            y : iterable, label strings.

        Returns:
            features: document id matrix.
            y: label id matrix.
        """
        mentions = []
        mentions_char = []
        left_contexts = []
        right_contexts = []
        outputs = []

        word_ids = [self._word_vocab.doc2id(doc) for doc in X]
        char_ids = [[self._char_vocab.doc2id(w) for w in doc] for doc in X]
        ngram_indices = []
        # slice the current sentence (not the whole batch of documents) when building spans
        for sent, sent_char_ids in zip(word_ids, char_ids):
            ngrams = self.generate_ngrams(sent, n=4)
            ngram_indices.append(ngrams)
            for l, r in ngrams:
                mentions.append(sent[l:r])
                mentions_char.append(sent_char_ids[l:r])
                left_contexts.append(sent[:l])
                right_contexts.append(sent[r:])

        if y is not None:
            for ngram, labels in zip(ngram_indices, y):
                d = {(begin_offset, end_offset + 1): t for t, begin_offset, end_offset in get_entities(labels)}
                for l, r in ngram:
                    if (l, r) in d:
                        outputs.append(self._label_vocab[d[(l, r)]])
                    else:
                        outputs.append(self._label_vocab)

        outputs = np.array(outputs)
        inputs = [np.array(left_contexts), np.array(mentions), np.array(mentions_char), np.array(right_contexts)]

        if y is not None:
            return inputs, outputs
        else:
            return inputs 
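
generate_ngrams() is not shown in this example; purely as a hypothetical sketch, a span generator of this kind could enumerate all (start, end) index pairs for spans up to length n:

from typing import List, Tuple

def generate_span_indices(sent: List[int], n: int = 4) -> List[Tuple[int, int]]:
    # hypothetical helper, not the project's implementation
    spans = []
    for length in range(1, n + 1):
        for start in range(len(sent) - length + 1):
            spans.append((start, start + length))
    return spans

print(generate_span_indices([11, 12, 13], n=2))
# [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3)]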
Example 80
Project: dl-with-constraints   Author: dair-iitd   File: atis_tables.py    MIT License 4 votes vote down vote up
def get_date_from_utterance(tokenized_utterance: List[Token],
                            year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """

    dates = []

    utterance = ' '.join([token.text for token in tokenized_utterance])
    year_result = re.findall(r'199[0-4]', utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')

    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print('invalid month day')
    return dates