Python nltk.pos_tag() Examples

The following are code examples showing how to use nltk.pos_tag(), taken from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.
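
As a quick orientation before the project examples, here is a minimal usage sketch of nltk.pos_tag() (assuming the "punkt" and "averaged_perceptron_tagger" NLTK data packages have been downloaded):

import nltk

# nltk.pos_tag() expects a list of tokens, so tokenize first.
tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging straightforward.")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...] as (token, Penn Treebank tag) pairs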

Example 1
Project: emojipastifier   Author: bennissan   File: emojipastifier.py    GNU General Public License v3.0 6 votes vote down vote up
def add_emoji(word):
    if nltk.pos_tag([word])[0][1] in pos_to_filter:
        return word

    stem = stemmer.stem(word)
    (match_name, match_value) = fuzzy_match.extractOne(stem, emoji_names)
    # Arbitrary match accuracy threshold: high enough to remain accurate, but low enough to catch funny homophones.
    if match_value < 75:
        return word
    
    emoji_name = match_name
    emoji = emojize(emoji_name)
    # Checks if emoji is a flag and ignores unless matched exactly; there are too many flag emojis!
    if any(flag in emoji.encode("unicode-escape").decode("ASCII") for flag in flags_to_filter) and match_value != 100:
        return word

    # Repeat emoji between one and three times for that classic emojipasta flair.
    return word + " " + random.randint(1, 3) * emoji 
Example 2
Project: natural-language-processing   Author: master-vic   File: mathieu.py    GNU General Public License v3.0 6 votes vote down vote up
def __init__(self, sentence):
        self.sentence = sentence

        self.forms = []
        for s in tuple(open(FORMS, "r")):  # read the user_forms from file
            self.forms.append([w for w in regexp_tokenize(s, "[-\w]+") if w.isalnum()])

        if self.is_valid():
            self.tokens = regexp_tokenize(self.sentence, "(\\$)|[-\w]+")  # tokenizing with regex
            self.stop_words = set(stop.words("english"))  # filtering tokens words to remove
            self.filtered = [w.lower() for w in self.tokens if w not in self.stop_words]  # remove stop words
            self.spell_checked = self.spell_check()
            self.tags = pos_tag(self.spell_checked, tagset="universal")  # speech tagging (identification)
            print(self.tags)
            self.digits = self.get_digits()
            self.user_form = self.get_user_form()

    # Get digits from the tags list 
Example 3
Project: isanlp   Author: IINemo   File: processor_postagger_nltk_en.py    MIT License 6 votes vote down vote up
def __call__(self, tokens, sentences):
        """Performs postagging.
        
        Args:
            tokens(list): List of Token objects.
            sentences(list): List of Sentence objects.
            
        Returns:
            List of lists (sentences) of strings that represent postag in 
            Penn Treebank format.
        """
        
        result = []
        for sent in sentences:
            result.append([e[1] for e in nltk.pos_tag([word.text for word in CSentence(tokens, sent)])])
            
        return result 
Example 4
Project: who-are-you   Author: PawelPamula   File: __init__.py    MIT License 6 votes vote down vote up
def tweets2tags(text, hasht):
    tx=[]
    for line in text:
        tokens=word_tokenize(line)
        tags=nltk.pos_tag(tokens)
        text= [s[0] for s in tags if s[1].startswith('NN')]
        tx.extend(text)
    vectorizer = TfidfVectorizer(stop_words="english",min_df=1)
    X = vectorizer.fit_transform(tx)
    idf = vectorizer.idf_
    size=len(idf)
    idf[:size/5]=2
    idf[size/5:2*size/5]=3
    idf[2*size/5:3*size/5]=4
    idf[3*size/5:4*size/5]=5
    idf[4*size/5:]=7
    tags =  dict(zip(vectorizer.get_feature_names(), idf))
    for i in hasht:
        tags[i] = 6
    return tags 
Example 5
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0 6 votes vote down vote up
def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation or stopword, ignore token and continue
                if token in self.stopwords or all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma 
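
The tokenizer above delegates to self.lemmatize(token, tag), which is not shown in this snippet. A minimal sketch of such a helper, assuming the usual Penn Treebank to WordNet tag mapping (an illustration, not the project's actual implementation):

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

def lemmatize(token, tag, lemmatizer=WordNetLemmatizer()):
    # Map the first letter of the Penn Treebank tag to a WordNet POS,
    # falling back to noun for anything unrecognised.
    wn_pos = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}.get(tag[0], wn.NOUN)
    return lemmatizer.lemmatize(token, wn_pos)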
Example 6
Project: d-bert   Author: castorini   File: augment_paired_data.py    MIT License 6 votes vote down vote up
def generate(tags, pos_dict, random_prob, mask_prob, window_prob, window_lengths):
    gen_words = []
    for word, pos_tag in tags:
        roll = random.random()
        if roll < random_prob:
            gen_words.append(random.choice(pos_dict[pos_tag]))
        elif roll < mask_prob + random_prob:
            gen_words.append("[MASK]")
        else:
            gen_words.append(word)
    if random.random() < window_prob:
        window_len = random.choice(window_lengths)
        try:
            idx = random.randrange(len(gen_words) - window_len)
            gen_words = gen_words[idx:idx + window_len]
        except ValueError:
            pass
    gen_sent = " ".join(gen_words)
    return gen_sent 
Example 7
Project: d-bert   Author: castorini   File: preprocess_nltk.py    MIT License 6 votes vote down vote up
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_file", type=str)
    parser.add_argument("--text_col", type=str, default="sentence")
    parser.add_argument("--output_file", type=str)
    parser.add_argument("--pos_tag_only", action="store_true")
    args = parser.parse_args()

    df = pd.read_csv(args.dataset_file, sep="\t")
    columns = list(df.columns)
    pos_tags_lst = []
    for text in tqdm(df[args.text_col]):
        word_toks = text.split(" ") if args.pos_tag_only else nltk.word_tokenize(text)
        pos_tags_lst.append(nltk.pos_tag(word_toks))
    df[f"{args.text_col}_pos"] = [" ".join([tag[1] for tag in pos_tags]) for pos_tags in pos_tags_lst]
    if not args.pos_tag_only:
        df[args.text_col] = [" ".join([tag[0] for tag in pos_tags]) for pos_tags in pos_tags_lst]
    if args.output_file is None:
        args.output_file = args.dataset_file
    df.to_csv(args.output_file, index=False, sep="\t") 
Example 8
Project: AutomaticEssayGrading   Author: SahilC   File: Features.py    MIT License 6 votes vote down vote up
def pos_counts(self,tokens):
        tags = nltk.pos_tag(tokens)
        for tag in tags:
            if tag[1].startswith("NN"):
#                self.noun_count += 1
                pass
            elif tag[1].startswith("JJ"):
                self.adj_count += 1
            elif tag[1].startswith("RB"):
                self.adv_count += 1
            elif tag[1].startswith("VB"):
#                self.verb_count += 1
                pass
            elif tag[1].startswith("FW"):
                pass
#                self.fw_count += 1 
Example 9
Project: AutomaticEssayGrading   Author: SahilC   File: Features.py    MIT License 6 votes vote down vote up
def pos_counts(self,tokens):
        tags = nltk.pos_tag(tokens)
        for tag in tags:
            if tag[1].startswith("NN"):
#                self.noun_count += 1
                pass
            elif tag[1].startswith("JJ"):
                self.adj_count += 1
            elif tag[1].startswith("RB"):
                self.adv_count += 1
            elif tag[1].startswith("VB"):
#                self.verb_count += 1
                pass
            elif tag[1].startswith("FW"):
                pass
#                self.fw_count += 1 
Example 10
Project: self-attentive-parser   Author: nikitakit   File: nltk_plugin.py    MIT License 6 votes vote down vote up
def _nltk_process_sents(self, sents):
        for sentence in sents:
            if isinstance(sentence, STRING_TYPES):
                if self._tokenizer_lang is None:
                    raise ValueError(
                        "No word tokenizer available for this language. "
                        "Please tokenize before calling the parser."
                        )
                sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

            if IS_PY2:
                sentence = [
                    word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                    for word in sentence
                    ]

            if not self._provides_tags:
                sentence = nltk.pos_tag(sentence)
                yield [word for word, tag in sentence], sentence
            else:
                yield sentence, sentence 
Example 11
Project: Automated-Essay-Grading   Author: vatika   File: feature_extractor.py    GNU General Public License v2.0 6 votes vote down vote up
def pos_features(self):
        '''
        parts of speech(noun adjectives verbs ....) counts
        '''
        s = {}
        freqs = [0 for i in xrange(0,len(pos_classes))]
        for sentence in self.essay_str.split('.'):
            try:
                tmp_tokens = nltk.word_tokenize(sentence)
                values = nltk.pos_tag(tmp_tokens)
                for v in values:
                    if v[1] in s:
                        s[v[1]] += 1
                    else:
                        s[v[1]] = 1
            except UnicodeDecodeError:
                continue
            except IndexError:
                continue
        for key,value in s.iteritems():
             for index, pos in enumerate(pos_classes):
                 if key in pos:
                     freqs[index] += value
                     break
        self.features.extend(freqs) 
Example 12
Project: sum-basic   Author: hardik-vala   File: sumbasic.py    MIT License 6 votes vote down vote up
def preprocess_sent(self, sent):
		"""
		Preprocesses a sentence by performing lowercasing, lemmatization, and
		stopword removal.

		@param sent - Sentence as a string
		@return Preprocessed sentence as a list of words
		"""

		lmtzr = WordNetLemmatizer()

		# Lowercase and tokenize words.
		words = word_tokenize(sent.lower())
		# Assign POS tags to words.
		words_pos = nltk.pos_tag(words)
		# Lemmatize.
		lm_words = []
		for w, p in words_pos:
			try:
				lm_words.append(lmtzr.lemmatize(w, pos=p[0].lower()))
			except KeyError:
				lm_words.append(lmtzr.lemmatize(w))

		# Remove stop words.
		return [w for w in lm_words if w not in self.STOPS] 
Example 13
Project: survey-mining   Author: BarbaraMcG   File: resofact_topic_extraction_27112016.py    GNU General Public License v3.0 6 votes vote down vote up
def lemmatize_text(string, wnl_var):
    tokens = nltk.word_tokenize(string)
    pos_tokens = nltk.pos_tag(tokens)
    # print "pos_tokens:" + str(pos_tokens)
    my_raw_lemmas = []
    for i in range(0, len(tokens)):
        token = tokens[i]
        pos = pos_tokens[i][1]
        # print "pos", pos, "token", token
        # my_lemma = wnl_var.lemmatize(token, pos='n')  # assume noun since that's what term algorithm returns
        if map_pos(pos) != 'other':
            my_lemma = wnl_var.lemmatize(token, map_pos(pos))
        else:
            my_lemma = wnl_var.lemmatize(token)

        my_raw_lemmas.append(my_lemma)
    # lemma_string = ' '.join(my_raw_lemmas)
    return my_raw_lemmas 
Example 14
Project: gender-bias   Author: gender-bias   File: document.py    MIT License 6 votes vote down vote up
def words_by_part_of_speech(self) -> dict:
        """
        Compute the parts of speech for each word in the document.

        Uses nltk.pos_tag.

        Returns:
            dict

        """
        words = self.words()
        tagged = nltk.pos_tag(words)
        categories = {}
        for type in {t[1] for t in tagged}:
            categories[type] = [t[0] for t in tagged if t[1] == type]
        return categories 
Example 15
Project: question_answering   Author: joswinkj   File: AnswerProcessing.py    Apache License 2.0 6 votes vote down vote up
def getBestAnswer(self,answer_type,method='first'):
        if method=='first':
            retWord=None
            for ind,ans in enumerate(self.answers_tag):
                for word,tag in ans:
                    if not retWord:
                        if tag==answer_type:
                            retInd = ind
                            retWord = word
                    else:
                        if ind==retInd and tag==answer_type:
                            retWord=retWord+' '+word
                        else:
                            return retWord,retInd
            return None,None
        elif method=='useRakeInd':
            print '###################Getting best answer#########################'
            #prob 1.2 billion...only 1.2 coming
            # pos_tag useful only for numbers, need Stanford NER tagger
            # pdb.set_trace()
            ind_answ,ret_word_top = self.get_rake_based_answ(answer_type)
            return ind_answ,ret_word_top 
Example 16
Project: nltk-book-2nd   Author: East196   File: chunking.py    Apache License 2.0 6 votes vote down vote up
def process_content():
	for word in tokenized:
		words = nltk.word_tokenize(word)
		tagged = nltk.pos_tag(words)
		
		# Use regular expression for chunking
		# "Include an adverb followed by a verb if there are any.
		# Then, require a proper noun (i.e. "Steve") followed by a
		# noun (i.e. "desk") if there is one. 
		chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""

		chunkParser = nltk.RegexpParser(chunkGram)
		chunked = chunkParser.parse(tagged)

		#print(chunked)
		chunked.draw() 
Example 17
Project: regex4dummies   Author: DarkmatterVale   File: phrase_extractor.py    MIT License 6 votes vote down vote up
def extract_prepositional_phrases(self, **kwargs):
        """
        Returns the prepositional phrases found
        within a text.
        """

        # Instantiating variables
        prepositional_phrases = []
        prepositional_phrase_extractor = NLTK()

        # Getting prepositional phrases
        prepositional_phrases = prepositional_phrase_extractor.find_prepositional_phrases(kwargs.get("text"), kwargs.get("text"), nltk.pos_tag( nltk.word_tokenize(str(kwargs.get("text"))))).split('...')
        prepositional_phrases = prepositional_phrases[0 : len(prepositional_phrases) - 1]

        # Normalizing the phrases
        prepositional_phrases = self.normalize_text(prepositional_phrases)

        # Returning the found prepositional_phrases
        return prepositional_phrases 
Example 18
Project: domain_discovery_API   Author: VIDA-NYU   File: get_mtermvectors_multi_prov.py    GNU General Public License v3.0 5 votes vote down vote up
def pos_filter(pos_tags=['NN', 'NNS', 'NNP', 'NNPS', 'VBN', 'JJ'], docterms=[]):
    tagged = nltk.pos_tag(docterms)
    valid_words = [tag[0] for tag in tagged if tag[1] in pos_tags]
    return valid_words 
Example 19
Project: domain_discovery_API   Author: VIDA-NYU   File: get_mtermvectors.py    GNU General Public License v3.0 5 votes vote down vote up
def pos_filter(pos_tags=['NN', 'NNS', 'NNP', 'NNPS', 'VBN', 'JJ'], docterms=[]):
    tagged = nltk.pos_tag(docterms)
    valid_words = [tag[0] for tag in tagged if tag[1] in pos_tags]
    return valid_words 
Example 20
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    MIT License 5 votes vote down vote up
def pos_tagging(targets):
    while True:
        words = (yield)
        tagged_words = nltk.pos_tag(words)

        for target in targets:
            target.send(tagged_words) 
Example 21
Project: tensorflow-XNN   Author: ChenglongChen   File: main.py    MIT License 5 votes vote down vote up
def lemmatize_sentence(sentence):
    res = []
    sentence_ = get_valid_words(sentence)
    for word, pos in pos_tag(sentence_):
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lemmatize_word(word, pos=wordnet_pos))
    return res 
Example 22
Project: Snowball   Author: davidsbatista   File: Tuple.py    GNU General Public License v3.0 5 votes vote down vote up
def construct_words_vectors(self, words, config):
        # split text into tokens and tag them using NLTK's default English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        text_tokens = word_tokenize(words)
        tags_ptb = pos_tag(text_tokens)
        pattern = [t[0] for t in tags_ptb if
                   t[0].lower() not in config.stopwords and t[1] not in self.filter_pos]
        if len(pattern) >= 1:
            vect_ids = self.config.vsm.dictionary.doc2bow(pattern)
            return self.config.vsm.tf_idf_model[vect_ids] 
Example 23
Project: IDEA   Author: armor-ai   File: main.py    MIT License 5 votes vote down vote up
def generate_labeling_candidates(OLDA_input):
    """
    Filter phrase labels and choose for candidates
    :param OLDA_input:
    :return:
    """
    phrases = {}
    for apk, item in OLDA_input.items():
        dic, _, _1, _2, _3= item
        phrases[apk] = defaultdict(int)
        # filter bigram and trigram
        for word in dic.values():
            if '_' in word:
                phrase = word
                words, tags = zip(*nltk.pos_tag(phrase.split(b'_')))
                match = False
                for tag in tags:
                    if re.match(r"^NN", tag):
                        match = True
                        continue
                    if re.match(r"DT", tag):
                        match = False
                        break
                    if re.match(r"RB", tag):
                        match = False
                        break
                for word in words:
                    if word in stopwords.words('english') + my_stoplst:     # remove stop word
                        match = False
                        break
                    if len(word) < 3:
                        match = False
                        break
                    if "\\'" in word:
                        match = False
                        break
                if match:
                    # keep phrase
                    phrases[apk][phrase] = 1
    return phrases 
Example 24
Project: simple_np   Author: korobool   File: np_extractor.py    MIT License 5 votes vote down vote up
def _tokenize(text):
    return pos_tag(word_tokenize(text)) 
Example 25
Project: geograpy2   Author: Corollarium   File: extraction.py    MIT License 5 votes vote down vote up
def named_entities(self):
        # word_tokenize should work well for most non-CJK languages
        text = nltk.word_tokenize(self.text)
        
        # TODO: this works only for english. Stanford's pos tagger supports
        # more languages
        # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        # http://stackoverflow.com/questions/1639855/pos-tagging-in-german
        # PT corpus http://aelius.sourceforge.net/manual.html
        # 
        pos_tag = nltk.pos_tag(text)
        
        nes = nltk.ne_chunk(pos_tag)
        return nes 
Example 26
Project: who-are-you   Author: PawelPamula   File: tst.py    MIT License 5 votes vote down vote up
def tweets2tags(text,hasht):
        tx=[]
        for line in text:
            tokens=word_tokenize(line)
            tags=nltk.pos_tag(tokens)
            text= [s[0] for s in tags if s[1].startswith('NN')]
            tx.extend(text)
Example 27
Project: EliIE   Author: Tian312   File: POS.py    MIT License 5 votes vote down vote up
def pos_tagging(term_list):
    #print term_list
    term_list_new=list()
    for term in term_list:
        term=term.decode('utf-8', 'ignore')
        term_list_new.append(term)
    tag=nltk.pos_tag(term_list_new)
    t=list()
    for ta in tag:
       t.append(ta[1])
    return t 
Example 28
Project: chowmein   Author: xiaohan2012   File: corpus_processor.py    MIT License 5 votes vote down vote up
def __init__(self, pos_tag_func=nltk.pos_tag):
        """
        Parameter:
        --------------
        pos_tag_func: pos_tag function that accepts list of tokens
            and POS tag them
        """
        self._pos_tag_func = pos_tag_func 
Example 29
Project: pyontutils   Author: tgbugs   File: nltklib.py    MIT License 5 votes vote down vote up
def get_tokenized_sentence(sentence):
    # Tokenize and tag
    sentence = pos_tag(word_tokenize(sentence))
    # Get the synsets for the tagged words
    synsets = []
    for tagged_word in sentence:
        synset = tagged_to_synset(*tagged_word)
        if synset:
            synsets.append(synset)
        else:
            synsets.append(tagged_word[0])
    return synsets # str(sorted(synsets)) 
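
tagged_to_synset is referenced above (and again in the sentence_similarity example later on) but is not included in the snippet. A commonly seen definition, given here as an assumption rather than the project's actual code, maps the Penn Treebank tag to a WordNet POS and returns the first matching synset:

from nltk.corpus import wordnet as wn

def tagged_to_synset(word, tag):
    # Convert the Penn Treebank tag to a WordNet POS; give up on other tags.
    wn_tag = {'N': wn.NOUN, 'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}.get(tag[0])
    if wn_tag is None:
        return None
    synsets = wn.synsets(word, wn_tag)
    return synsets[0] if synsets else None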
Example 30
Project: DeepLearn   Author: GauravBh1010tt   File: rd_ft.py    MIT License 5 votes vote down vote up
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return nouns

#Average Edit Distance Value For Two String And The Average Edit Distance Between The Nouns Present In Them(Returns Float) 
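
The comment above describes a helper that is not part of the snippet. A hypothetical sketch of such a function, built on nltk.edit_distance and the nouns() helper defined above (the name avg_edit_distance and the exact averaging are assumptions, not the project's code):

import nltk

def avg_edit_distance(text1, text2):
    # Average of the edit distance between the raw strings and the edit
    # distance between the space-joined nouns extracted from each string.
    d_full = nltk.edit_distance(text1, text2)
    d_nouns = nltk.edit_distance(" ".join(nouns(text1)), " ".join(nouns(text2)))
    return (d_full + d_nouns) / 2.0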
Example 31
Project: QAUniBonn   Author: jtrillos   File: nerQuestion.py    Apache License 2.0 5 votes vote down vote up
def extract_entity_question (question):

    sample = question
    sentences = nltk.sent_tokenize(sample) #split in to sentences
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences] #split in to words
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences] #tag sentences with NN, NNP, etc
    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

    entity_names = []
    for tree in chunked_sentences:
        # Print result tree
        # print tree
        # Print results per sentence
        # print extract_entity_names(tree)

        entity_names.extend(extract_entity_names(tree))

    # Print all entity names
    # print entity_names

    # Remove incorrect entity "which"
    if 'Which' in entity_names:
        entity_names.remove('Which')
    if 'which' in entity_names:
        entity_names.remove('which')

    # Print unique entity names
    # print set(entity_names)
    return entity_names 
Example 32
Project: newsclouds-engine   Author: inmagik   File: clouds.py    MIT License 5 votes vote down vote up
def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if i in common_articleswords:
            continue

        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    if continuous_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)

    return continuous_chunk 
Example 33
Project: comparable-text-miner   Author: motazsaad   File: textpro.py    Apache License 2.0 5 votes vote down vote up
def getLemma(text, contextFlag=False):
	lemmatizer = WordNetLemmatizer()
	#'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV
	wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
	result = None
	if len(text.split()) == 1: # one word
		tokenized = word_tokenize(text)
		tagged = pos_tag(tokenized)[0]
		lemma = ''
		try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
		except: lemma = lemmatizer.lemmatize(tagged[0])
		result = lemma
	elif len(text.split()) > 1 and contextFlag == True: # multiple words i.e. text and without considering the context
		resultList = []
		for t in text.split():
			tokenized = word_tokenize(t)
			tagged = pos_tag(tokenized)[0]
			lemma = ''
			try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
			except: lemma = lemmatizer.lemmatize(tagged[0])
			resultList.append(lemma)
		result = ' '.join(resultList)
	else: # multiple words i.e. text and consider the context
		resultList = []
		tokens = word_tokenize(text)
		tagged = pos_tag(tokens)
		for t in tagged:
			try: resultList.append(lemmatizer.lemmatize(t[0],wordnet_tag[t[1][:2]]))
			except: resultList.append(lemmatizer.lemmatize(t[0]))
		result = ' '.join(resultList)
	return result
###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example 34
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    Apache License 2.0 5 votes vote down vote up
def pos_tag(docs, language=None, tagger_instance=None, doc_meta_key=None):
    """
    Apply Part-of-Speech (POS) tagging to a list of documents `docs`. Either load a tagger based on the supplied
    `language` or use the tagger instance `tagger_instance`, which must have a method ``tag()``. A tagger can be
    loaded via :func:`~tmtoolkit.preprocess.load_pos_tagger_for_language`.

    POS tagging so far only works for English and German. The English tagger uses the Penn Treebank tagset
    (https://ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), the
    German tagger uses STTS (http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/TagSets/stts-table.html).

    :param docs: list of tokenized documents
    :param language: the language for the POS tagger (currently only "english" and "german" are supported) if no
                    `tagger_instance` is given
    :param tagger_instance: a tagger instance to use for tagging if no `language` is given
    :param doc_meta_key: if this is not None, it must be a string that specifies the key that is used for the
                         resulting dicts
    :return: if `doc_meta_key` is None, return a list of N lists, where N is the number of documents; each of these
             lists contains the POS tags for the respective tokens from `docs`, hence each POS list has the same length
             as the respective token list of the corresponding document; if `doc_meta_key` is not None, the result list
             contains dicts with the only key `doc_meta_key` that maps to the list of POS tags for the corresponding
             document
    """
    require_listlike(docs)

    if tagger_instance is None:
        tagger_instance, _ = load_pos_tagger_for_language(language or defaults.language)

    docs_meta = []
    for dtok in docs:
        if len(dtok) > 0:
            tokens_and_tags = tagger_instance.tag(dtok)
            tags = list(list(zip(*tokens_and_tags))[1])
        else:
            tags = []

        if doc_meta_key:
            docs_meta.append({doc_meta_key: tags})
        else:
            docs_meta.append(tags)

    return docs_meta 
Example 35
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    Apache License 2.0 5 votes vote down vote up
def lemmatize(docs, docs_meta, language=None, lemmatizer_fn=None):
    """
    Lemmatize documents according to `language` or use a custom lemmatizer function `lemmatizer_fn`.

    :param docs: list of tokenized documents
    :param docs_meta: list of meta data for each document in `docs` or list of POS tags per document; for option 1,
                      each element at index ``i`` is a dict containing the meta data for document ``i`` and each dict
                      must contain an element ``meta_pos`` with a list containing a POS tag for each token in the
                      respective document; for option 2, `docs_meta` is a list of POS tags for each document as coming
                      from :func:`~tmtoolkit.preprocess.pos_tag`
    :param language: the language for which the lemmatizer should be loaded
    :param lemmatizer_fn: alternatively, use this lemmatizer function; this function should accept a tuple consisting
                          of (token, POS tag)
    :return: list of processed documents
    """
    require_listlike(docs)

    if len(docs) != len(docs_meta):
        raise ValueError('`docs` and `docs_meta` must have the same length')

    if lemmatizer_fn is None:
        lemmatizer_fn = load_lemmatizer_for_language(language or defaults.language)

    new_tokens = []
    for i, (dtok, dmeta) in enumerate(zip(docs, docs_meta)):
        if isinstance(dmeta, dict):
            if 'meta_pos' not in dmeta:
                raise ValueError('no POS meta data for document #%d' % i)
            dpos = dmeta['meta_pos']
        else:
            dpos = dmeta

        if not isinstance(dpos, (list, tuple)) or len(dpos) != len(dtok):
            raise ValueError('provided POS tags for document #%d are invalid (no list/tuple and/or not of the same '
                             'length as the document)' % i)

        new_tokens.append(list(map(lemmatizer_fn, zip(dtok, dpos))))

    return new_tokens 
Example 36
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    Apache License 2.0 5 votes vote down vote up
def tag(tokens):
        return nltk.pos_tag(tokens) 
Example 37
Project: bianalyzer   Author: luntos   File: texts.py    MIT License 5 votes vote down vote up
def tag_tokens(self):
        self.tagged_tokens = nltk.pos_tag(self.all_tokens)
        return self.tagged_tokens 
Example 38
Project: bianalyzer   Author: luntos   File: texts.py    MIT License 5 votes vote down vote up
def _tokenize_text(self, tag_pos=False, strip_stop_words=True, use_pos_filter=False):
        tokens = nltk.word_tokenize(self.text.lower())
        self.all_tokens = tokens
        if tag_pos:
            self.tagged_tokens = nltk.pos_tag(tokens)
        words = self._filter_tokens(tokens, use_pos_filter, strip_stop_words)

        return words 
Example 39
Project: bianalyzer   Author: luntos   File: texts.py    MIT License 5 votes vote down vote up
def _filter_tokens(self, tokens, filter_stop_words, use_pos_filter):
        filtered_words = []
        for word in tokens:
            word_fits = re.match('^[a-zA-Z\'-]{2,}$', word) is not None
            if use_pos_filter:
                word, pos = nltk.pos_tag([word])[0]
                word_fits = word_fits and (pos in self.pos_list)
            if filter_stop_words:
                ind = bisect_left(sorted_stop_words, word)
                word_fits = word_fits and (ind >= len(sorted_stop_words) or sorted_stop_words[ind] != word)

            if word_fits:
                filtered_words.append(word)

        return filtered_words 
Example 40
Project: arxiv2018-bayesian-ensembles   Author: UKPLab   File: hmm.py    Apache License 2.0 5 votes vote down vote up
def get_tag_t(tagger, sen, features):
    words = []
    for i in sen:
        words.append(i.word)
    x = nltk.pos_tag(words)
    x = [crf.word2features(x, i) for i in range(len(x))]
    tags = tagger.tag(x)
    return list(map(int, tags)) 
Example 41
Project: blabbr   Author: bfontaine   File: model.py    MIT License 5 votes vote down vote up
def word_split(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        words = [self.tag_sep.join((tag, word))
                 for word, tag in nltk.pos_tag(words) if word]
        return words 
Example 42
Project: democraciv-discord-bot   Author: jonasbohmann   File: law_helper.py    MIT License 5 votes vote down vote up
def generate_law_tags(google_docs_description: str, author_description: str):

        is_noun = lambda pos: pos[:2] == 'NN'

        tokenized_docs_description = nltk.word_tokenize(google_docs_description)

        tokenized_author_description = nltk.word_tokenize(author_description)

        tags = [word for (word, pos) in nltk.pos_tag(tokenized_docs_description) +
                nltk.pos_tag(tokenized_author_description) if is_noun(pos)]

        tags = list(set(tags))

        return tags 
Example 43
Project: BuildingMachineLearning   Author: ademyanchuk   File: PosTagFreqVectorizer.py    MIT License 5 votes vote down vote up
def update(self, other):
        """Adds counts for elements in other"""
        if isinstance(other, self.__class__):
            self.n_sents += other.n_sents
            for x, n in other.items():
                self[x] += n
        else:
            for sent in other:
                self.n_sents += 1

                if self.poscache is not None:
                    if sent in self.poscache:
                        tags = self.poscache[sent]
                    else:
                        self.poscache[sent] = tags = nltk.pos_tag(
                            nltk.word_tokenize(sent))
                else:
                    tags = nltk.pos_tag(nltk.word_tokenize(sent))

                for x in tags:
                    tok, tag = x
                    self[tag] += 1

            if self.normalize:
                for x, n in self.items():
                    self[x] /= float(self.n_sents) 
Example 44
Project: transformer_chatbot_experiments   Author: atselousov   File: postprocessing.py    GNU Affero General Public License v3.0 5 votes vote down vote up
def augment_replica(seq):
    _exceptions = ['your', 'persona']
    pos2wn = {'NN': wordnet.NOUN,
              'JJ': wordnet.ADJ,
              'VBP': wordnet.VERB,
              'RB': wordnet.ADV}

    synonyms = defaultdict(list)

    tagged_seq = seq.replace('i ', 'I ')
    tagged_seq = nltk.pos_tag(nltk.word_tokenize(tagged_seq))

    for word, pos in tagged_seq:
        if pos not in pos2wn or word in _exceptions:
            continue

        pos = pos2wn[pos]
        synnets = wordnet.synsets(word, pos=pos)

        for synnet in synnets:
            for syn in synnet.lemma_names():
                if syn != word:
                    synonyms[word].append(syn.replace('_', ' '))
            break
    if synonyms:
        for key, values in synonyms.items():
            seq = seq.replace(key, random.choice(list(values)))

    return seq 
Example 45
Project: ijcai2019-relis   Author: UKPLab   File: data_helpers.py    MIT License 5 votes vote down vote up
def get_phrases(sentence, phrase_type, language):
    tokens = sent2stokens(sentence, language, lower='False')
    tagged_sentence = pos_tag(tokens)
    tags = [tag for _, tag in tagged_sentence if re.match(r'NN.*|V.*|RB|JJ', tag)]

    phrases = []
    if phrase_type == 'entities':
        chunked_sentence = ne_chunk_sents([tagged_sentence], binary=True)
        for tree in chunked_sentence:
            phrases.extend(extract_entity_names(tree))    
    return phrases 
Example 46
Project: question_answering   Author: joswinkj   File: AnswerProcessing.py    Apache License 2.0 5 votes vote down vote up
def stringProcessing(self,only_query=1):
        ''' query is a string, answers is a list of strings. returns tuples with tags, with a list covering '''
        #pdb.set_trace()
        query_posTag = [pos_tag(word_tokenize(self.query))]
        query_nerTag = [AnswerProcessor.stanford_NER_tagger(self.query)]
        self.query_tag = AnswerProcessor.PosToNer(query_posTag,query_nerTag)
        if only_query==0:
            answers_posTag = [pos_tag(word_tokenize(answer)) for answer in self.answers]
            answers_nerTag = [AnswerProcessor.stanford_NER_tagger(answer) for answer in self.answers]
            self.answers_tag = AnswerProcessor.PosToNer(answers_posTag,answers_nerTag) 
Example 47
Project: question_answering   Author: joswinkj   File: AnswerProcessing.py    Apache License 2.0 5 votes vote down vote up
def get_answer(self):
        '''
         uses answers and query to get the best answer
        '''
        #self.query_tag = pos_tag(self.query)
        self.string_cleaning()
        answer_type=self.getAnswerType()
        return self.get_best_answer_rake(answer_type) 
Example 48
Project: question_answering   Author: joswinkj   File: textProcessing.py    Apache License 2.0 5 votes vote down vote up
def load_query(self,text,tag=1):
        remove_list = ['what','where','when','who','why','how much','how many','how long','how']
        for i in remove_list:
            text=text.replace(i,'')
        text = changeToMatrix.removeStopWords([text])[0]
        if tag==1:
            text_tagged = pos_tag(word_tokenize(text))
            print text_tagged
            nouns=''
            notnouns=''
            for wrd,tkn in text_tagged:
                if tkn[0]=='N' or tkn[0]=='J' or tkn[0]=='V' or tkn[0]=='R':
                    nouns = nouns+' '+wrd
                else:
                    notnouns = notnouns+' '+wrd
        else:
            nouns = text
            notnouns = ''
        all_words=changeToMatrix.getSynonyms(nouns+' '+notnouns,useNouns=1) #will take time. synonyms of ref text is another option
        # print nouns+'\n'+notnouns
        chk_nouns=pd.DataFrame(self.vectorizer.transform([nouns]).toarray(),columns=self.vectorizer.get_feature_names()).to_sparse()
        chk_all=pd.DataFrame(self.vectorizer.transform([all_words]).toarray(),columns=self.vectorizer.get_feature_names()).to_sparse()
        #pdb.set_trace()
        #print np.sum(chk2.values)
        #chk1=pd.DataFrame(self.vectorizer1.transform([text]).toarray(),columns=self.vectorizer1.get_feature_names())
        return [chk_nouns,chk_all] 
Example 49
Project: question_answering   Author: joswinkj   File: textProcessing.py    Apache License 2.0 5 votes vote down vote up
def getSynonyms(sentence,useNouns=0):
        '''
        :param sentence:single sentence
        :param useNouns: add synonyms for nouns. if 1, synonyms of nouns also will be used
        :return: sentence with synonyms
        '''
        sent1 = word_tokenize(sentence)
        sent_tagged = pos_tag(sent1)
        words2= list(set(list({l.name() for word,tag in sent_tagged if tag[0]!='N' or useNouns==1 for s in wn.synsets(word) for l in s.lemmas()})+sent1))
        sent2 = ' '.join(words2)
        return sent2 
Example 50
Project: event-extraction   Author: nlpcl-lab   File: Dataset_Trigger.py    Apache License 2.0 5 votes vote down vote up
def next_train_data(self):
        batch_instances = self.next_batch()
        pos_tag, y, x, c, pos_c = [list() for _ in range(5)]

        for instance in batch_instances:
            words = instance['words']
            pos_taggings = instance['pos_taggings']
            marks = instance['marks']
            label = instance['label']

            index_candidates = find_candidates(marks, ['B'])
            assert (len(index_candidates)) == 1

            y.append(label)
            marks = marks + ['A'] * (self.max_sequence_length - len(marks))
            words = words + ['<eos>'] * (self.max_sequence_length - len(words))
            pos_taggings = pos_taggings + ['*'] * (self.max_sequence_length - len(pos_taggings))
            pos_taggings = list(map(lambda x: self.pos_taggings_id[x], pos_taggings))
            pos_tag.append(pos_taggings)
            index_words = list(map(lambda x: self.word_id[x], words))
            x.append(index_words)
            pos_candidate = [i for i in range(-index_candidates[0], 0)] + [i for i in range(0, self.max_sequence_length - index_candidates[0])]
            pos_c.append(pos_candidate)
            c.append([index_words[index_candidates[0]]] * self.max_sequence_length)
            assert len(words) == len(marks) == len(pos_taggings) == len(index_words) == len(pos_candidate)

        assert len(y) == len(x) == len(c) == len(pos_c) == len(pos_tag)
        return x, c, one_hot(y, self.label_id, len(self.all_labels)), pos_c, pos_tag 
Example 51
Project: event-extraction   Author: nlpcl-lab   File: Dataset_Trigger.py    Apache License 2.0 5 votes vote down vote up
def next_eval_data(self):
        batch_instances = self.eval_instances
        pos_tag, y, x, c, pos_c = [list() for _ in range(5)]

        for instance in batch_instances:
            words = instance['words']
            pos_taggings = instance['pos_taggings']
            marks = instance['marks']
            label = instance['label']

            index_candidates = find_candidates(marks, ['B'])
            assert (len(index_candidates)) == 1

            y.append(label)
            marks = marks + ['A'] * (self.max_sequence_length - len(marks))
            words = words + ['<eos>'] * (self.max_sequence_length - len(words))
            pos_taggings = pos_taggings + ['*'] * (self.max_sequence_length - len(pos_taggings))
            pos_taggings = list(map(lambda x: self.pos_taggings_id[x], pos_taggings))
            pos_tag.append(pos_taggings)
            index_words = list(map(lambda x: self.word_id[x], words))
            x.append(index_words)
            pos_candidate = [i for i in range(-index_candidates[0], 0)] + [i for i in range(0,
                                                                                            self.max_sequence_length -
                                                                                            index_candidates[0])]
            pos_c.append(pos_candidate)
            c.append([index_words[index_candidates[0]]] * self.max_sequence_length)
            assert len(words) == len(marks) == len(pos_taggings) == len(index_words) == len(pos_candidate)
        assert len(y) == len(x) == len(c) == len(pos_c) == len(pos_tag)
        return x, c, one_hot(y, self.label_id, len(self.all_labels)), pos_c, pos_tag 
Example 52
Project: event-extraction   Author: nlpcl-lab   File: Dataset_Trigger.py    Apache License 2.0 5 votes vote down vote up
def next_valid_data(self):
        batch_instances = self.valid_instances
        pos_tag, y, x, c, pos_c = [list() for _ in range(5)]

        for instance in batch_instances:
            words = instance['words']
            pos_taggings = instance['pos_taggings']
            marks = instance['marks']
            label = instance['label']

            index_candidates = find_candidates(marks, ['B'])
            assert (len(index_candidates)) == 1

            y.append(label)
            marks = marks + ['A'] * (self.max_sequence_length - len(marks))
            words = words + ['<eos>'] * (self.max_sequence_length - len(words))
            pos_taggings = pos_taggings + ['*'] * (self.max_sequence_length - len(pos_taggings))
            pos_taggings = list(map(lambda x: self.pos_taggings_id[x], pos_taggings))
            pos_tag.append(pos_taggings)
            index_words = list(map(lambda x: self.word_id[x], words))
            x.append(index_words)
            pos_candidate = [i for i in range(-index_candidates[0], 0)] + [i for i in range(0,
                                                                                            self.max_sequence_length -
                                                                                            index_candidates[0])]
            pos_c.append(pos_candidate)
            c.append([index_words[index_candidates[0]]] * self.max_sequence_length)
            assert len(words) == len(marks) == len(pos_taggings) == len(index_words) == len(pos_candidate)
        assert len(y) == len(x) == len(c) == len(pos_c) == len(pos_tag)
        return x, c, one_hot(y, self.label_id, len(self.all_labels)), pos_c, pos_tag 
Example 53
Project: gender_analysis   Author: dhmit   File: document.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def get_part_of_speech_tags(self):
        """
        Returns the part of speech tags as a list of tuples. The first part of each tuple is the
        term, the second one the part of speech tag.

        Note: the same word can have different part of speech tags. In the example below,
        see "refuse" and "permit".

        :return: List of tuples (term, speech_tag)

        >>> from gender_analysis.document import Document
        >>> from pathlib import Path
        >>> from gender_analysis import common
        >>> document_metadata = {'author': 'Hawthorne, Nathaniel', 'title': 'Scarlet Letter', 'date': '1900',
        ...                   'filename': 'test_text_13.txt', 'filepath': Path(common.TEST_DATA_PATH, 'document_test_files', 'test_text_13.txt')}
        >>> document = Document(document_metadata)
        >>> document.get_part_of_speech_tags()[:4]
        [('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB')]
        >>> document.get_part_of_speech_tags()[-4:]
        [('the', 'DT'), ('refuse', 'NN'), ('permit', 'NN'), ('.', '.')]

        """

        common.download_nltk_package_if_not_present('tokenizers/punkt')
        common.download_nltk_package_if_not_present('taggers/averaged_perceptron_tagger')

        text = nltk.word_tokenize(self.text)
        pos_tags = nltk.pos_tag(text)
        return pos_tags 
Example 54
Project: gender_analysis   Author: dhmit   File: gender_frequency.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def get_counts_by_pos(freqs):
    """
    This function returns a dictionary where each key is a part of speech tag (e.g. 'NN' for nouns)
    and the value is a counter object of words of that part of speech and their frequencies.
    It also filters out words like "is", "the". We used `nltk`'s stop words function for filtering.

    :param freqs: Counter object of words mapped to their word count
    :return: dictionary with key as part of speech, value as Counter object of words
        (of that part of speech) mapped to their word count

    >>> get_counts_by_pos(Counter({'baked':1,'chair':3,'swimming':4}))
    {'VBN': Counter({'baked': 1}), 'NN': Counter({'chair': 3}), 'VBG': Counter({'swimming': 4})}
    >>> get_counts_by_pos(Counter({'is':10,'usually':7,'quietly':42}))
    {'RB': Counter({'quietly': 42, 'usually': 7})}

    """
    common.download_nltk_package_if_not_present('corpora/stopwords')

    sorted_words = {}
    # for each word in the counter
    for word in freqs.keys():
        # filter out if in nltk's list of stop words, e.g. 'is', 'the'
        stop_words = set(nltk.corpus.stopwords.words('english'))
        if word not in stop_words:
            # get its part of speech tag from nltk's pos_tag function
            tag = nltk.pos_tag([word])[0][1]
            # add that word to the counter object in the relevant dict entry
            if tag not in sorted_words.keys():
                sorted_words[tag] = Counter({word:freqs[word]})
            else:
                sorted_words[tag].update({word: freqs[word]})
    return sorted_words 
Example 55
Project: fact-join   Author: mouse-reeve   File: factMaker.py    MIT License 5 votes vote down vote up
def get_pos_tags(sentences):
    ''' uses nltk to tag part of speech for each sentence '''
    tagged = []
    for sentence in sentences:
        tags = pos_tag(word_tokenize(sentence))

        # throw out sentenes with no verb
        verbs = [word for word in tags if word[1].startswith('VB')]
        if len(verbs):
            tagged.append(tags)
    return tagged 
Example 56
Project: jgtextrank   Author: jerrygaoLondon   File: segmentation.py    MIT License 5 votes vote down vote up
def pos_tagging():
    """
    Use NLTK's currently recommended part of speech tagger to tag the given list of tokens.

    Usage:    pos_tag(word_tokenize(sent_content))
    :return:
    """

    return  pos_tag 
Example 57
Project: Supply-Chain-Risk-Management   Author: menikhilpandey   File: Text_Classifier.py    Apache License 2.0 5 votes vote down vote up
def converta(self,s):
        words = nltk.word_tokenize(s)
        tags = nltk.pos_tag(words)
        nouns = [t[0] for t in tags if (t[1]=="NNP" or t[1]=="NN")]
        synsets = [wn.synsets(n)[0] for n in nouns if len(wn.synsets(n))>0]
        hypernyms  = [s.hypernyms() for s in synsets if len(s.hypernyms())>0]
        #n_s = [(wn.synsets(n)[0]).hypernyms() for n in nouns if len((wn.synsets(n)[0]).hypernyms())>0]
        out = []
        for n in hypernyms:
            for x in n:
                out.append(str(x))
        return " ".join(out) 
Example 58
Project: Supply-Chain-Risk-Management   Author: menikhilpandey   File: test.py    Apache License 2.0 5 votes vote down vote up
def converta(s):
    words = nltk.word_tokenize(s)
    tags = nltk.pos_tag(words)
    nouns = [t[0] for t in tags if (t[1]=="NNP" or t[1]=="NN")]
    synsets = [wn.synsets(n)[0] for n in nouns if len(wn.synsets(n))>0]
    hypernyms  = [s.hypernyms() for s in synsets if len(s.hypernyms())>0]
    #n_s = [(wn.synsets(n)[0]).hypernyms() for n in nouns if len((wn.synsets(n)[0]).hypernyms())>0]
    out = []
    for n in hypernyms:
        for x in n:
            out.append(str(x))
    print " ".join(out) 
Example 59
Project: Supply-Chain-Risk-Management   Author: menikhilpandey   File: postagger.py    Apache License 2.0 5 votes vote down vote up
def noun_extractor(text):
    
    tokenize_text = nltk.word_tokenize(text)
    tag_sequence =  nltk.pos_tag(tokenize_text)
    #print tag_sequence
    noun_list = [item[0] for item in tag_sequence if item[1].startswith("N")]
    #print (' ').join(noun_list)
    return (' ').join(noun_list)

# text = "We are going out. Just you and me."
# print     noun_extractor(text) 
Example 60
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: PosTagFreqVectorizer.py    MIT License 5 votes vote down vote up
def update(self, other):
        """Adds counts for elements in other"""
        if isinstance(other, self.__class__):
            self.n_sents += other.n_sents
            for x, n in other.items():
                self[x] += n
        else:
            for sent in other:
                self.n_sents += 1

                if self.poscache is not None:
                    if sent in self.poscache:
                        tags = self.poscache[sent]
                    else:
                        self.poscache[sent] = tags = nltk.pos_tag(
                            nltk.word_tokenize(sent))
                else:
                    tags = nltk.pos_tag(nltk.word_tokenize(sent))

                for x in tags:
                    tok, tag = x
                    self[tag] += 1

            if self.normalize:
                for x, n in self.items():
                    self[x] /= float(self.n_sents) 
Example 61
Project: textrank   Author: acatovic   File: data.py    MIT License 5 votes vote down vote up
def filter_words(sentence):
    filtered_sentence = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag not in tags:
            continue
        
        if word.lower() in stop_words:
            continue
        
        filtered_sentence.append(word)
    
    return filtered_sentence 
Example 62
Project: nltk-book-2nd   Author: East196   File: name-entity-recognition.py    Apache License 2.0 5 votes vote down vote up
def process_content():
	for word in tokenized[5:]:
		words = nltk.word_tokenize(word)
		tagged = nltk.pos_tag(words)

		namedEnt = nltk.ne_chunk(tagged, binary=True)

		namedEnt.draw() 
Example 63
Project: nltk-book-2nd   Author: East196   File: chinking.py    Apache License 2.0 5 votes vote down vote up
def process_content():
	for word in tokenized[5:]:
		words = nltk.word_tokenize(word)
		tagged = nltk.pos_tag(words)
		
		# Remove verbs, prepositions, determiners, or the
		# word "to" from the chunks.
		chunkGram = r"""Chunk: {<.*>+}
								}<VB.?|IN|DT|TO>+{"""

		chunkParser = nltk.RegexpParser(chunkGram)
		chunked = chunkParser.parse(tagged)

		#print(chunked)
		chunked.draw() 
Example 64
Project: nltk-book-2nd   Author: East196   File: part-of-speech-tagging.py    Apache License 2.0 5 votes vote down vote up
def process_content():
	try:
		for word in tokenized:
			words = nltk.word_tokenize(word)
			tagged = nltk.pos_tag(words)
			print(tagged)

	except Exception as e:
		print(str(e)) 
Example 65
Project: regex4dummies   Author: DarkmatterVale   File: nltk_parser.py    MIT License 5 votes vote down vote up
def tokenize(self, tokenize_string):
        """
        Returns the tokenized version of tokenize_string, which is just
        a normal English sentence.
        """

        return nltk.pos_tag(nltk.word_tokenize(tokenize_string)) 
Example 66
Project: RTX   Author: RTXteam   File: WordnetDistance.py    MIT License 4 votes vote down vote up
def sentence_similarity(sentence1, sentence2):
	"""
	Compute sentence similarity based on WordNet
	:param sentence1: input string
	:param sentence2: input string
	:return: float between 0 and 1 giving similarity of sentences
	"""
	# Tokenize and tag
	sentence1_tagged = pos_tag(word_tokenize(sentence1))
	sentence2_tagged = pos_tag(word_tokenize(sentence2))

	# Get the synsets for the tagged words
	synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1_tagged]
	synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2_tagged]

	# Filter out the Nones
	synsets1 = [ss for ss in synsets1 if ss]
	synsets2 = [ss for ss in synsets2 if ss]

	score, count = 0.0, 0

	# For each word in the first sentence
	for synset in synsets1:
		# Get the similarity value of the most similar word in the other sentence
		vals = [synset.path_similarity(ss) for ss in synsets2]
		best_score = -1
		# Take max ignoring None's
		for val in vals:
			if val:
				if val > best_score:
					best_score = val
		if best_score == -1:
			best_score = None

		# Check that the similarity could have been computed
		if best_score is not None:
			score += best_score
			count += 1

	# Average the values
	if count != 0:
		score /= count
		#score /= (len(sentence1) + len(sentence2)) / 2.0  # divide by the mean sentence length
	else:
		score = 0.0

	# If the number of synset's is small, no confidence in similarity
	if count <= 3:
		sentence1_set = set([i.lower() for i in word_tokenize(sentence1)])
		sentence2_set = set([i.lower() for i in word_tokenize(sentence2)])
		jaccard = len(sentence1_set.intersection(sentence2_set)) / float(len(sentence1_set.union(sentence2_set)))
		score = jaccard
	#return max(score, jaccard)
	return score 
Example 67
Project: chat-simulator   Author: Shikib   File: shikib_bot.py    MIT License 4 votes vote down vote up
def train_punctuation(self):
        # Initialize POS graph
        self.punctuation_graph = {} 

        def _add_message_to_punctuation(message):
            score = message[1]
            message = message[0]

            # Remove contractions and potentially other characters
            message = \
                "".join([ch for ch in message if ch not in "'"])

            words = word_tokenize(message)
            tagged_words = pos_tag(words)

            for gram_len in range(1, self.ngram_len+1):
                # The minus one is to ensure that we always have a word
                # right after the gram
                for i in range(len(tagged_words)-gram_len+1):
                    gram = tagged_words[i:i+gram_len]
                    
                    # Turn the gram into a hashable string.
                    tags = " ".join([t[1] for t in gram])

                    next_word = None
                    
                    if i == len(tagged_words) - gram_len:
                        next_word = 'ENDMSG'
                    else:                   
                        # Identify the type of the word that comes after the gram
                        next_word = tagged_words[i+gram_len][1]

                    if tags not in self.punctuation_graph:
                        self.punctuation_graph[tags] = {}

                    if next_word not in self.punctuation_graph[tags]:
                        self.punctuation_graph[tags][next_word] = 0

                    self.punctuation_graph[tags][next_word] += score
                    
        # Need to turn the text into the right format
        messages = self.extract_messages(self.punctuation_dataset, self.user)

        for message in messages:
            _add_message_to_punctuation(message) 
Example 68
Project: chat-simulator   Author: Shikib   File: shikib_bot.py    MIT License 4 votes vote down vote up
def train_style(self):
        # Initialize POS graph
        self.style_graph = {} 

        def _add_message_to_style(message):
            score = message[1]
            message = message[0]

            # Remove contractions and potentially other characters
            message = \
                "".join([ch for ch in message if ch not in "'"])

            words = word_tokenize(message)
            tagged_words = pos_tag(words)

            for gram_len in range(1, self.ngram_len):
                # Slide a window of gram_len grams over the message; the final
                # window position has no following word and maps to the ENDMSG marker
                for i in range(len(tagged_words)-gram_len+1):
                    gram = tagged_words[i:i+gram_len]
                    
                    # Turn the gram into a hashable tuple.
                    words = " ".join([t[0] for t in gram])
                    tags = " ".join([t[1] for t in gram])
                    gram_tuple = (words,tags)

                    if i == len(tagged_words) - gram_len:
                        next_word = ('ENDMSG', 'ENDMSG')
                    else:                   
                        # Identify the type of the word that comes after the gram
                        next_word = tagged_words[i+gram_len]

                    if gram_tuple not in self.style_graph:
                        self.style_graph[gram_tuple] = {}

                    if next_word not in self.style_graph[gram_tuple]:
                        self.style_graph[gram_tuple][next_word] = 0

                    self.style_graph[gram_tuple][next_word] += score
                    
        # Need to turn the text into the right format
        messages = self.extract_messages(self.style_dataset, self.user)

        for message in messages:
            _add_message_to_style(message) 
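
train_style builds the same kind of weighted graph, but keyed on (words, tags) tuples so that generation can condition on the surface words as well. A minimal, hypothetical sampling sketch (not from the project) could look like this:

import random

def sample_next(style_graph, gram_tuple):
    # gram_tuple is a (words, tags) pair of space-joined strings.
    followers = style_graph.get(gram_tuple)
    if not followers:
        return ('ENDMSG', 'ENDMSG')
    candidates, weights = zip(*followers.items())
    # Sample proportionally to the accumulated message scores.
    return random.choices(candidates, weights=weights, k=1)[0]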
Example 69
Project: social_mind   Author: byeongkyu   File: sentence_classifier.py    Apache License 2.0 4 votes vote down vote up
def handle_domain_reply(self, msg):
        sents = self.sent_detector.tokenize(msg.reply.strip())

        msg = ReplyAnalyzed()
        msg.header.stamp = rospy.Time.now()

        for sent in sents:
            # separate tags and text
            sent_tags = re.findall('(%[^}]+%)', sent)
            sent_text = re.sub('(%[^}]+%)', '', sent).strip()

            # if the task manager selected an intent, use it; otherwise use the classifier to select one
            result = ''
            remain_tags = ''
            if not any('sm=' in tag for tag in sent_tags):
                feature = dialogue_act_features(sent_text)
                result = self.classifier.classify(feature)

                if sent_tags != []:
                    remain_tags = sent_tags[0]
            else:
                tag_text = sent_tags[0].strip('{}').split('|')
                matching = [s for s in tag_text if "sm=" in s]
                if len(matching) > 1:
                    rospy.logwarn('Only one sm tag is allowed...')
                result = matching[0].split('=')[1]
                for s in tag_text:
                    if not "sm=" in s:
                        remain_tags += s + '|'
                if remain_tags != '':
                    remain_tags = '{' + remain_tags.rstrip('|') + '}'

            # select entities
            entity = EntitiesIndex()
            for i in pos_tag(word_tokenize(sent_text)):
                if(i[1] in ['RB', 'PRP', 'NN', 'PRP$']):
                    entity.entity.append(i[0])
                    entity.entity_index.append(sent_text.index(i[0]))

            msg.entities.append(entity)
            msg.sents.append(remain_tags + ' ' + sent_text)
            msg.act_type.append(result + '/%d'%len(sent_text))

        self.pub_reply_analyzed.publish(msg) 
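
The classify call above uses a dialogue_act_features function defined elsewhere in the project. A minimal sketch in the style of the NLTK book's dialogue-act example is shown below; the project's actual feature extractor may differ.

import nltk

def dialogue_act_features(post):
    # Bag-of-words features: mark each token as present.
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features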
Example 70
Project: Digital-Aristotle   Author: Gabighz   File: processing.py    GNU General Public License v3.0 4 votes vote down vote up
def pre_processing(raw_data):
    # Converts raw_data to a numpy array
    raw_data = np.array(raw_data, dtype=object)

    # Iterates through raw XML data and concatenates all text to a string
    sentence = ' '.join(raw_data[:, TEXT_INDEX])

    # Converts all words to lowercase to prevent duplication of same word with different cases.
    lowercase_string = sentence.lower()

    # Cleans each word of non-alphanumeric characters
    # e.g. so 'sensors)' and 'sensors' are not considered different words
    filtered_string = re.sub("[^a-zA-Z]", " ", lowercase_string)

    # Further filtering to keep only nouns; thus filtering stopwords as well
    # Also filters out words which have a character count of 1 or less.
    tokens = nltk.word_tokenize(filtered_string)
    tags = nltk.pos_tag(tokens)

    filtered_string = ' '.join([word for word, pos in tags
                                if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')
                                and (len(word) > 1)])

    # Compiles the filtered words to an array which contains
    # each word and its XML features
    filtered_data = []
    clean_words = filtered_string.split()

    # Appends only filtered words from raw data to an array which contains
    # each word and its XML features that were in raw data
    for word_array in raw_data:

        filtered_sentence = ""
        raw_sentence = re.sub("[^a-zA-Z]", " ", word_array[0]).split()

        for i in range(len(raw_sentence)):
            if raw_sentence[i].lower() in clean_words:
                filtered_sentence = filtered_sentence + " " + raw_sentence[i]
        # !to be improved
        if len(filtered_sentence.lstrip()) > 0:
            filtered_data.append([filtered_sentence.lstrip().lower(), word_array[1], word_array[2], word_array[3]])

    return filtered_data


# Computes the F1-score of our classifier
# Takes in a 2D array which contains each observation and their label
# Compares that to the ground truth (correct) value of each observation 
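
For illustration only (not part of the project), the noun filter used in pre_processing behaves roughly like this on a short string; the exact tags depend on the tagger model NLTK loads.

import nltk

tokens = nltk.word_tokenize("wireless sensors measure the ambient temperature")
tags = nltk.pos_tag(tokens)
nouns = [word for word, pos in tags
         if pos in ('NN', 'NNP', 'NNS', 'NNPS') and len(word) > 1]
# Typically yields something like ['sensors', 'temperature'].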
Example 71
Project: pyontutils   Author: tgbugs   File: nltklib.py    MIT License 4 votes vote down vote up
def sentence_similarity(sentence1, sentence2, ignore_integers=False):
    """ compute the sentence similarity using Wordnet """
    # Tokenize and tag
    sentence1 = ' '.join([clean(word) for word in sentence1.split()])
    sentence2 = ' '.join([clean(word) for word in sentence2.split()])
    tokens1 = word_tokenize(sentence1)
    tokens2 = word_tokenize(sentence2)
    tokens1 = clean_tokens(tokens1, ignore_integers)
    tokens2 = clean_tokens(tokens2, ignore_integers)

    # tag
    sentence1 = pos_tag(tokens1)
    sentence2 = pos_tag(tokens2)

    # Get the synsets for the tagged words
    synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1]
    synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2]
    print(synsets1)
    print(synsets2)
    # Filter out the Nones
    synsets1 = [ss for ss in synsets1 if ss]
    synsets2 = [ss for ss in synsets2 if ss]

    score, count = 0.0, 0.0

    # For each word in the first sentence
    for synset1 in synsets1:
        # Get the similarity value of the most similar word in the other sentence
        best_score=[
            wn.path_similarity(synset1, synset2)
            if not isinstance(synset1, str) and not isinstance(synset2, str)
            # just in case there are scientific words wordnet does not have
            else fuzz.ratio(str(synset1), str(synset2)) / 100
            for synset2 in synsets2
        ]
        best_score=[s if s else 0 for s in best_score]
        # print(synsets1, synsets2)
        # Check that the similarity could have been computed
        if best_score:
            score += max(best_score)
            count += 1

    # Average the values
    if count > 0:
        score /= count
    else:
        score = 0

    return score 
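
A hypothetical usage sketch; the helpers clean, clean_tokens, tagged_to_synset, wn and fuzz used above are defined elsewhere in nltklib.py:

score = sentence_similarity("the cat sits on the mat",
                            "a dog lies on the rug",
                            ignore_integers=True)
print(score)  # roughly in the range [0, 1]; higher means more similar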
Example 72
Project: canonicalization-server   Author: hotpxl   File: attribute_mapper.py    MIT License 4 votes vote down vote up
def canonicalize_attribute(text):
    words = common.clean_text(text).split()
    pos_tagged = [i[1] for i in nltk.pos_tag(words)]
    # Remove adverbs.
    if 1 < len(words) and words[0] not in exception_dict and pos_tagged[
            0] == 'RB':
        words.pop(0)
    # Remove common modifiers.
    if 1 < len(words) and (words[0] == 'light' or words[0] == 'dark' or
                           words[0] == 'not'):
        words.pop(0)

    pos = None
    if len(words) == 0:
        return None
    if words[0] in exception_dict:
        logger.info('{} is in exceptions'.format(words[0]))
        return exception_dict[words[0]]
    if words[0][-3:] == 'ing':
        words[0] = nltk.stem.WordNetLemmatizer().lemmatize(words[0],
                                                           wordnet.VERB)
        pos = wordnet.VERB
    elif pos_tagged[0] == 'IN' and 1 < len(words) and words[0] != 'white':
        # Do not handle prepositions.
        return None
    else:
        wn_tagged = wn_tag(words, pos_tagged)
        if 1 < len(words) and set(wn_tagged) == set([wordnet.NOUN]):
            # Most likely random nouns.
            return None
        elif wordnet.synsets(words[0], pos=wordnet.ADJ):
            pos = wordnet.ADJ
        elif wn_tagged and wn_tagged[0] == wordnet.VERB:
            pos = wordnet.VERB
        elif wn_tagged and wn_tagged[0] == wordnet.ADV:
            pos = wordnet.ADV
        elif wordnet.synsets(words[0], pos=wordnet.NOUN):
            pos = wordnet.NOUN
        else:
            # Otherwise most likely not in WordNet or misspelled
            return None
    given = wordnet.synsets(words[0], pos)
    counted = [
        p[0] for p in wordnet_helper.lemma_counter(words[0], pos).most_common()
    ]
    cap = [s for s in given if s in counted]
    if not cap:
        counted.extend(given)
        cap = counted
    selection = [s for s in given if s in cap]
    if selection:
        return selection[0]
    else:
        return None 
Example 73
Project: visual-concepts   Author: s-gupta   File: preprocess.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def get_vocab(imset, coco_caps, punctuations, mapping):
  image_ids = coco_caps.getImgIds()
  image_ids.sort(); t = []

  for i in xrange(len(image_ids)):
    annIds = coco_caps.getAnnIds(image_ids[i]);
    anns = coco_caps.loadAnns(annIds);
    tmp = [pos_tag( word_tokenize( str(a['caption']).lower())) for a in anns]
    t.append(tmp)

  # Make a vocabulary by computing counts of words over the whole dataset.
  t = [t3 for t1 in t for t2 in t1 for t3 in t2]
  t = [(l, 'other') if mapping.get(r) is None else (l, mapping[r]) for (l,r) in t]
  vcb = Counter(elem for elem in t)
  vcb = vcb.most_common()

  # Merge things that are in the same or similar pos
  word = [l for ((l,r),c) in vcb];
  pos = [r for ((l,r),c) in vcb];
  count = [c for ((l,r),c) in vcb];

  poss = [];
  counts = [];
  words = sorted(set(word))
  for j in xrange(len(words)):
    indexes = [i for i,x in enumerate(word) if x == words[j]]
    pos_tmp = [pos[i] for i in indexes]
    count_tmp = [count[i] for i in indexes]
    ind = np.argmax(count_tmp)
    poss.append(pos_tmp[ind])
    counts.append(sum(count_tmp))

  ind = np.argsort(counts)
  ind = ind[::-1]
  words = [words[i] for i in ind]
  poss = [poss[i] for i in ind]
  counts = [counts[i] for i in ind]

  # Remove punctuations
  non_punct = [i for (i,x) in enumerate(words) if x not in punctuations]
  words = [words[i] for i in non_punct]
  counts = [counts[i] for i in non_punct]
  poss = [poss[i] for i in non_punct]

  vocab = {'words': words, 'counts': counts, 'poss': poss};
  return vocab 
Example 74
Project: BuildingMachineLearning   Author: ademyanchuk   File: 04_sent.py    MIT License 4 votes vote down vote up
def _get_sentiments(self, d):
        # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        sent = tuple(nltk.word_tokenize(d))
        if poscache is not None:
            if d in poscache:
                tagged = poscache[d]
            else:
                poscache[d] = tagged = nltk.pos_tag(sent)
        else:
            tagged = nltk.pos_tag(sent)

        pos_vals = []
        neg_vals = []

        nouns = 0.
        adjectives = 0.
        verbs = 0.
        adverbs = 0.

        for w, t in tagged:
            p, n = 0, 0
            sent_pos_type = None
            if t.startswith("NN"):
                sent_pos_type = "n"
                nouns += 1
            elif t.startswith("JJ"):
                sent_pos_type = "a"
                adjectives += 1
            elif t.startswith("VB"):
                sent_pos_type = "v"
                verbs += 1
            elif t.startswith("RB"):
                sent_pos_type = "r"
                adverbs += 1

            if sent_pos_type is not None:
                sent_word = "%s/%s" % (sent_pos_type, w)

                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]

            pos_vals.append(p)
            neg_vals.append(n)

        l = len(sent)
        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)

        return [1 - avg_pos_val - avg_neg_val, avg_pos_val, avg_neg_val,
                nouns / l, adjectives / l, verbs / l, adverbs / l] 
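
The sent_word_net lookup used above maps a "pos/word" key to a (positive, negative) score pair and is built elsewhere in the book's code from the SentiWordNet data. Purely as an approximation (not the book's implementation), similar scores could be obtained from NLTK's sentiwordnet corpus:

from nltk.corpus import sentiwordnet as swn

def senti_scores(word, wn_pos):
    # wn_pos is one of 'n', 'v', 'a', 'r', matching sent_pos_type above.
    synsets = list(swn.senti_synsets(word, pos=wn_pos))
    if not synsets:
        return 0.0, 0.0
    # Average the positive/negative scores over all matching senses.
    p = sum(s.pos_score() for s in synsets) / len(synsets)
    n = sum(s.neg_score() for s in synsets) / len(synsets)
    return p, n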
Example 75
Project: TheSuperQuestionTypeTopicClassifier   Author: AmirAhmadHabibi   File: eurlex_data_maker.py    GNU General Public License v3.0 4 votes vote down vote up
def lemmatise_all():
    id_mappings = pd.read_csv('./EurLex_data/eurlex_ID_mappings.csv', sep='\t')

    lemmatiser = WordNetLemmatizer()
    stop_words = set()
    for w in stopwords.words('english'):
        stop_words.add(w)
    cleaner = re.compile('^\s*-*|-\s*$')

    prog = Progresser(id_mappings.shape[0])

    for i, row in id_mappings.iterrows():
        prog.count()
        try:
            # if file already processed then continue
            if os.path.isfile('./EurLex_data/lem_txt/' + str(row['DocID']) + '-lem.txt'):
                continue

            try:
                with open('./EurLex_data/eurlex_txt/' + str(row['DocID']) + '.txt', 'r', encoding="utf8") as infile:
                    raw_text = infile.read()
            except:
                continue

            lemmatised_doc = ''

            # lemmatise each sentence
            for sent in sent_tokenize(raw_text):
                lemmatised_sent = ''
                tokens_pos = pos_tag(word_tokenize(sent))

                # lemmatise each word in sentence
                for word_pos in tokens_pos:
                    if len(word_pos[0]) < 2: continue

                    word = word_pos[0].lower()
                    word = re.sub(cleaner, '', word)
                    if word in stop_words: continue

                    if len(word) > 2:
                        word = lemmatiser.lemmatize(word=word, pos=get_wordnet_pos(word_pos[1]))
                        if word in stop_words: continue

                    lemmatised_sent += word + ' '
                lemmatised_doc += lemmatised_sent + '\n'
            # write doc to file
            with open('./EurLex_data/lem_txt/' + str(row['DocID']) + '-lem.txt', 'w', encoding="utf8") as outfile:
                outfile.write(lemmatised_doc)
        except Exception as e:
            print(e) 
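
This example and Example 76 below both call a get_wordnet_pos helper that is not shown in this excerpt. A minimal sketch of the usual mapping from Penn Treebank tags to WordNet POS constants (defaulting to noun) is given below; the project's version may differ.

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map the Penn Treebank tag prefix to the corresponding WordNet POS.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN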
Example 76
Project: TheSuperQuestionTypeTopicClassifier   Author: AmirAhmadHabibi   File: stackexch_data_maker.py    GNU General Public License v3.0 4 votes vote down vote up
def find_frequent_words():
    data = pd.read_csv('./StackExchange_data/all_data.csv')
    words = dict()
    lemmatiser = WordNetLemmatizer()
    stop_words = set()
    for w in stopwords.words('english'):
        stop_words.add(w)

    # with open('./StackExchange_data/words50000.csv', 'r') as infile:
    #     for line in infile:
    #         w, _, f = line.partition(',')
    #         words[w] = int(f)

    p = Progresser(data.shape[0])
    cleaner = re.compile('^\s*-*|-\s*$')

    for i, row in data.iterrows():
        # if i <= 50000:
        #     continue
        p.show_progress(i)

        tokens_pos = pos_tag(word_tokenize(row['body']))
        for word_pos in tokens_pos:
            if len(word_pos[0]) < 2:
                continue

            word = word_pos[0].lower()
            word = re.sub(cleaner, '', word)

            if word in stop_words:
                continue

            if len(word) > 2:
                word = lemmatiser.lemmatize(word=word, pos=get_wordnet_pos(word_pos[1]))

            if word not in stop_words:
                if word in words:
                    words[word] += 1
                else:
                    words[word] = 1

        if i % 5000 == 0:
            with open('./StackExchange_data/words' + str(i) + '.csv', 'w') as outfile:
                for word, freq in words.items():
                    outfile.write(word + ',' + str(freq) + '\n')

    sorted_words = sorted(words, key=lambda x: words[x], reverse=True)

    with open('./StackExchange_data/words_frequency.csv', 'w') as outfile:
        for word in sorted_words:
            try:
                outfile.write(str(word) + ',' + str(words[word]) + '\n')
            except:
                pass

    with open('./StackExchange_data/1000words.csv', 'w') as outfile:
        for word in sorted_words[:1000]:
            outfile.write(str(word) + '\n') 
Example 77
Project: question_answering   Author: joswinkj   File: AnswerProcessing.py    Apache License 2.0 4 votes vote down vote up
def get_rake_based_answ(self,answer_type):
        top = 0
        # rr = Rake()
        for sent_id,sent in enumerate(self.answers):
            # pdb.set_trace()
            # sent_bck = re.sub(r',', '', sent)  # not needed: commas are sometimes meaningful,
            # e.g. in "Japanese surrendered on September 2, ending World War II" the rake phrase
            # would otherwise be "september 2 ending world war" without the comma
            pos_to_insert = sent[self.answers_bestpos[sent_id]:].find(' ')
            if pos_to_insert != -1:
                tmp = sent[:self.answers_bestpos[sent_id]+pos_to_insert]+' qsxwdc '+\
                      sent[self.answers_bestpos[sent_id]+pos_to_insert:]
            else:
                tmp = sent + ' qsxwdc '
            tmp = re.sub(r',', ' ',tmp)
            tmp1 = pos_tag(word_tokenize(tmp))
            tmp2=AnswerProcessor.stanford_NER_tagger(tmp)[0]
            tmp3=AnswerProcessor.PosToNer([tmp1],[[tmp2]])[0]
            ans_candidates=[]
            ans_pos=[]
            ref_pos=[ind for ind,(word,tag) in enumerate(tmp3) if word=='qsxwdc']
            ref_pos = ref_pos[0]
            tmp3=[(wrdd,tgg) for wrdd,tgg in tmp3 if wrdd!='qsxwdc']
            # for each word with the required answer type, record its position, then find the one
            # closest to the reference position (the position of 'qsxwdc')
            for ind,(word,tag) in enumerate(tmp3):
                if tag==answer_type:
                    ans_candidates.append(word)
                    ans_pos.append(ind)
            if len(ans_pos)==0:
                continue
            # pdb.set_trace()
            ans_pos_relative = [abs(i-ref_pos) for i in ans_pos]
            # ans_pos_relative.sort()
            ans_ind_min = ans_pos_relative.index(min(ans_pos_relative))  # index with minimum distance from ref_pos;
                                                                         # the final answer should include this position
            ans_ind_max=ans_ind_min+1
            tmp=ans_pos[:]
            tmp.sort()
            for i in tmp:
                if i==tmp[ans_ind_min]+1:
                    ans_ind_max += 1
            tmp.sort(reverse=True)
            for i in tmp:
                if i==ans_pos[ans_ind_min]-1:
                    ans_ind_min -= 1

            ret_word=ans_candidates[ans_ind_min:ans_ind_max]
            if top==0:
                ret_word_top=ret_word
                ret_ind = sent_id
            # pdb.set_trace()
            tmp = rr.run(sent.lower())
            rake_exp = [ph for ph,scr in tmp if ' '.join(ret_word).lower() in ph]
            # pdb.set_trace()
            if len(rake_exp)==0:
                rake_exp.append('No Rake Expression')
            print ' '.join(ret_word) + ' -Rake phrase: '+rake_exp[0] +', Full Sentence: '+sent
        return ret_ind,ret_word_top 
Example 78
Project: gender_analysis   Author: dhmit   File: dunning.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def dunning_result_to_dict(dunning_result,
                           number_of_terms_to_display=10,
                           part_of_speech_to_include=None):
    """
    Receives a dictionary of results and returns a dictionary of the top
    number_of_terms_to_display most distinctive results for each corpus that have a part of speech
    matching part_of_speech_to_include

    :param dunning_result: Dunning result dict that will be sorted through
    :param number_of_terms_to_display: Number of terms for each corpus to display
    :param part_of_speech_to_include: 'adjectives', 'adverbs', 'verbs', or 'pronouns'
    :return: dict

    """

    pos_names_to_tags = {
        'adjectives': ['JJ', 'JJR', 'JJS'],
        'adverbs': ['RB', 'RBR', 'RBS', 'WRB'],
        'verbs': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        'pronouns': ['PRP', 'PRP$', 'WP', 'WP$']
    }
    if part_of_speech_to_include in pos_names_to_tags:
        part_of_speech_to_include = pos_names_to_tags[part_of_speech_to_include]

    final_results_dict = {}

    reverse = True
    for i in range(2):
        sorted_results = sorted(dunning_result.items(), key=lambda x: x[1]['dunning'],
                                    reverse=reverse)
        count_displayed = 0
        for result in sorted_results:
            if count_displayed == number_of_terms_to_display:
                break
            term = result[0]
            term_pos = nltk.pos_tag([term])[0][1]
            if part_of_speech_to_include and term_pos not in part_of_speech_to_include:
                continue

            final_results_dict[result[0]] = result[1]
            count_displayed += 1
        reverse = False
    return final_results_dict


################################################################################
# Visualizers
################################################################################ 
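
A hypothetical usage sketch, assuming (as the sorting key above implies) that dunning_result maps each term to a dict containing at least a 'dunning' score:

fake_result = {
    'quick': {'dunning': 3.2},
    'quickly': {'dunning': -2.7},
    'run': {'dunning': 1.1},
}
top_terms = dunning_result_to_dict(fake_result,
                                   number_of_terms_to_display=2,
                                   part_of_speech_to_include='adverbs')
# Only terms tagged as adverbs (e.g. 'quickly') survive the filter.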
Example 79
Project: gender_analysis   Author: dhmit   File: gender_adjective.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def find_gender_adj(document, female):
    """
    Takes in a document and a boolean indicating gender, and returns a dictionary of adjectives
    that appear within a window of five words on either side of each gendered pronoun

    :param: document: Document
    :param: female: boolean indicating whether to search for female adjectives (true) or male adjectives (false)
    :return: dictionary of adjectives that appear around pronouns mapped to the number of occurrences

    >>> from gender_analysis import document
    >>> from pathlib import Path
    >>> from gender_analysis import common
    >>> document_metadata = {'author': 'Hawthorne, Nathaniel', 'title': 'Scarlet Letter', 'date': '1966',
    ...                   'filename': 'test_text_7.txt', 'filepath': Path(common.TEST_DATA_PATH, 'document_test_files', 'test_text_7.txt')}
    >>> scarlett = document.Document(document_metadata)
    >>> find_gender_adj(scarlett, False)
    {'handsome': 3, 'sad': 1}

    """
    output = {}
    text = document.get_tokenized_text()

    if female:
        distances = female_instance_dist(document)
        pronouns1 = common.FEM_WORDS
        pronouns2 = common.MASC_WORDS
    else:
        distances = male_instance_dist(document)
        pronouns1 = common.MASC_WORDS
        pronouns2 = common.FEM_WORDS
    if len(distances) == 0:
        return {}
    elif len(distances) <= 3:
        lower_window_bound = 5
    else:
        lower_window_bound = median(sorted(distances)[:int(len(distances) / 2)])

    if not lower_window_bound >= 5:
        return "lower window bound less than 5"
    for l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11 in windowed(text, 11):
        l6 = l6.lower()
        if not l6 in pronouns1:
            continue
        words = [l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11]
        if bool(set(words) & set(pronouns2)):
            continue
        for index, word in enumerate(words):
            words[index] = word.lower()
        tags = nltk.pos_tag(words)
        for tag_index, tag in enumerate(tags):
            if tags[tag_index][1] == "JJ" or tags[tag_index][1] == "JJR" or tags[tag_index][1] == "JJS":
                word = words[tag_index]
                if word in output.keys():
                    output[word] += 1
                else:
                    output[word] = 1
    return output 
Example 80
Project: Building-Machine-Learning-Systems-With-Python-Second-Edition   Author: PacktPublishing   File: 04_sent.py    MIT License 4 votes vote down vote up
def _get_sentiments(self, d):
        # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        sent = tuple(nltk.word_tokenize(d))
        if poscache is not None:
            if d in poscache:
                tagged = poscache[d]
            else:
                poscache[d] = tagged = nltk.pos_tag(sent)
        else:
            tagged = nltk.pos_tag(sent)

        pos_vals = []
        neg_vals = []

        nouns = 0.
        adjectives = 0.
        verbs = 0.
        adverbs = 0.

        for w, t in tagged:
            p, n = 0, 0
            sent_pos_type = None
            if t.startswith("NN"):
                sent_pos_type = "n"
                nouns += 1
            elif t.startswith("JJ"):
                sent_pos_type = "a"
                adjectives += 1
            elif t.startswith("VB"):
                sent_pos_type = "v"
                verbs += 1
            elif t.startswith("RB"):
                sent_pos_type = "r"
                adverbs += 1

            if sent_pos_type is not None:
                sent_word = "%s/%s" % (sent_pos_type, w)

                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]

            pos_vals.append(p)
            neg_vals.append(n)

        l = len(sent)
        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)

        return [1 - avg_pos_val - avg_neg_val, avg_pos_val, avg_neg_val,
                nouns / l, adjectives / l, verbs / l, adverbs / l]