Python nltk.tag Examples

The following are 30 code examples showing how to use the nltk.tag module. They are extracted from open source projects; you can go to the original project or source file by following the links above each example.

You may also want to check out all available functions and classes of the module nltk.
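
Before diving into the project snippets, here is a minimal sketch of the most common entry point in nltk.tag, the module-level pos_tag function; it assumes the punkt and averaged_perceptron_tagger resources have already been downloaded.

import nltk
from nltk import pos_tag, word_tokenize

# One-time setup if the resources are not yet present (assumption):
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
tokens = word_tokenize("NLTK makes part-of-speech tagging easy.")
print(pos_tag(tokens))  # e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...]
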

Example 1
Project: razzy-spinner   Author: rafasashi   File: glue.py    License: GNU General Public License v3.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 2
Project: razzy-spinner   Author: rafasashi   File: glue.py    License: GNU General Public License v3.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 3
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def extract_JK(pos_seq):
	"""The 'JK' method in Handler et al. 2016.
	Returns token positions of valid ngrams."""

	def find_ngrams(input_list, num_):
		'''get ngrams of len n from input list'''
		return zip(*[input_list[i:] for i in range(num_)])

	# copied from M and S chp 5
	patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
	pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
	pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
	ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

	def stringify(s):
		return "".join(a[1] for a in s)

	def positionify(s):
		return tuple(a[0] for a in s)

	ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
	return [set(positionify(n)) for n in ngrams]


######## 
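
To see what the pattern filter above keeps, here is a small self-contained sketch that mirrors its logic on a toy coarse-tag sequence (the real tag2coarse mapping is defined elsewhere in phrasemachine; here 'A' stands for adjective and 'N' for noun):

def find_ngrams(input_list, num_):
    return zip(*[input_list[i:] for i in range(num_)])

patterns = {'AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'}
coarse = ['A', 'N', 'N']                      # e.g. JJ NN NN under tag2coarse
indexed = list(enumerate(coarse))
ngrams = [g for n in range(1, 4) for g in find_ngrams(indexed, n)]
keep = [g for g in ngrams if ''.join(t for _, t in g) in patterns]
print([set(i for i, _ in g) for g in keep])   # [{0, 1}, {1, 2}, {0, 1, 2}]
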
Example 4
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
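
The constructor above loads the perceptron model from a pickle bundled with scattertext so the NLTK downloader can be skipped. For comparison, a minimal sketch of the default route, letting NLTK locate the model it downloaded itself (assumes nltk.download('averaged_perceptron_tagger') has been run once):

from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()  # loads the default pretrained model
tokens = TreebankWordTokenizer().tokenize("The quick brown fox jumps.")
print(tagger.tag(tokens))    # e.g. [('The', 'DT'), ('quick', 'JJ'), ...]
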
Example 5
Project: idamagicstrings   Author: joxeankoret   File: IDAMagicStrings.py    License: GNU Affero General Public License v3.0
def nltk_preprocess(strings):
  if not has_nltk:
    return

  strings = "\n".join(map(str, list(strings)))
  tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
  l = []
  for token in tokens:
    l.append(token[0])
  word_tags = nltk.pos_tag(l)
  for word, tag in word_tags:
    try:
      FOUND_TOKENS[word.lower()].add(tag)
    except KeyError:
      FOUND_TOKENS[word.lower()] = set([tag])

#------------------------------------------------------------------------------- 
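
The try/except accumulation in nltk_preprocess is the classic pattern that collections.defaultdict replaces; a small equivalent sketch (the token/tag pairs below are made up for illustration):

from collections import defaultdict

FOUND_TOKENS = defaultdict(set)
for word, tag in [("alloc", "NN"), ("free", "JJ"), ("alloc", "VB")]:
    FOUND_TOKENS[word.lower()].add(tag)
print(dict(FOUND_TOKENS))  # e.g. {'alloc': {'NN', 'VB'}, 'free': {'JJ'}}
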
Example 6
Project: luscan-devel   Author: blackye   File: glue.py    License: GNU General Public License v2.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
#            raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
            )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 7
Project: luscan-devel   Author: blackye   File: glue.py    License: GNU General Public License v2.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        semtype_name = None

        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 8
Project: nlp-services   Author: singnet   File: entity_recognizer_mod.py    License: MIT License
def bio_tagger(self, ne_tagged):
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O":  # O
                bio_tagged.append((token, tag))
                prev_tag = tag
                continue
            if tag != "O" and prev_tag == "O":  # Begin NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag == tag:  # Inside NE
                bio_tagged.append((token, "I-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
        return bio_tagged

    # Create tree 
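
A hypothetical input/output pair for bio_tagger above; the tagged input mimics what a Stanford NER tagger would return, and the conversion adds B-/I- prefixes:

ne_tagged = [("Barack", "PERSON"), ("Obama", "PERSON"),
             ("visited", "O"), ("Paris", "LOCATION")]
# bio_tagger(ne_tagged) would yield:
# [("Barack", "B-PERSON"), ("Obama", "I-PERSON"),
#  ("visited", "O"), ("Paris", "B-LOCATION")]
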
Example 9
Project: phrasemachine   Author: slanglab   File: phrasemachine.py    License: MIT License
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.  
    Returns token positions of valid ngrams."""

    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag,'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]
    def stringify(s):
        return "".join(a[1] for a in s)
    def positionify(s):
        return tuple(a[0] for a in s)
    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

######## 
Example 10
Project: phrasemachine   Author: slanglab   File: phrasemachine.py    License: MIT License
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html 
Example 11
Project: NeuronBlocks   Author: microsoft   File: EnglishPOSTagger.py    License: MIT License
def postag_multi(self, multi_sentence):
        """ tag multiple sentences one time
        RECOMMAND! Because the efficiency of stanford pos tagger in NLTK is too slow.
        Args:
            multi_sentence: [[token1, token2], ..., [...]]
        Returns:
        """
        #word_pos_pairs_multi_sent = self.eng_tagger.tag_sents(multi_sentence)
        '''
        word_pos_pairs_multi_sent = pos_tag_sents(multi_sentence)
        pos_lists = []
        for word_pos_pairs in word_pos_pairs_multi_sent:
            pos_lists.append([pos for (word, pos) in word_pos_pairs])
        return pos_lists
        '''
        return [self.postag(sent) for sent in multi_sentence] 
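
The commented-out branch above relies on batch tagging; a minimal hedged sketch of the corresponding public API, nltk.pos_tag_sents, using NLTK's default tagger instead of the Stanford one (assumes the averaged_perceptron_tagger resource is available):

from nltk import pos_tag_sents

sentences = [["I", "love", "Python"], ["NLTK", "tags", "tokens"]]
tagged_sents = pos_tag_sents(sentences)
pos_lists = [[pos for (_, pos) in sent] for sent in tagged_sents]
print(pos_lists)  # e.g. [['PRP', 'VBP', 'NNP'], ['NNP', 'VBZ', 'NNS']]
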
Example 12
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: glue.py    License: Creative Commons Attribution Share Alike 4.0 International
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % (node['word'], node['tag'], node['rel'])
            )

        return self.get_glueformulas_from_semtype_entry(
            lookup, node['word'], node, depgraph, counter
        ) 
Example 13
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: glue.py    License: Creative Commons Attribution Share Alike 4.0 International
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 14
Project: razzy-spinner   Author: rafasashi   File: relextract.py    License: GNU General Public License v3.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings or plain words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
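
nltk.tag.tuple2str, used in the fallback branch above, renders a (word, tag) pair as a single word/tag token; a quick sketch:

from nltk.tag import tuple2str

print(tuple2str(("dog", "NN")))  # dog/NN
print(" ".join(tuple2str(t) for t in [("the", "DT"), ("dog", "NN")]))  # the/DT dog/NN
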
Example 15
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def coarse_tag_str(pos_seq):
	"""Convert POS sequence to our coarse system, formatted as a string."""
	global tag2coarse
	tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
	return ''.join(tags)


# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)] 
Example 16
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def tag_text(self, text):
		'''take input text and return tokens w/ part of speech tags using NLTK'''
		# putting import here instead of top of file b.c. not all will have nltk installed

		sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
		word_pos_pairs = []

		all_tokens = []
		for sent in sents:
			tokens = self.tokenize(sent)
			all_tokens = all_tokens + tokens
			word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
		return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]} 
Example 17
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    License: Apache License 2.0
def tag_tokens(self, tokens):
		word_pos_pairs = self.tagger.tag(tokens)
		return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]} 
Example 18
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    License: MIT License
def load_xml(self, xmldir):
        '''
        for KDD/WWW/UMD only
        :return: doclist
        '''
        for filename in os.listdir(xmldir):
            with open(xmldir+filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.xml')]

                import string
                printable = set(string.printable)

                # print((filename))
                try:
                    lines = textfile.readlines()
                    xml = ''.join(''.join(filter(lambda x: x in printable, l)) for l in lines)
                    root = ET.fromstring(xml)

                    doc.title = root.findall("title")[0].text
                    doc.abstract = root.findall("abstract")[0].text
                    doc.phrases = [n.text for n in root.findall("*/tag")]

                    self.doclist.append(doc)

                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename ) 
Example 19
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    License: MIT License
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: '  + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records) , len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source 
Example 20
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    License: MIT License
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in range(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source 
Example 21
Project: address_extraction   Author: bagrii   File: address_extract.py    License: MIT License
def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets) 
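
conlltags2tree, imported from nltk.chunk in the original file, turns the (word, tag, IOB) triplets into an nltk.Tree; a hedged sketch with made-up triplets:

from nltk.chunk import conlltags2tree

iob_triplets = [("New", "NNP", "B-GPE"), ("York", "NNP", "I-GPE"),
                ("is", "VBZ", "O"), ("big", "JJ", "O")]
print(conlltags2tree(iob_triplets))
# (S (GPE New/NNP York/NNP) is/VBZ big/JJ)
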
Example 22
Project: luscan-devel   Author: blackye   File: relextract.py    License: GNU General Public License v2.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tagged tuples into tag strings or plain words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
Example 23
Project: nlp-services   Author: singnet   File: entity_recognizer_mod.py    License: MIT License
def stanford_tagger(self, token_text):
        st = StanfordNERTagger(self.english_model, self.stanford_jar, encoding='utf-8')
        ne_tagged = st.tag(token_text)
        return (ne_tagged)

    # NLTK POS and NER taggers 
Example 24
Project: phrasemachine   Author: slanglab   File: phrasemachine.py    License: MIT License
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag,'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)] 
Example 25
Project: phrasemachine   Author: slanglab   File: phrasemachine.py    License: MIT License
def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # putting import here instead of top of file b.c. not all will have nltk installed
        
        sents = self.sent_detector.tokenize(text)    # TODO: this will fail on some unicode chars. I think assumes ascii
        word_pos_pairs = []

        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {'tokens': all_tokens, 'pos': [tag for (w,tag) in word_pos_pairs]} 
Example 26
Project: phrasemachine   Author: slanglab   File: phrasemachine.py    License: MIT License
def tag_tokens(self, tokens):

        word_pos_pairs = self.tagger.tag(tokens)
        return {'tokens': tokens, 'pos': [tag for (w,tag) in word_pos_pairs]} 
Example 27
Project: textfool   Author: bogdan-kulynych   File: paraphrase.py    License: MIT License
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos 
Example 28
Project: textfool   Author: bogdan-kulynych   File: paraphrase.py    License: MIT License
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if  (len(synonym.text.split()) > 2) or \
        (synonym.lemma == token.lemma) or \
        (synonym.tag != token.tag) or \
        (token.text.lower() == 'be'):
        return False
    else:
        return True 
Example 29
Project: NeuronBlocks   Author: microsoft   File: EnglishPOSTagger.py    License: MIT License
def postag(self, word_list):
        """
        Args:
            word_list:  word list
        Returns:
            pos tag list
        """
        #word_pos_pairs = self.eng_tagger.tag(word_list)
        
        #word_pos_pairs = pos_tag(word_list)
        word_pos_pairs = nltk.tag._pos_tag(word_list, None, self.eng_tagger)
        pos_list = [pos for (word, pos) in word_pos_pairs]
        return pos_list 
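
nltk.tag._pos_tag is a private helper whose signature has changed across NLTK releases (newer versions also take a lang argument), so unless you need to reuse a preloaded tagger instance the public pos_tag function is the safer route; a minimal sketch assuming the averaged_perceptron_tagger resource is installed:

from nltk import pos_tag

word_list = ["Colorless", "green", "ideas", "sleep", "furiously"]
pos_list = [pos for (_, pos) in pos_tag(word_list)]
print(pos_list)  # e.g. ['NNP', 'JJ', 'NNS', 'VBP', 'RB']
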
Example 30
Project: Densely-Interactive-Inference-Network   Author: YichenGong   File: data_processing.py    License: Apache License 2.0
def parse_to_pos_vector(parse, left_padding_and_cropping_pair = (0,0)): # ONE HOT
    pos = parsing_parse(parse)
    pos_vector = [POS_dict.get(tag,0) for tag in pos]
    left_padding, left_cropping = left_padding_and_cropping_pair
    vector = np.zeros((FIXED_PARAMETERS["seq_length"],len(POS_Tagging)))
    assert left_padding == 0 or left_cropping == 0

    for i in range(FIXED_PARAMETERS["seq_length"]):
        if i < len(pos_vector):
            vector[i + left_padding, pos_vector[i + left_cropping]] = 1
        else:
            break
    return vector