Python nltk.tag Examples

The following are 30 code examples of the nltk.tag module, drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
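For orientation, here is a minimal sketch of the module's most common entry point, pos_tag (this assumes the 'punkt' and 'averaged_perceptron_tagger' resources have already been fetched with nltk.download; exact resource names vary slightly across NLTK versions):

import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# one-time setup:
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
tokens = word_tokenize("NLTK makes part-of-speech tagging easy.")
print(pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ('part-of-speech', 'JJ'), ('tagging', 'NN'), ('easy', 'JJ'), ('.', '.')]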
Example #1
Source File: glue.py    From razzy-spinner with GNU General Public License v3.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example #2
Source File: phrasemachine.py    From phrasemachine with MIT License
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html 
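For comparison, the stock PerceptronTagger can also be constructed without bundled pickles, in which case it loads NLTK's own pretrained model; a minimal sketch (assumes the 'averaged_perceptron_tagger' resource is installed):

from nltk.tag import PerceptronTagger

tagger = PerceptronTagger()  # load=True by default: fetches the pretrained model via nltk.data
print(tagger.tag(['a', 'quick', 'brown', 'fox']))
# e.g. [('a', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN')]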
Example #3
Source File: phrasemachine.py    From phrasemachine with MIT License
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.  
    Returns token positions of valid ngrams."""

    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag,'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]
    def stringify(s):
        return "".join(a[1] for a in s)
    def positionify(s):
        return tuple(a[0] for a in s)
    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

######## 
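extract_JK depends on a module-level tag2coarse mapping from Penn Treebank tags to coarse labels, which is defined elsewhere in phrasemachine; a self-contained sketch of the same n-gram pattern matching, with a small hypothetical mapping:

def find_ngrams(input_list, num_):
    return list(zip(*[input_list[i:] for i in range(num_)]))

# hypothetical coarse mapping: adjectives -> 'A', nouns -> 'N', prepositions -> 'P'
tag2coarse = {'JJ': 'A', 'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'IN': 'P'}
patterns = {'AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'}

pos_seq = ['VB', 'JJ', 'NN', 'NN']                        # a toy POS sequence
coarse = [(i, tag2coarse.get(t, 'O')) for i, t in enumerate(pos_seq)]
ngrams = [g for n in range(1, 4) for g in find_ngrams(coarse, n)]
matches = [g for g in ngrams if ''.join(p for _, p in g) in patterns]
print([tuple(i for i, _ in g) for g in matches])          # [(1, 2), (2, 3), (1, 2, 3)]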
Example #4
Source File: entity_recognizer_mod.py    From nlp-services with MIT License
def bio_tagger(self, ne_tagged):
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O":  # O
                bio_tagged.append((token, tag))
                prev_tag = tag
                continue
            if tag != "O" and prev_tag == "O":  # Begin NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag == tag:  # Inside NE
                bio_tagged.append((token, "I-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
        return bio_tagged

    # Create tree 
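The method above converts flat NE output into BIO labels; a standalone sketch of the same logic with sample named-entity tags (PERSON/LOCATION, as produced by the Stanford tagger used elsewhere in this module):

def bio_tag(ne_tagged):
    bio, prev = [], 'O'
    for token, tag in ne_tagged:
        if tag == 'O':
            bio.append((token, 'O'))
        elif prev == 'O' or prev != tag:
            bio.append((token, 'B-' + tag))   # first token of an entity, or start of an adjacent entity
        else:
            bio.append((token, 'I-' + tag))   # continuation of the same entity
        prev = tag
    return bio

print(bio_tag([('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'), ('Paris', 'LOCATION')]))
# [('Barack', 'B-PERSON'), ('Obama', 'I-PERSON'), ('visited', 'O'), ('Paris', 'B-LOCATION')]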
Example #5
Source File: glue.py    From luscan-devel with GNU General Public License v2.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        semtype_name = None

        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example #6
Source File: glue.py    From luscan-devel with GNU General Public License v2.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
#            raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s'"
                " with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
            )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example #7
Source File: IDAMagicStrings.py    From idamagicstrings with GNU Affero General Public License v3.0
def nltk_preprocess(strings):
  if not has_nltk:
    return

  strings = "\n".join(map(str, list(strings)))
  tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
  l = []
  for token in tokens:
    l.append(token[0])
  word_tags = nltk.pos_tag(l)
  for word, tag in word_tags:
    try:
      FOUND_TOKENS[word.lower()].add(tag)
    except:
      FOUND_TOKENS[word.lower()] = set([tag])

#------------------------------------------------------------------------------- 
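The try/except accumulation into FOUND_TOKENS can be expressed more compactly with a defaultdict; a sketch of the same idea:

import nltk
from collections import defaultdict

found_tokens = defaultdict(set)
for word, tag in nltk.pos_tag(['read', 'file', 'contents']):
    found_tokens[word.lower()].add(tag)
print(dict(found_tokens))   # e.g. {'read': {'VB'}, 'file': {'NN'}, 'contents': {'NNS'}}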
Example #8
Source File: EnglishPOSTagger.py    From NeuronBlocks with MIT License
def postag_multi(self, multi_sentence):
        """ tag multiple sentences one time
        RECOMMAND! Because the efficiency of stanford pos tagger in NLTK is too slow.
        Args:
            multi_sentence: [[token1, token2], ..., [...]]
        Returns:
        """
        #word_pos_pairs_multi_sent = self.eng_tagger.tag_sents(multi_sentence)
        '''
        word_pos_pairs_multi_sent = pos_tag_sents(multi_sentence)
        pos_lists = []
        for word_pos_pairs in word_pos_pairs_multi_sent:
            pos_lists.append([pos for (word, pos) in word_pos_pairs])
        return pos_lists
        '''
        return [self.postag(sent) for sent in multi_sentence] 
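The commented-out alternative above uses pos_tag_sents, NLTK's batch interface; a minimal sketch:

from nltk.tag import pos_tag_sents

sentences = [['The', 'cat', 'sat'], ['Dogs', 'bark']]
tagged = pos_tag_sents(sentences)
print([[pos for (word, pos) in sent] for sent in tagged])
# e.g. [['DT', 'NN', 'VBD'], ['NNS', 'VBP']]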
Example #9
Source File: glue.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" % (node['word'], node['tag'], node['rel'])
            )

        return self.get_glueformulas_from_semtype_entry(
            lookup, node['word'], node, depgraph, counter
        ) 
Example #10
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
Example #11
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def extract_JK(pos_seq):
	"""The 'JK' method in Handler et al. 2016.
	Returns token positions of valid ngrams."""

	def find_ngrams(input_list, num_):
		'''get ngrams of len n from input list'''
		return zip(*[input_list[i:] for i in range(num_)])

	# copied from M and S chp 5
	patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
	pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
	pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
	ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

	def stringify(s):
		return "".join(a[1] for a in s)

	def positionify(s):
		return tuple(a[0] for a in s)

	ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
	return [set(positionify(n)) for n in ngrams]


######## 
Example #12
Source File: glue.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example #13
Source File: glue.py    From razzy-spinner with GNU General Public License v3.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example #14
Source File: data_processing.py    From Densely-Interactive-Inference-Network with Apache License 2.0
def generate_pos_feature_tensor(parses, left_padding_and_cropping_pairs):
    pos_vectors = []
    for parse in parses:
        pos = parsing_parse(parse)
        pos_vector = [(idx, POS_dict.get(tag, 0)) for idx, tag in enumerate(pos)]
        pos_vectors.append(pos_vector)

    return construct_one_hot_feature_tensor(pos_vectors, left_padding_and_cropping_pairs, 2, column_size=len(POS_Tagging)) 
Example #15
Source File: data_processing.py    From Densely-Interactive-Inference-Network with Apache License 2.0
def parse_to_pos_vector(parse, left_padding_and_cropping_pair = (0,0)): # ONE HOT
    pos = parsing_parse(parse)
    pos_vector = [POS_dict.get(tag,0) for tag in pos]
    left_padding, left_cropping = left_padding_and_cropping_pair
    vector = np.zeros((FIXED_PARAMETERS["seq_length"],len(POS_Tagging)))
    assert left_padding == 0 or left_cropping == 0

    for i in range(FIXED_PARAMETERS["seq_length"]):
        if i < len(pos_vector):
            vector[i + left_padding, pos_vector[i + left_cropping]] = 1
        else:
            break
    return vector 
Example #16
Source File: data_processing.py    From Densely-Interactive-Inference-Network with Apache License 2.0
def generate_quora_pos_feature_tensor(parses, left_padding_and_cropping_pairs):
    pos_vectors = []
    for parse in parses:
        pos = parse.split()
        pos_vector = [(idx, POS_dict.get(tag, 0)) for idx, tag in enumerate(pos)]
        pos_vectors.append(pos_vector)

    return construct_one_hot_feature_tensor(pos_vectors, left_padding_and_cropping_pairs, 2, column_size=len(POS_Tagging)) 
Example #17
Source File: EnglishPOSTagger.py    From NeuronBlocks with MIT License
def postag(self, word_list):
        """
        Args:
            word_list:  word list
        Returns:
            pos tag list
        """
        #word_pos_pairs = self.eng_tagger.tag(word_list)
        
        #word_pos_pairs = pos_tag(word_list)
        word_pos_pairs = nltk.tag._pos_tag(word_list, None, self.eng_tagger)
        pos_list = [pos for (word, pos) in word_pos_pairs]
        return pos_list 
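Note that nltk.tag._pos_tag is a private helper whose signature has changed across NLTK versions; a sketch of the equivalent public-API call, reusing one preloaded tagger to avoid repeated model loads:

from nltk.tag import PerceptronTagger

tagger = PerceptronTagger()                  # load the model once

def postag(word_list):
    return [pos for (word, pos) in tagger.tag(word_list)]

print(postag(['tag', 'these', 'words']))     # e.g. ['VB', 'DT', 'NNS']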
Example #18
Source File: paraphrase.py    From textfool with MIT License
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if  (len(synonym.text.split()) > 2) or \
        (synonym.lemma == token.lemma) or \
        (synonym.tag != token.tag) or \
        (token.text.lower() == 'be'):
        return False
    else:
        return True 
Example #19
Source File: paraphrase.py    From textfool with MIT License
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos 
Example #20
Source File: phrasemachine.py    From phrasemachine with MIT License
def tag_tokens(self, tokens):

        word_pos_pairs = self.tagger.tag(tokens)
        return {'tokens': tokens, 'pos': [tag for (w,tag) in word_pos_pairs]} 
Example #21
Source File: phrasemachine.py    From phrasemachine with MIT License
def tag_text(self, text):
        '''take input text and return tokens w/ part of speech tags using NLTK'''
        # putting import here instead of top of file b.c. not all will have nltk installed
        
        sents = self.sent_detector.tokenize(text)    # TODO: this will fail on some unicode chars. I think assumes ascii
        word_pos_pairs = []

        all_tokens = []
        for sent in sents:
            tokens = self.tokenize(sent)
            all_tokens = all_tokens + tokens
            word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
        return {'tokens': all_tokens, 'pos': [tag for (w,tag) in word_pos_pairs]} 
Example #22
Source File: relextract.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tag tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str

        return sep.join(tuple2str(tup) for tup in lst) 
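tuple2str formats a (word, tag) pair as a single token string, with '/' as the default separator:

from nltk.tag import tuple2str

print(tuple2str(('book', 'NN')))             # book/NN
print(tuple2str(('book', 'NN'), sep='|'))    # book|NN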
Example #23
Source File: phrasemachine.py    From phrasemachine with MIT License
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag,'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)] 
Example #24
Source File: entity_recognizer_mod.py    From nlp-services with MIT License
def stanford_tagger(self, token_text):
        st = StanfordNERTagger(self.english_model, self.stanford_jar, encoding='utf-8')
        ne_tagged = st.tag(token_text)
        return (ne_tagged)

    # NLTK POS and NER taggers 
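StanfordNERTagger needs a local copy of the Stanford NER jar and a classifier model, plus Java on the path; a sketch with hypothetical relative paths:

from nltk.tag import StanfordNERTagger

# hypothetical paths into a locally downloaded Stanford NER distribution
st = StanfordNERTagger('classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner.jar', encoding='utf-8')
print(st.tag(['Barack', 'Obama', 'visited', 'Paris']))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'), ('Paris', 'LOCATION')]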
Example #25
Source File: relextract.py    From luscan-devel with GNU General Public License v2.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tag tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        if untag:
            return join([tup[0] for tup in lst], sep=sep)
        from nltk.tag import tuple2str
        return join([tuple2str(tup) for tup in lst], sep=sep) 
Example #26
Source File: address_extract.py    From address_extraction with MIT License
def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets) 
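conlltags2tree (available from nltk.chunk) turns the (word, tag, IOB) triplets into an nltk.Tree; a minimal sketch:

from nltk.chunk import conlltags2tree

triplets = [('Jane', 'NNP', 'B-PERSON'), ('lives', 'VBZ', 'O'),
            ('in', 'IN', 'O'), ('Oslo', 'NNP', 'B-GPE')]
print(conlltags2tree(triplets))
# (S (PERSON Jane/NNP) lives/VBZ in/IN (GPE Oslo/NNP))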
Example #27
Source File: keyphrase_test_dataset.py    From seq2seq-keyphrase with MIT License
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source 
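The core of the function above is the StanfordPOSTagger call itself; a stripped-down sketch with hypothetical local paths (and Java installed):

from nltk.tag import StanfordPOSTagger

# hypothetical paths into a locally downloaded stanford-postagger distribution
pos_tagger = StanfordPOSTagger('models/english-bidirectional-distsim.tagger',
                               'stanford-postagger.jar')
print(pos_tagger.tag('What is the airspeed of an unladen swallow ?'.split()))
# e.g. [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ..., ('swallow', 'VB'), ('?', '.')]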
Example #28
Source File: keyphrase_test_dataset.py    From seq2seq-keyphrase with MIT License
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: '  + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records) , len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source 
Example #29
Source File: keyphrase_test_dataset.py    From seq2seq-keyphrase with MIT License
def load_xml(self, xmldir):
        '''
        for KDD/WWW/UMD only
        :return: doclist
        '''
        for filename in os.listdir(xmldir):
            with open(xmldir+filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.xml')]

                import string
                printable = set(string.printable)

                # print((filename))
                try:
                    lines = textfile.readlines()
                    xml = ''.join([filter(lambda x: x in printable, l) for l in lines])
                    root = ET.fromstring(xml)

                    doc.title = root.findall("title")[0].text
                    doc.abstract = root.findall("abstract")[0].text
                    doc.phrases = [n.text for n in root.findall("*/tag")]

                    self.doclist.append(doc)

                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename ) 
Example #30
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def tag_tokens(self, tokens):
		word_pos_pairs = self.tagger.tag(tokens)
		return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}