Python nltk.tag Examples
The following are 30 code examples showing how to use the nltk.tag module. These examples are extracted from open source projects; each one lists the project, author, source file, and license it was taken from.
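Before the project-specific examples, here is a minimal, self-contained sketch of the module's most common entry point, nltk.tag.pos_tag. It assumes NLTK is installed and that the usual punkt and averaged_perceptron_tagger data packages have already been downloaded (package names vary slightly across NLTK versions).

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# One-time setup (not shown): nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
tokens = word_tokenize("NLTK makes part-of-speech tagging straightforward.")
tagged = pos_tag(tokens)   # list of (token, Penn Treebank tag) pairs
print(tagged)              # e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ...]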
Example 1
Project: razzy-spinner Author: rafasashi File: glue.py License: GNU General Public License v3.0 | 6 votes |
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'" %
            (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Example 2
Project: razzy-spinner Author: rafasashi File: glue.py License: GNU General Public License v3.0 | 6 votes |
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
Example 3
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 6 votes |
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""
    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5'''
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

    def stringify(s):
        return "".join(a[1] for a in s)

    def positionify(s):
        return tuple(a[0] for a in s)

    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

########
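For orientation, a hedged sketch of driving extract_JK directly; tag2coarse is a module-level mapping from Penn Treebank tags to the coarse A/N/P/O alphabet, so the tiny mapping below is only an illustrative stand-in for the real table.

# Illustrative stand-in for phrasemachine's module-level tag2coarse table (not the real one).
tag2coarse = {'JJ': 'A', 'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'IN': 'P'}

print(extract_JK(['JJ', 'NN', 'NN']))   # [{0, 1}, {1, 2}, {0, 1, 2}] for an A-N-N sequence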
Example 4
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
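If bundled pickle paths like these are not available, the same PerceptronTagger can be loaded from NLTK's own data directory instead; a minimal sketch, assuming the averaged_perceptron_tagger data package has been downloaded:

from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()   # with no arguments, loads NLTK's default pre-trained model
tokens = TreebankWordTokenizer().tokenize("The perceptron tagger handles plain token lists.")
print(tagger.tag(tokens))     # list of (token, tag) pairs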
Example 5
Project: idamagicstrings Author: joxeankoret File: IDAMagicStrings.py License: GNU Affero General Public License v3.0 | 6 votes |
def nltk_preprocess(strings):
    if not has_nltk:
        return

    strings = "\n".join(map(str, list(strings)))
    tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
    l = []
    for token in tokens:
        l.append(token[0])
    word_tags = nltk.pos_tag(l)
    for word, tag in word_tags:
        try:
            FOUND_TOKENS[word.lower()].add(tag)
        except:
            FOUND_TOKENS[word.lower()] = set([tag])

#-------------------------------------------------------------------------------
Example 6
Project: luscan-devel Author: blackye File: glue.py License: GNU General Public License v2.0 | 6 votes |
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError, "There is no GlueDict entry for sem type of '%s'"\
            " with tag '%s', and rel '%s'" %\
            (node['word'], node['tag'], node['rel'])

    return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Example 7
Project: luscan-devel Author: blackye File: glue.py License: GNU General Public License v2.0 | 6 votes |
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    semtype_name = None

    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
Example 8
Project: nlp-services Author: singnet File: entity_recognizer_mod.py License: MIT License | 6 votes |
def bio_tagger(self, ne_tagged):
    bio_tagged = []
    prev_tag = "O"
    for token, tag in ne_tagged:
        if tag == "O":  # O
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O":  # Begin NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:  # Inside NE
            bio_tagged.append((token, "I-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
    return bio_tagged

# Create tree
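A quick illustration of the BIO conversion this method performs; the input below is hand-written rather than produced by a real NE tagger, and recognizer is a hypothetical instance of the surrounding class.

# `recognizer` is a hypothetical instance of the class that defines bio_tagger.
ne_tagged = [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('visited', 'O'),
             ('New', 'LOCATION'), ('York', 'LOCATION')]
print(recognizer.bio_tagger(ne_tagged))
# [('Barack', 'B-PERSON'), ('Obama', 'I-PERSON'), ('visited', 'O'),
#  ('New', 'B-LOCATION'), ('York', 'I-LOCATION')]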
Example 9
Project: phrasemachine Author: slanglab File: phrasemachine.py License: MIT License | 6 votes |
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""
    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5'''
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

    def stringify(s):
        return "".join(a[1] for a in s)

    def positionify(s):
        return tuple(a[0] for a in s)

    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

########
Example 10
Project: phrasemachine Author: slanglab File: phrasemachine.py License: MIT License | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
Example 11
Project: NeuronBlocks Author: microsoft File: EnglishPOSTagger.py License: MIT License | 6 votes |
def postag_multi(self, multi_sentence):
    """ tag multiple sentences one time
    RECOMMAND! Because the efficiency of stanford pos tagger in NLTK is too slow.
    Args:
        multi_sentence: [[token1, token2], ..., [...]]
    Returns:
    """
    #word_pos_pairs_multi_sent = self.eng_tagger.tag_sents(multi_sentence)
    '''
    word_pos_pairs_multi_sent = pos_tag_sents(multi_sentence)
    pos_lists = []
    for word_pos_pairs in word_pos_pairs_multi_sent:
        pos_lists.append([pos for (word, pos) in word_pos_pairs])
    return pos_lists
    '''
    return [self.postag(sent) for sent in multi_sentence]
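The commented-out block refers to nltk.pos_tag_sents, the batched counterpart of nltk.pos_tag that loads the default tagger once for all sentences; a minimal sketch of that public API, assuming the default perceptron tagger data is installed:

from nltk.tag import pos_tag_sents

sentences = [['Batch', 'tagging', 'is', 'faster'], ['than', 'one', 'call', 'per', 'sentence']]
tagged_sents = pos_tag_sents(sentences)
pos_lists = [[pos for (_word, pos) in sent] for sent in tagged_sents]
print(pos_lists)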
Example 12
Project: V1EngineeringInc-Docs Author: V1EngineeringInc File: glue.py License: Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'" %
            (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(
        lookup, node['word'], node, depgraph, counter
    )
Example 13
Project: V1EngineeringInc-Docs Author: V1EngineeringInc File: glue.py License: Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
Example 14
Project: razzy-spinner Author: rafasashi File: relextract.py License: GNU General Public License v3.0 | 5 votes |
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.

    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
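The fallback branch relies on nltk.tag.tuple2str, which renders a (word, tag) pair as 'word/tag'; a small standalone sketch:

from nltk.tag import tuple2str

tagged = [('the', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
print(' '.join(tuple2str(tup) for tup in tagged))   # the/DT cat/NN sat/VBD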
Example 15
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 5 votes |
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
Example 16
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 5 votes |
def tag_text(self, text):
    '''take input text and return tokens w/ part of speech tags using NLTK'''
    # putting import here instead of top of file b.c. not all will have nltk installed
    sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
    word_pos_pairs = []
    all_tokens = []
    for sent in sents:
        tokens = self.tokenize(sent)
        all_tokens = all_tokens + tokens
        word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
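Usage of this method and tag_tokens below might look like the following sketch, where nltk_tagger stands in for an instance of the wrapper class (the exact tags depend on the loaded model):

# `nltk_tagger` is a hypothetical instance of the tagger wrapper class defined in this file.
result = nltk_tagger.tag_text("Dogs bark. Cats meow.")
print(result['tokens'])   # sentence-split, Treebank-tokenized tokens
print(result['pos'])      # one Penn Treebank tag per token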
Example 17
Project: scattertext Author: JasonKessler File: phrasemachine.py License: Apache License 2.0 | 5 votes |
def tag_tokens(self, tokens):
    word_pos_pairs = self.tagger.tag(tokens)
    return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example 18
Project: seq2seq-keyphrase Author: memray File: keyphrase_test_dataset.py License: MIT License | 5 votes |
def load_xml(self, xmldir):
    '''
    for KDD/WWW/UMD only
    :return: doclist
    '''
    for filename in os.listdir(xmldir):
        with open(xmldir + filename) as textfile:
            doc = Document()
            doc.name = filename[:filename.find('.xml')]

            import string
            printable = set(string.printable)

            # print((filename))
            try:
                lines = textfile.readlines()
                xml = ''.join([filter(lambda x: x in printable, l) for l in lines])
                root = ET.fromstring(xml)

                doc.title = root.findall("title")[0].text
                doc.abstract = root.findall("abstract")[0].text
                doc.phrases = [n.text for n in root.findall("*/tag")]

                self.doclist.append(doc)

            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % filename)
Example 19
Project: seq2seq-keyphrase Author: memray File: keyphrase_test_dataset.py License: MIT License | 5 votes |
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))

        tagged_source.append(text)

    return tagged_source
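This and the next example drive nltk.tag.stanford.StanfordPOSTagger against a local Stanford tagger install; a pared-down sketch of just the tagging call, with the jar and model paths as placeholders for your own download (newer NLTK versions steer users toward the CoreNLP server instead):

from nltk.tag import StanfordPOSTagger

# Placeholder paths: point these at a locally downloaded Stanford POS Tagger.
jar = '/path/to/stanford-postagger.jar'
model = '/path/to/models/english-bidirectional-distsim.tagger'

pos_tagger = StanfordPOSTagger(model, jar)
print(pos_tagger.tag(['The', 'tagger', 'runs', 'in', 'a', 'Java', 'subprocess']))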
Example 20
Project: seq2seq-keyphrase Author: memray File: keyphrase_test_dataset.py License: MIT License | 5 votes |
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)

        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example 21
Project: address_extraction Author: bagrii File: address_extract.py License: MIT License | 5 votes |
def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)

    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
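conlltags2tree comes from nltk.chunk and turns the (word, tag, IOB) triplets into an nltk.Tree; a small standalone sketch of that final step:

from nltk.chunk import conlltags2tree

iob_triplets = [('San', 'NNP', 'B-GPE'), ('Francisco', 'NNP', 'I-GPE'), ('shines', 'VBZ', 'O')]
print(conlltags2tree(iob_triplets))   # (S (GPE San/NNP Francisco/NNP) shines/VBZ)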
Example 22
Project: luscan-devel Author: blackye File: relextract.py License: GNU General Public License v2.0 | 5 votes |
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.

    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        if untag:
            return join([tup[0] for tup in lst], sep=sep)
        from nltk.tag import tuple2str
        return join([tuple2str(tup) for tup in lst], sep=sep)
Example 23
Project: nlp-services Author: singnet File: entity_recognizer_mod.py License: MIT License | 5 votes |
def stanford_tagger(self, token_text):
    st = StanfordNERTagger(self.english_model, self.stanford_jar, encoding='utf-8')
    ne_tagged = st.tag(token_text)
    return (ne_tagged)

# NLTK POS and NER taggers
Example 24
Project: phrasemachine Author: slanglab File: phrasemachine.py License: MIT License | 5 votes |
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
Example 25
Project: phrasemachine Author: slanglab File: phrasemachine.py License: MIT License | 5 votes |
def tag_text(self, text):
    '''take input text and return tokens w/ part of speech tags using NLTK'''
    # putting import here instead of top of file b.c. not all will have nltk installed
    sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
    word_pos_pairs = []
    all_tokens = []
    for sent in sents:
        tokens = self.tokenize(sent)
        all_tokens = all_tokens + tokens
        word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example 26
Project: phrasemachine Author: slanglab File: phrasemachine.py License: MIT License | 5 votes |
def tag_tokens(self, tokens):
    word_pos_pairs = self.tagger.tag(tokens)
    return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example 27
Project: textfool Author: bogdan-kulynych File: paraphrase.py License: MIT License | 5 votes |
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos
Example 28
Project: textfool Author: bogdan-kulynych File: paraphrase.py License: MIT License | 5 votes |
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if (len(synonym.text.split()) > 2) or \
            (synonym.lemma == token.lemma) or \
            (synonym.tag != token.tag) or \
            (token.text.lower() == 'be'):
        return False
    else:
        return True
Example 29
Project: NeuronBlocks Author: microsoft File: EnglishPOSTagger.py License: MIT License | 5 votes |
def postag(self, word_list):
    """
    Args:
        word_list: word list
    Returns:
        pos tag list
    """
    #word_pos_pairs = self.eng_tagger.tag(word_list)
    #word_pos_pairs = pos_tag(word_list)
    word_pos_pairs = nltk.tag._pos_tag(word_list, None, self.eng_tagger)
    pos_list = [pos for (word, pos) in word_pos_pairs]
    return pos_list
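nltk.tag._pos_tag is a private helper; the public equivalent, hinted at by the commented-out line, is nltk.tag.pos_tag. A sketch of the public route, assuming the default perceptron tagger data is installed:

from nltk.tag import pos_tag

word_list = ['Neural', 'networks', 'tag', 'tokens', 'quickly']
word_pos_pairs = pos_tag(word_list)              # uses NLTK's default perceptron tagger
pos_list = [pos for (_word, pos) in word_pos_pairs]
print(pos_list)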
Example 30
Project: Densely-Interactive-Inference-Network Author: YichenGong File: data_processing.py License: Apache License 2.0 | 5 votes |
def parse_to_pos_vector(parse, left_padding_and_cropping_pair=(0, 0)):  # ONE HOT
    pos = parsing_parse(parse)
    pos_vector = [POS_dict.get(tag, 0) for tag in pos]
    left_padding, left_cropping = left_padding_and_cropping_pair
    vector = np.zeros((FIXED_PARAMETERS["seq_length"], len(POS_Tagging)))
    assert left_padding == 0 or left_cropping == 0
    for i in range(FIXED_PARAMETERS["seq_length"]):
        if i < len(pos_vector):
            vector[i + left_padding, pos_vector[i + left_cropping]] = 1
        else:
            break
    return vector