Python nltk.pos_tag() Examples

The following are 30 code examples showing how to use nltk.pos_tag(). The examples are extracted from open source projects; the project, author, file, and license for each snippet are listed above its code.

You may also want to check out the other available functions and classes of the nltk module.
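
Before working through the examples, here is a minimal, self-contained sketch of a plain nltk.pos_tag() call (the sample sentence and the printed tags are only illustrative; the exact tags depend on the installed tagger model):

import nltk

# Download the tokenizer and tagger models once (resource names can vary by NLTK version).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("NLTK makes part-of-speech tagging easy.")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('makes', 'VBZ'), ('part-of-speech', 'JJ'), ('tagging', 'NN'), ('easy', 'JJ'), ('.', '.')]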

Example 1
Project: open-sesame   Author: swabhs   File: raw_data.py    License: Apache License 2.0
def make_data_instance(text, index):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]

    lemmatized = [lemmatizer.lemmatize(tokenized[i]) 
                    if not pos_tagged[i].startswith("V") else lemmatizer.lemmatize(tokenized[i], pos='v') 
                    for i in range(len(tokenized))]

    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i+1, tokenized[i], lemmatized[i], pos_tagged[i], index) for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]

    sentence = Sentence(syn_type=None, elements=elements)
    instance = CoNLL09Example(sentence, elements)

    return instance 
Example 2
Project: JARVIS   Author: abhi007tyagi   File: math_expression_calculator.py    License: Apache License 2.0
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text 
Example 3
Project: Projects   Author: iamshang1   File: sentiwordnet.py    License: MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />',' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
Example 4
Project: Projects   Author: iamshang1   File: combined.py    License: MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
Example 5
Project: self-attentive-parser   Author: nikitakit   File: nltk_plugin.py    License: MIT License
def _nltk_process_sents(self, sents):
        for sentence in sents:
            if isinstance(sentence, STRING_TYPES):
                if self._tokenizer_lang is None:
                    raise ValueError(
                        "No word tokenizer available for this language. "
                        "Please tokenize before calling the parser."
                        )
                sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

            if IS_PY2:
                sentence = [
                    word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                    for word in sentence
                    ]

            if not self._provides_tags:
                sentence = nltk.pos_tag(sentence)
                yield [word for word, tag in sentence], sentence
            else:
                yield sentence, sentence 
Example 6
Project: gender-bias   Author: gender-bias   File: document.py    License: MIT License
def words_by_part_of_speech(self) -> dict:
        """
        Compute the parts of speech for each word in the document.

        Uses nltk.pos_tag.

        Returns:
            dict

        """
        words = self.words()
        tagged = nltk.pos_tag(words)
        categories = {}
        for _type in {t[1] for t in tagged}:
            categories[_type] = [t[0] for t in tagged if t[1] == _type]
        return categories 
Example 7
Project: partisan-discourse   Author: DistrictDataLabs   File: nlp.py    License: Apache License 2.0
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which
    is a list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
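
For reference, the nested paragraph/sentence/(token, tag) structure described in the docstring can be reproduced with plain NLTK calls. A minimal sketch, assuming paragraphs are separated by blank lines (para_tokenize and NLTKError are helpers specific to this project and are not shown here):

import nltk

def preprocess_plain_text(text):
    # paragraphs -> sentences -> lists of (token, POS tag) pairs
    paragraphs = [p for p in text.split("\n\n") if p.strip()]   # crude stand-in for para_tokenize
    return [
        [nltk.pos_tag(nltk.wordpunct_tokenize(sent)) for sent in nltk.sent_tokenize(para)]
        for para in paragraphs
    ]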
Example 8
Project: Sentence-similarity-classifier-for-pyTorch   Author: demelin   File: sick_extender.py    License: MIT License
def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
        # Split line into sentences + score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)
        # Assign part of speech tags
        s1_penn_pos = nltk.pos_tag(s1_tokens)
        s2_penn_pos = nltk.pos_tag(s2_tokens)
        # Convert to WordNet POS tags and store word position in sentence for replacement
        # Each tuple contains (word, WordNet_POS_tag, position)
        s1_wn_pos = list()
        s2_wn_pos = list()
        for idx, item in enumerate(s1_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s1_penn_pos.index(item)))
        for idx, item in enumerate(s2_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s2_penn_pos.index(item)))

        # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score 
Example 9
Project: sia-cog   Author: tech-quantum   File: nltkmgr.py    License: MIT License
def tokenize(data, language="english", filterStopWords = False, tagging = False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if not w in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if not w in stop_words]

    if tagging is True:
        tags = nltk.pos_tag(word_token)

    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False)) 
Example 10
Project: talk-generator   Author: korymath   File: language_util.py    License: MIT License
def get_last_noun_and_article(sentence):
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)

    noun = None
    for tag in reversed(tags):
        if "NN" in tag[1]:
            if noun:
                noun = (tag[0] + " " + noun).strip()
            else:
                noun = tag[0]

        # If encountering an article while there is a noun found
        elif bool(noun):
            if "DT" in tag[1] or "PRP$" in tag[1]:
                return tag[0] + " " + noun
            return noun

    return None 
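
A brief usage sketch (the exact result depends on the tags NLTK's default tagger assigns; "the"/DT and "dog"/NN are assumed here):

# Walks backwards from the last noun and prepends its determiner.
print(get_last_noun_and_article("I saw the dog"))   # expected: "the dog"
# If no noun is tagged in the sentence, the function returns None.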
Example 11
Project: idamagicstrings   Author: joxeankoret   File: IDAMagicStrings.py    License: GNU Affero General Public License v3.0
def nltk_preprocess(strings):
  if not has_nltk:
    return

  strings = "\n".join(map(str, list(strings)))
  tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
  l = []
  for token in tokens:
    l.append(token[0])
  word_tags = nltk.pos_tag(l)
  for word, tag in word_tags:
    try:
      FOUND_TOKENS[word.lower()].add(tag)
    except:
      FOUND_TOKENS[word.lower()] = set([tag])

#------------------------------------------------------------------------------- 
Example 12
Project: Auto_ViML   Author: AutoViML   File: Auto_NLP.py    License: Apache License 2.0
def process_text(text):
    soup = BeautifulSoup(text, "lxml")
    tags_del = soup.get_text()
    no_html = re.sub('<[^>]*>', '', tags_del)
    tokenized = casual_tokenizer(no_html)
    lower = [item.lower() for item in tokenized]
    decontract = [expandContractions(item, c_re=c_re) for item in lower]
    tagged = nltk.pos_tag(decontract)
    lemma = lemma_wordnet(tagged)
    #no_num = [re.sub('[0-9]+', '', each) for each in lemma]
    no_punc = [w for w in lemma if w not in punc]
    no_stop = [w for w in no_punc if w not in stop_words]
    return no_stop
################################################################################################################################################################
####   THE ABOVE Process_Text section Re-used with Permission from:
####  R O B   S A L G A D O    robert.salgado@gmail.com Thank YOU!
################################################################################ 
Example 13
Project: lexpredict-contraxsuite   Author: LexPredict   File: custom.py    License: GNU Affero General Public License v3.0
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list 
Example 14
Project: serapis   Author: wordnik   File: annotate.py    License: MIT License
def annotate_pos_with_term(sentence, term):
    """POS-tag single sentence while preserving _TERM_ using the original term"""
    try:
        pos_term = []

        # replace term if necessary
        if '_term_' not in sentence.lower():
            sentence_term = sentence.lower().replace(term.lower(), '_TERM_')
        else:
            sentence_term = sentence.lower()

        tok = word_tokenize(sentence_term)
        tags = pos_tag(tok)

        for tag in tags:
            if '_TERM_' in tag[0].upper():
                pos_term.append('_TERM_')
            else:
                pos_term.append(tag[1])

        return ' '.join(pos_term)
    except Exception, e:
        log.error('POS annotation error: %s', e)
        return None 
Example 15
Project: serapis   Author: wordnik   File: annotate.py    License: MIT License
def annotate_sentence(sentence_dict, term):
    """
    Annotates a sentence object from a message with Penn Treebank POS tags.

    Args:
        sentence_dict: dict -- Must contain 's' and 's_clean', which is the
                       sentence with all occurrences of the search term
                       replaced with '_TERM_'
    Returns:
        dict -- updated sentence_dict with 'pos_tags' field.

    """
    tags = pos_tag(word_tokenize(sentence_dict['s_clean']))
    pos_tags = ['/'.join(b) for b in tags]
    sentence_dict['pos_tags'] = " ".join(pos_tags)
    sentence_dict['features'] = {}
    return sentence_dict 
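
A minimal usage sketch following the docstring's input contract (the printed tag string is only illustrative; actual tags depend on the tagger):

sentence_dict = {
    's': 'A gerbil is a small rodent.',
    's_clean': 'A _TERM_ is a small rodent.',
}
annotated = annotate_sentence(sentence_dict, 'gerbil')
print(annotated['pos_tags'])    # e.g. "A/DT _TERM_/NNP is/VBZ a/DT small/JJ rodent/NN ./."
print(annotated['features'])    # {}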
Example 16
Project: open-sesame   Author: swabhs   File: xml_annotations.py    License: Apache License 2.0
def normalize_tokens(self, logger):
        if len(self.stindices) != len(self.enindices):
            sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
            return
        start = {}
        idx = 0
        for s in sorted(self.stindices):
            self.stindices[s] = idx
            start[idx] = s
            idx += 1
        end = {}
        idx = 0
        for t in sorted(self.enindices):
            self.enindices[t] = idx
            end[idx] = t
            if idx > 0 and end[idx - 1] > start[idx]:
                logger.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
                return
            token = self.text[start[idx] : t + 1].strip()
            if " " in token:
                logger.write("\t\tIssue: incorrect tokenization "  + token + "\n")
                return
            if token == "": continue
            self.tokens.append(token)
            idx += 1
        try:
            self.nltkpostags = [ele[1] for ele in nltk.pos_tag(self.tokens)]
            for idx in xrange(len(self.tokens)):
                tok = self.tokens[idx]
                if self.nltkpostags[idx].startswith("V"):
                    self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
                else:
                    self.nltklemmas.append(lemmatizer.lemmatize(tok))
        except IndexError:
            print self.tokens
            print nltk.pos_tag(self.tokens)
        return True 
Example 17
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    License: MIT License
def pos_tagging(targets):
    while True:
        words = (yield)
        tagged_words = nltk.pos_tag(words)

        for target in targets:
            target.send(tagged_words) 
Example 18
Project: tensorflow-XNN   Author: ChenglongChen   File: main.py    License: MIT License
def lemmatize_sentence(sentence):
    res = []
    sentence_ = get_valid_words(sentence)
    for word, pos in pos_tag(sentence_):
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lemmatize_word(word, pos=wordnet_pos))
    return res 
Example 19
Project: Snowball   Author: davidsbatista   File: Tuple.py    License: GNU General Public License v3.0
def construct_words_vectors(self, words, config):
        # split text into tokens and tag them using NLTK's default English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        text_tokens = word_tokenize(words)
        tags_ptb = pos_tag(text_tokens)
        pattern = [t[0] for t in tags_ptb if
                   t[0].lower() not in config.stopwords and t[1] not in self.filter_pos]
        if len(pattern) >= 1:
            vect_ids = self.config.vsm.dictionary.doc2bow(pattern)
            return self.config.vsm.tf_idf_model[vect_ids] 
Example 20
Project: IDEA   Author: armor-ai   File: main.py    License: MIT License
def generate_labeling_candidates(OLDA_input):
    """
    Filter phrase labels and choose for candidates
    :param OLDA_input:
    :return:
    """
    phrases = {}
    for apk, item in OLDA_input.items():
        dic, _, _1, _2, _3 = item
        phrases[apk] = defaultdict(int)
        # filter bigram and trigram
        for word in dic.values():
            if '_' in word:
                phrase = word
                words, tags = zip(*nltk.pos_tag(phrase.split(b'_')))
                match = False
                for tag in tags:
                    if re.match(r"^NN", tag):
                        match = True
                        continue
                    if re.match(r"DT", tag):
                        match = False
                        break
                    if re.match(r"RB", tag):
                        match = False
                        break
                for word in words:
                    if word in stopwords.words('english') + my_stoplst:     # remove stop word
                        match = False
                        break
                    if len(word) < 3:
                        match = False
                        break
                    if "\\'" in word:
                        match = False
                        break
                if match:
                    # keep phrase
                    phrases[apk][phrase] = 1
    return phrases 
Example 21
Project: geograpy2   Author: Corollarium   File: extraction.py    License: MIT License
def named_entities(self):
        # word_tokenize should work well for most non-CJK languages
        text = nltk.word_tokenize(self.text)
        
        # TODO: this works only for english. Stanford's pos tagger supports
        # more languages
        # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        # http://stackoverflow.com/questions/1639855/pos-tagging-in-german
        # PT corpus http://aelius.sourceforge.net/manual.html
        # 
        pos_tag = nltk.pos_tag(text)
        
        nes = nltk.ne_chunk(pos_tag)
        return nes 
Example 22
Project: EliIE   Author: Tian312   File: POS.py    License: MIT License
def pos_tagging(term_list):
    #print term_list
    term_list_new=list()
    for term in term_list:
        term=term.decode('utf-8', 'ignore')
        term_list_new.append(term)
    tag=nltk.pos_tag(term_list_new)
    t=list()
    for ta in tag:
       t.append(ta[1])
    return t 
Example 23
Project: chowmein   Author: xiaohan2012   File: corpus_processor.py    License: MIT License
def __init__(self, pos_tag_func=nltk.pos_tag):
        """
        Parameter:
        --------------
        pos_tag_func: pos_tag function that accepts list of tokens
            and POS tag them
        """
        self._pos_tag_func = pos_tag_func 
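
Because the tagging function is injected through the constructor, any callable with the same shape as nltk.pos_tag can be plugged in. A minimal sketch, assuming the enclosing class is named CorpusPOSTagger (the class name is not shown in the snippet above):

# Hypothetical instantiations; CorpusPOSTagger stands in for the enclosing class.
default_tagger = CorpusPOSTagger()   # falls back to nltk.pos_tag
dummy_tagger = CorpusPOSTagger(
    pos_tag_func=lambda tokens: [(tok, 'X') for tok in tokens])   # any tokens -> (token, tag) list callable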
Example 24
Project: JARVIS   Author: abhi007tyagi   File: command_identifier.py    License: Apache License 2.0
def get_command(text):
    # tokenize and remove stop words
    tokenized = nltk.word_tokenize(text)

    # stop_words = set(stopwords.words("english"))
    # filtered_text = [w for w in tokenized if not w in stop_words]
    # print("fitered text -> ", filtered_text)

    #  tag the filtered words
    tags = nltk.pos_tag(tokenized)
    print("filtered tags -> ", tags)

    return extract_cmd(tags) 
Example 25
Project: DeepLearn   Author: GauravBh1010tt   File: rd_ft.py    License: MIT License
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return nouns

#Average Edit Distance Value For Two String And The Average Edit Distance Between The Nouns Present In Them(Returns Float) 
Example 26
Project: g2p   Author: Kyubyong   File: g2p.py    License: Apache License 2.0
def __call__(self, text):
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[^ a-z'.,?!\-]", "", text)
        text = text.replace("i.e.", "that is")
        text = text.replace("e.g.", "for example")

        # tokenization
        words = word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word, pos in tokens:
            if re.search("[a-z]", word) is None:
                pron = [word]

            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else: # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        return prons[:-1] 
Example 27
Project: comparable-text-miner   Author: motazsaad   File: textpro.py    License: Apache License 2.0
def getLemma(text, contextFlag=False):
	lemmatizer = WordNetLemmatizer()
	#'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV
	wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
	result = None
	if len(text.split()) == 1: # one word
		tokenized = word_tokenize(text)
		tagged = pos_tag(tokenized)[0]
		lemma = ''
		try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
		except: lemma = lemmatizer.lemmatize(tagged[0])
		result = lemma
	elif len(text.split()) > 1 and contextFlag == True: # multiple words, i.e. text, without considering the context
		resultList = []
		for t in text.split():
			tokenized = word_tokenize(t)
			tagged = pos_tag(tokenized)[0]
			lemma = ''
			try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
			except: lemma = lemmatizer.lemmatize(tagged[0])
			resultList.append(lemma)
		result = ' '.join(resultList)
	else: # multiple words, i.e. text, considering the context
		resultList = []
		tokens = word_tokenize(text)
		tagged = pos_tag(tokens)
		for t in tagged:
			try: resultList.append(lemmatizer.lemmatize(t[0],wordnet_tag[t[1][:2]]))
			except: resultList.append(lemmatizer.lemmatize(t[0]))
		result = ' '.join(resultList)
	return result
###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example 28
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    License: Apache License 2.0
def pos_tag(docs, language=None, tagger_instance=None, doc_meta_key=None):
    """
    Apply Part-of-Speech (POS) tagging to the list of documents `docs`. Either load a tagger based on the supplied
    `language` or use the tagger instance `tagger_instance`, which must have a method ``tag()``. A tagger can be loaded via
    :func:`~tmtoolkit.preprocess.load_pos_tagger_for_language`.

    POS tagging so far only works for English and German. The English tagger uses the Penn Treebank tagset
    (https://ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html), the
    German tagger uses STTS (http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/TagSets/stts-table.html).

    :param docs: list of tokenized documents
    :param language: the language for the POS tagger (currently only "english" and "german" are supported) if no
                    `tagger_instance` is given
    :param tagger_instance: a tagger instance to use for tagging if no `language` is given
    :param doc_meta_key: if this is not None, it must be a string that specifies the key that is used for the
                         resulting dicts
    :return: if `doc_meta_key` is None, return a list of N lists, where N is the number of documents; each of these
             lists contains the POS tags for the respective tokens from `docs`, hence each POS list has the same length
             as the respective token list of the corresponding document; if `doc_meta_key` is not None, the result list
             contains dicts with the only key `doc_meta_key` that maps to the list of POS tags for the corresponding
             document
    """
    require_listlike(docs)

    if tagger_instance is None:
        tagger_instance, _ = load_pos_tagger_for_language(language or defaults.language)

    docs_meta = []
    for dtok in docs:
        if len(dtok) > 0:
            tokens_and_tags = tagger_instance.tag(dtok)
            tags = list(list(zip(*tokens_and_tags))[1])
        else:
            tags = []

        if doc_meta_key:
            docs_meta.append({doc_meta_key: tags})
        else:
            docs_meta.append(tags)

    return docs_meta 
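
A hedged usage sketch based only on the docstring above (input documents must already be tokenized; "english" is one of the two supported languages):

docs = [['The', 'dog', 'barks'], ['It', 'rains', 'today']]

# One list of POS tags per document, aligned with the tokens.
tags_per_doc = pos_tag(docs, language='english')

# With doc_meta_key set, each result is wrapped in a dict under that key instead.
meta_per_doc = pos_tag(docs, language='english', doc_meta_key='meta_pos')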
Example 29
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    License: Apache License 2.0
def lemmatize(docs, docs_meta, language=None, lemmatizer_fn=None):
    """
    Lemmatize documents according to `language` or use a custom lemmatizer function `lemmatizer_fn`.

    :param docs: list of tokenized documents
    :param docs_meta: list of meta data for each document in `docs` or list of POS tags per document; for option 1,
                      each element at index ``i`` is a dict containing the meta data for document ``i`` and each dict
                      must contain an element ``meta_pos`` with a list containing a POS tag for each token in the
                      respective document; for option 2, `docs_meta` is a list of POS tags for each document as coming
                      from :func:`~tmtoolkit.preprocess.pos_tag`
    :param language: the language for which the lemmatizer should be loaded
    :param lemmatizer_fn: alternatively, use this lemmatizer function; this function should accept a tuple consisting
                          of (token, POS tag)
    :return: list of processed documents
    """
    require_listlike(docs)

    if len(docs) != len(docs_meta):
        raise ValueError('`docs` and `docs_meta` must have the same length')

    if lemmatizer_fn is None:
        lemmatizer_fn = load_lemmatizer_for_language(language or defaults.language)

    new_tokens = []
    for i, (dtok, dmeta) in enumerate(zip(docs, docs_meta)):
        if isinstance(dmeta, dict):
            if 'meta_pos' not in dmeta:
                raise ValueError('no POS meta data for document #%d' % i)
            dpos = dmeta['meta_pos']
        else:
            dpos = dmeta

        if not isinstance(dpos, (list, tuple)) or len(dpos) != len(dtok):
            raise ValueError('provided POS tags for document #%d are invalid (not a list/tuple and/or not of the same '
                             'length as the document)' % i)

        new_tokens.append(list(map(lemmatizer_fn, zip(dtok, dpos))))

    return new_tokens 
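
A hedged sketch of chaining the two helpers from this module, following their docstrings (option 2: passing the POS-tag lists returned by pos_tag straight through as docs_meta):

docs = [['The', 'dogs', 'are', 'barking']]

docs_pos = pos_tag(docs, language='english')             # list of POS-tag lists, see the previous example
lemmata = lemmatize(docs, docs_pos, language='english')  # one list of lemmatized tokens per document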
Example 30
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    License: Apache License 2.0
def tag(tokens):
        return nltk.pos_tag(tokens)