Python nltk.tag Examples

The following code examples show how to use the nltk.tag module. They are drawn from open source Python projects.
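At its core, the nltk.tag package exposes nltk.pos_tag(), which assigns a part-of-speech tag to every token of a tokenized sentence. As a point of reference for the examples below, a minimal usage sketch (it assumes the punkt and averaged_perceptron_tagger resources have already been fetched with nltk.download()):

import nltk

tokens = nltk.word_tokenize("They refuse to permit us to obtain the refuse permit")
print(nltk.pos_tag(tokens))
# e.g. [('They', 'PRP'), ('refuse', 'VBP'), ('to', 'TO'), ('permit', 'VB'), ...]
# Exact tags depend on the tagger model in use.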

Example 1
Project: razzy-spinner   Author: rafasashi   File: glue.py    GNU General Public License v3.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 2
Project: razzy-spinner   Author: rafasashi   File: glue.py    GNU General Public License v3.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
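To make the branching concrete, two hypothetical dependency nodes and what get_semtypes would return for them (the dict keys follow what the code reads):

# rel 'nmod'/'vmod': the tag first, then the relation
node = {'word': 'dog', 'tag': 'NN', 'rel': 'nmod'}
# get_semtypes(node) -> ['NN', 'nmod']

# rel 'spec': look the word up in SPEC_SEMTYPES, falling back to the default entry
node = {'word': 'a', 'tag': 'DT', 'rel': 'spec'}
# get_semtypes(node) -> [SPEC_SEMTYPES.get('a', SPEC_SEMTYPES['default'])]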
Example 3
Project: OpenBottle   Author: xiaozhuchacha   File: glue.py    MIT License
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 4
Project: OpenBottle   Author: xiaozhuchacha   File: glue.py    MIT License
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 5
Project: Health-Checker   Author: KriAga   File: glue.py    MIT License
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 6
Project: Health-Checker   Author: KriAga   File: glue.py    MIT License
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 7
Project: Text-mining-arabic   Author: ahmadRagheb   File: Song.py    MIT License
def part_of_speeach(self):

        os.environ["JAVA_HOME"] = "/usr/bin/java"
        jar = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-full-2015-12-09/stanford-postagger.jar'
        model = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-full-2015-12-09/models/arabic.tagger'
        # model = '/home/ahmad/PycharmProjects/untitled1/stanford-postagger-2011-04-20/models/left3words-wasj-0-18.tagger'
        tagger = StanfordPOSTagger(model, jar)
        tagger.java_options = '-mx4096m'  ### Setting higher memory limit for long sentences

        text = tagger.tag(word_tokenize(self.lyrics[0]))
        s = ''
        for i in text:
            f = i[0] + ' ' + i[1]
            s = s + f + ' / '
        return s

# Writing to FILE
# Use for One time 
Example 8
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    Apache License 2.0
def extract_JK(pos_seq):
	"""The 'JK' method in Handler et al. 2016.
	Returns token positions of valid ngrams."""

	def find_ngrams(input_list, num_):
		'''get ngrams of len n from input list'''
		return zip(*[input_list[i:] for i in range(num_)])

	# copied from M and S chp 5
	patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
	pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
	pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
	ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

	def stringify(s):
		return "".join(a[1] for a in s)

	def positionify(s):
		return tuple(a[0] for a in s)

	ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
	return [set(positionify(n)) for n in ngrams]


######## 
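To illustrate the filtering in extract_JK, a hedged sketch: assuming tag2coarse maps Penn Treebank tags onto the coarse classes (e.g. 'JJ' -> 'A' for adjectives, 'NN' -> 'N' for nouns), an adjective-noun-noun sequence matches the 'AN', 'NN' and 'ANN' patterns:

# Hypothetical coarse mapping, for the sake of the example.
tag2coarse = {'JJ': 'A', 'NN': 'N', 'NNS': 'N', 'IN': 'P'}

extract_JK(['JJ', 'NN', 'NN'])
# 'AN' at positions (0, 1), 'NN' at (1, 2), 'ANN' at (0, 1, 2)
# -> [{0, 1}, {1, 2}, {0, 1, 2}]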
Example 9
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
Example 10
Project: jroc   Author: domenicosolazzo   File: StanfordTagger.py    GNU General Public License v3.0
def __bio_tagger(self, ne_tagged):
        """
        Return BIO tags from named entities
        :ne_tagged: name_entities tokens
        """
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O": #O
                bio_tagged.append((token, tag))
                prev_tag = tag
                continue
            if tag != "O" and prev_tag == "O": # Begin NE
                bio_tagged.append((token, "B-"+tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag == tag: # Inside NE
                bio_tagged.append((token, "I-"+tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
                bio_tagged.append((token, "B-"+tag))
                prev_tag = tag
        return bio_tagged 
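A quick illustration of the scheme: a run of tokens sharing a named-entity tag becomes one B-/I- span, and a direct tag change opens a new B- span.

ne_tagged = [('John', 'PERSON'), ('Smith', 'PERSON'),
             ('visited', 'O'), ('New', 'LOCATION'), ('York', 'LOCATION')]
# __bio_tagger(ne_tagged) ->
# [('John', 'B-PERSON'), ('Smith', 'I-PERSON'), ('visited', 'O'),
#  ('New', 'B-LOCATION'), ('York', 'I-LOCATION')]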
Example 11
Project: FancyWord   Author: EastonLee   File: glue.py    GNU General Public License v3.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 12
Project: FancyWord   Author: EastonLee   File: glue.py    GNU General Public License v3.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 13
Project: idamagicstrings   Author: joxeankoret   File: IDAMagicStrings.py    GNU Affero General Public License v3.0
def nltk_preprocess(strings):
  if not has_nltk:
    return

  strings = "\n".join(map(str, list(strings)))
  tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
  l = []
  for token in tokens:
    l.append(token[0])
  word_tags = nltk.pos_tag(l)
  for word, tag in word_tags:
    try:
      FOUND_TOKENS[word.lower()].add(tag)
    except:
      FOUND_TOKENS[word.lower()] = set([tag])

#------------------------------------------------------------------------------- 
Example 14
Project: nltk-on-gae   Author: sivu22   File: glue.py    Apache License 2.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
#            raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError("There is no GlueDict entry for sem type of '%s'"\
                    " with tag '%s', and rel '%s'" %\
                    (node['word'], node['tag'], node['rel']))

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 15
Project: nltk-on-gae   Author: sivu22   File: glue.py    Apache License 2.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        semtype_name = None

        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 16
Project: luscan-devel   Author: blackye   File: glue.py    GNU General Public License v2.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
#            raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError, "There is no GlueDict entry for sem type of '%s'"\
                    " with tag '%s', and rel '%s'" %\
                    (node['word'], node['tag'], node['rel'])

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 17
Project: luscan-devel   Author: blackye   File: glue.py    GNU General Public License v2.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        semtype_name = None

        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 18
Project: nlp-services   Author: singnet   File: entity_recognizer_mod.py    MIT License
def bio_tagger(self, ne_tagged):
        bio_tagged = []
        prev_tag = "O"
        for token, tag in ne_tagged:
            if tag == "O":  # O
                bio_tagged.append((token, tag))
                prev_tag = tag
                continue
            if tag != "O" and prev_tag == "O":  # Begin NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag == tag:  # Inside NE
                bio_tagged.append((token, "I-" + tag))
                prev_tag = tag
            elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
                bio_tagged.append((token, "B-" + tag))
                prev_tag = tag
        return bio_tagged

    # Create tree 
Example 19
Project: honours_project   Author: JFriel   File: glue.py    GNU General Public License v3.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 20
Project: honours_project   Author: JFriel   File: glue.py    GNU General Public License v3.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 21
Project: NeuronBlocks   Author: microsoft   File: EnglishPOSTagger.py    MIT License
def postag_multi(self, multi_sentence):
        """ tag multiple sentences one time
        RECOMMAND! Because the efficiency of stanford pos tagger in NLTK is too slow.
        Args:
            multi_sentence: [[token1, token2], ..., [...]]
        Returns:
        """
        #word_pos_pairs_multi_sent = self.eng_tagger.tag_sents(multi_sentence)
        '''
        word_pos_pairs_multi_sent = pos_tag_sents(multi_sentence)
        pos_lists = []
        for word_pos_pairs in word_pos_pairs_multi_sent:
            pos_lists.append([pos for (word, pos) in word_pos_pairs])
        return pos_lists
        '''
        return [self.postag(sent) for sent in multi_sentence] 
Example 22
Project: aop-helpFinder   Author: jecarvaill   File: glue.py    GNU General Public License v3.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 23
Project: aop-helpFinder   Author: jecarvaill   File: glue.py    GNU General Public License v3.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 24
Project: serverless-chatbots-workshop   Author: datteswararao   File: glue.py    Apache License 2.0
def lookup(self, node, depgraph, counter):
        semtype_names = self.get_semtypes(node)

        semtype = None
        for name in semtype_names:
            if name in self:
                semtype = self[name]
                break
        if semtype is None:
            # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
            return []

        self.add_missing_dependencies(node, depgraph)

        lookup = self._lookup_semtype_option(semtype, node, depgraph)

        if not len(lookup):
            raise KeyError(
                "There is no GlueDict entry for sem type of '%s' "
                "with tag '%s', and rel '%s'" %
                (node['word'], node['tag'], node['rel'])
                )

        return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) 
Example 25
Project: serverless-chatbots-workshop   Author: datteswararao   File: glue.py    Apache License 2.0
def get_semtypes(self, node):
        """
        Based on the node, return a list of plausible semtypes in order of
        plausibility.
        """
        rel = node['rel'].lower()
        word = node['word'].lower()

        if rel == 'spec':
            if word in SPEC_SEMTYPES:
                return [SPEC_SEMTYPES[word]]
            else:
                return [SPEC_SEMTYPES['default']]
        elif rel in ['nmod', 'vmod']:
            return [node['tag'], rel]
        else:
            return [node['tag']] 
Example 26
Project: Real-Time-Sentiment-Analyzer-of-Twitter-Trends   Author: gauthamkrishna-g   File: biotagger.py    MIT License
def bio_tagger(ne_tagged):
		bio_tagged = []
		prev_tag = "O"
		for token, tag in ne_tagged:
			if tag == "O": #O
				bio_tagged.append((token, tag))
				prev_tag = tag
				continue
			if tag != "O" and prev_tag == "O": # Begin NE
				bio_tagged.append((token, "B-"+tag))
				prev_tag = tag
			elif prev_tag != "O" and prev_tag == tag: # Inside NE
				bio_tagged.append((token, "I-"+tag))
				prev_tag = tag
			elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
				bio_tagged.append((token, "B-"+tag))
				prev_tag = tag
		return bio_tagged
    
# Create tree 
Example 27
Project: razzy-spinner   Author: rafasashi   File: relextract.py    GNU General Public License v3.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
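For reference, nltk.tag.tuple2str renders a (word, tag) pair as 'word/tag', so on tagged input the helper behaves like this:

tagged = [('the', 'DT'), ('dog', 'NN')]
# _join(tagged)             -> 'the/DT dog/NN'
# _join(tagged, untag=True) -> 'the dog'
# _join(['the', 'dog'])     -> 'the dog'   (plain strings join directly)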
Example 28
Project: OpenBottle   Author: xiaozhuchacha   File: relextract.py    MIT License
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
Example 29
Project: Health-Checker   Author: KriAga   File: relextract.py    MIT License
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
Example 30
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    Apache License 2.0
def coarse_tag_str(pos_seq):
	"""Convert POS sequence to our coarse system, formatted as a string."""
	global tag2coarse
	tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
	return ''.join(tags)


# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)] 
Example 31
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    Apache License 2.0
def tag_text(self, text):
		'''take input text and return tokens w/ part of speech tags using NLTK'''
		# putting import here instead of top of file b.c. not all will have nltk installed

		sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
		word_pos_pairs = []

		all_tokens = []
		for sent in sents:
			tokens = self.tokenize(sent)
			all_tokens = all_tokens + tokens
			word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
		return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]} 
Example 32
Project: scattertext   Author: JasonKessler   File: phrasemachine.py    Apache License 2.0
def tag_tokens(self, tokens):
		word_pos_pairs = self.tagger.tag(tokens)
		return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]} 
Example 33
Project: fc-aaai18   Author: thanhan   File: ner.py    MIT License
def extract_ne(sents):
    from nltk.tag import StanfordNERTagger
    import nltk
    
    st = StanfordNERTagger('ner/english.all.3class.distsim.crf.ser.gz', 'ner/stanford-ner.jar')

    sents_tk = []
    for sent in sents:
        sent_tk = nltk.word_tokenize(sent)
        sents_tk.append(sent_tk)

    ne = st.tag_sents(sents_tk)

    res = []
    for sent in ne:
        last_tag = "O"
        en = ""
        sent.append(("", "O"))        
        
        for (word, tag) in sent:
            if tag == 'O':
                if en != "": res.append(en); en = ""            
            elif last_tag == tag:
                en += " " + word
            else:
                if en != "": res.append(en); en = ""
                en = word
            
            last_tag = tag
                
    return (ne, res) 
Example 34
Project: BrillPlusPlus   Author: elaheh-sadredini   File: ap-tagging-rule-gen.py    BSD 3-Clause "New" or "Revised" License
def gen_single_feature(word, tag):
    regex = ""
    if word == None and tag == None:
        regex += '[^ ]+'
    else:
        if word != None:
            regex += word
        else:
            regex += '[^ ]*'
        regex += '\/'
        if tag != None:
            regex += tag
        else:
            regex += '[^ ]*'
    return regex 
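A few hypothetical calls show what each branch produces (the '\/' keeps the slash between word and tag literal in the generated regex):

# gen_single_feature('dog', 'NN')  -> r'dog\/NN'
# gen_single_feature('dog', None)  -> r'dog\/[^ ]*'
# gen_single_feature(None, 'NN')   -> r'[^ ]*\/NN'
# gen_single_feature(None, None)   -> r'[^ ]+'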
Example 35
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    MIT License
def load_xml(self, xmldir):
        '''
        for KDD/WWW/UMD only
        :return: doclist
        '''
        for filename in os.listdir(xmldir):
            with open(xmldir+filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.xml')]

                import string
                printable = set(string.printable)

                # print((filename))
                try:
                    lines = textfile.readlines()
                    xml = ''.join([filter(lambda x: x in printable, l) for l in lines])
                    root = ET.fromstring(xml)

                    doc.title = root.findall("title")[0].text
                    doc.abstract = root.findall("abstract")[0].text
                    doc.phrases = [n.text for n in root.findall("*/tag")]

                    self.doclist.append(doc)

                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename ) 
Example 36
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    MIT License
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: '  + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records) , len(pair[0]), str(text)))
        tagged_source.append(text)

    return tagged_source 
Example 37
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    MIT License
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source 
Example 38
Project: jroc   Author: domenicosolazzo   File: StanfordTagger.py    GNU General Public License v3.0
def __init__(self, language="en"):
        from nltk.tag import StanfordNERTagger

        self.__stanfordJar = "%s/dist/stanford-ner.jar" % self.__currentDirectory
        self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (self.__currentDirectory,)
        self.__tagger = StanfordNERTagger( self.__classifier,
                                           self.__stanfordJar,
                                           encoding="utf-8")
        self.__namedEntitiesFinder = NERFinder(language=language) 
Example 39
Project: jroc   Author: domenicosolazzo   File: StanfordTagger.py    GNU General Public License v3.0
def __tags(self, raw_text):
        """
        Return the named entities tokens given a raw text
        :raw_text: Raw text
        """
        from nltk.tokenize import word_tokenize

        if isinstance(raw_text, str):
            # Decode to utf-8
            raw_text = raw_text.decode('utf-8')
        # Tokenize the string
        token_text = word_tokenize(raw_text)
        # Retrieve the named entities from the tokens
        ne_tags = self.__tagger.tag(token_text)
        return(ne_tags) 
Example 40
Project: FancyWord   Author: EastonLee   File: relextract.py    GNU General Public License v3.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
Example 41
Project: SocialNPHS   Author: SocialNPHS   File: tweet.py    MIT License
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged 
Example 42
Project: nltk-on-gae   Author: sivu22   File: relextract.py    Apache License 2.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        if untag:
            return join([tup[0] for tup in lst], sep=sep)
        from nltk.tag import tuple2str
        return join([tuple2str(tup) for tup in lst], sep=sep) 
Example 43
Project: polyglot-quiz   Author: erkanay   File: quiz.py    GNU General Public License v3.0
def map_words(self, _text):
        mapping = defaultdict(list)
        tagged_words = pos_tag(set(self.get_words(_text)))
        for word, tag in tagged_words:
            mapping[tag].append(word)
        return mapping 
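A short usage sketch: the result groups distinct words under their POS tags. Note that pos_tag here runs over a set, so duplicates are collapsed, order is not preserved, and tags for isolated words can differ from in-context tags:

# Hypothetical call, assuming get_words tokenizes the text:
# map_words(self, "the cat saw the dog")
# might produce defaultdict(list, {'DT': ['the'], 'NN': ['cat', 'dog'], 'VBD': ['saw']})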
Example 44
Project: address_extraction   Author: bagrii   File: address_extract.py    MIT License
def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        # Transform the result from [((w1, t1), iob1), ...]
        # to the preferred list of triplets format [(w1, t1, iob1), ...]
        iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

        # Transform the list of triplets to nltk.Tree format
        return conlltags2tree(iob_triplets) 
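The reshaping step matters because the chunk tagger emits ((word, tag), iob) pairs while nltk.chunk.conlltags2tree expects (word, tag, iob) triplets. A minimal sketch of the conversion on hypothetical data:

from nltk.chunk import conlltags2tree

chunks = [(('John', 'NNP'), 'B-PERSON'), (('ran', 'VBD'), 'O')]
iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
print(conlltags2tree(iob_triplets))
# (S (PERSON John/NNP) ran/VBD)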
Example 45
Project: luscan-devel   Author: blackye   File: relextract.py    GNU General Public License v2.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        if untag:
            return join([tup[0] for tup in lst], sep=sep)
        from nltk.tag import tuple2str
        return join([tuple2str(tup) for tup in lst], sep=sep) 
Example 46
Project: nlp-services   Author: singnet   File: entity_recognizer_mod.py    MIT License
def stanford_tagger(self, token_text):
        st = StanfordNERTagger(self.english_model, self.stanford_jar, encoding='utf-8')
        ne_tagged = st.tag(token_text)
        return (ne_tagged)

    # NLTK POS and NER taggers 
Example 47
Project: honours_project   Author: JFriel   File: relextract.py    GNU General Public License v3.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
Example 48
Project: textfool   Author: bogdan-kulynych   File: paraphrase.py    MIT License
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos 
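The first letter of a Penn Treebank tag is enough to recover the WordNet POS here; note that the whitelist deliberately omits adverbs ('r' is a valid WordNet POS but is not included), so those fall through and return None:

# Hypothetical spaCy tokens, by tag_ value:
#   'NNS' -> 'n' (noun)        'VBZ' -> 'v' (verb)
#   'JJR' -> 'a' (adjective)   'RB'  -> None (adverb, not whitelisted)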
Example 49
Project: textfool   Author: bogdan-kulynych   File: paraphrase.py    MIT License
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if  (len(synonym.text.split()) > 2) or \
        (synonym.lemma == token.lemma) or \
        (synonym.tag != token.tag) or \
        (token.text.lower() == 'be'):
        return False
    else:
        return True 
Example 50
Project: ICE   Author: shahryarabaki   File: n_grams.py    Apache License 2.0
def POS_tag_tokenized_phrases(tokenized_phrases, tagger):
    return list(map(tagger.tag, tokenized_phrases)) 
Example 51
Project: NeuronBlocks   Author: microsoft   File: EnglishPOSTagger.py    MIT License
def postag(self, word_list):
        """
        Args:
            word_list:  word list
        Returns:
            pos tag list
        """
        #word_pos_pairs = self.eng_tagger.tag(word_list)
        
        #word_pos_pairs = pos_tag(word_list)
        word_pos_pairs = nltk.tag._pos_tag(word_list, None, self.eng_tagger)
        pos_list = [pos for (word, pos) in word_pos_pairs]
        return pos_list 
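Note that nltk.tag._pos_tag is a private helper whose signature has changed across NLTK releases; with a preloaded PerceptronTagger, the public equivalent is simply tagger.tag. A hedged sketch:

from nltk.tag import PerceptronTagger

tagger = PerceptronTagger()  # assumes the averaged_perceptron_tagger data is installed
word_pos_pairs = tagger.tag(['a', 'quick', 'test'])
pos_list = [pos for (word, pos) in word_pos_pairs]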
Example 52
Project: aop-helpFinder   Author: jecarvaill   File: relextract.py    GNU General Public License v3.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
Example 53
Project: serverless-chatbots-workshop   Author: datteswararao   File: relextract.py    Apache License 2.0
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.
    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst) 
Example 54
Project: Real-Time-Sentiment-Analyzer-of-Twitter-Trends   Author: gauthamkrishna-g   File: stanford_ner_tagger.py    MIT License
def stanford_tagger(token_text):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    ne_tagged = st.tag(token_text)
    return ne_tagged 
Example 55
Project: Real-Time-Sentiment-Analyzer-of-Twitter-Trends   Author: gauthamkrishna-g   File: biotagger.py    MIT License
def stanford_tagger(token_text):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar')
    ne_tagged = st.tag(token_text)
    return ne_tagged 
Example 56
Project: BrillPlusPlus   Author: elaheh-sadredini   File: ap-exp.py    BSD 3-Clause "New" or "Revised" License
def gen_ap_regex():
    print "============================================================"
    print "Generate Regex from learned Brill tagging rules."
    # Parameters:
    training = my_corpus.tagged_sents()
    templates = nltk.tag.brill.fntbl37()
    n_rules = 30

    # Taggers:
    print "Initializing ..."
    regex_tagger = nltk.RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    u_gram_tag=nltk.UnigramTagger(training,backoff=regex_tagger)
    b_gram_tag=nltk.BigramTagger(training,backoff=u_gram_tag)
    t_gram_tag=nltk.TrigramTagger(training,backoff=b_gram_tag)

    print "Training brill tagger ..."
    tt = BrillTaggerTrainer(t_gram_tag, templates, trace=3)
    brill_tagger = tt.train(training, max_rules=n_rules)
    print "Training finished."

    print "Template size:", len(templates)
    range_l, range_r = get_template_range(templates)
    print "Template range:", range_l, range_r
    print "Total rules:", len(brill_tagger.rules())
    print "Generating Regex for the AP ..."

    for rule in brill_tagger.rules():
        regex, report_tag = rule_to_regex(rule, range_l, range_r)
        print report_tag, ":", regex

    print "Done."


# Cross validation 
Example 57
Project: seq2seq-keyphrase   Author: memray   File: keyphrase_test_dataset.py    MIT License
def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path)-10)+1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])

        test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))
        test_data_plain = zip(*(test_set['source'], test_set['target']))

        test_size = len(test_data_plain)

        # Alternatively to setting the CLASSPATH add the jar and model via their path:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in xrange(len(test_data_plain)):  # len(test_data_plain)
            test_s_o, test_t_o = test_data_plain[idx]

            source = keyphrase_utils.cut_zero(test_s_o, idx2word)

            print(source)

            # Add other jars from Stanford directory
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text) 
Example 58
Project: texttk   Author: fmpr   File: texttk.py    GNU General Public License v3.0
def tag_corpus_ner(self, corpus):
		if not hasattr(self, 'stanford_ner'):
			self.stanford_ner = StanfordNERTagger(self.stanford_ner_path+"classifiers/english.all.3class.distsim.crf.ser.gz", 
											self.stanford_ner_path+"stanford-ner.jar")
			self.stanford_ner._stanford_jar = self.stanford_ner_path+"stanford-ner.jar:"+self.stanford_ner_path+"lib/*"
		
		print "splitting sentences in corpus (for NER)..."
		corpus_sentences = []
		sentence_to_doc_map = {}
		sent_no = 0
		for d in xrange(len(corpus)):
			for sent in self.sentence_splitter(corpus[d]):
				corpus_sentences.append(sent)
				sentence_to_doc_map[sent_no] = d
				sent_no += 1
		tokenized_sentences = []
		for sent in corpus_sentences:
			tokenized_sentences.append([t for t in re.split(r'\s+', sent) if len(t) > 0])
		#tokenized_sentences = [re.split(r'\s+', sent) for sent in corpus_sentences]
		
		print "tagging sentences with Stanford NER..."
		tagged_sentences = self.stanford_ner.tag_sents(tokenized_sentences)
		
		# process NER output
		tagged_corpus = []
		current_doc_no = 0
		current_doc = []
		for i in xrange(len(tagged_sentences)):
			doc_no = sentence_to_doc_map[i]
			if doc_no == current_doc_no:
				current_doc += tagged_sentences[i]
			else:
				tagged_corpus.append(current_doc)
				current_doc = []
				current_doc_no = doc_no
		tagged_corpus.append(current_doc)
		
		# get dictionary of named entities per document
		named_entities = []
		for tagged_doc in tagged_corpus:
			tags = {}
			current_ne = []
			for token, tag in tagged_doc:
				if current_ne:
					if tag == "O" or (tag != "O" and tag != current_ne[-1][1]):
						tags[' '.join([t for t,_ in current_ne])] = current_ne[0][1]
						current_ne = []
				if tag != "O":
					current_ne.append((token, tag))
			if current_ne:
				tags[' '.join([t for t,_ in current_ne])] = current_ne[0][1]
			named_entities.append(tags)

		return tagged_corpus, named_entities 
Example 59
Project: lam   Author: aghie   File: preprocess.py    MIT License
def remove_stopwords(docs):
  punctuation = string.punctuation + string.digits+"”"+"“"
  new_docs = OrderedDict({})
  for doc_id, doc in docs.items():
    new_docs[doc_id] = []
    for word,postag in doc:
      if word not in stopwords and word not in punctuation and not is_number(word) and len(word) > 1: #To avoid rare web characters that might be not considered among the stopword lists
        new_docs[doc_id].append((word,postag))
  return new_docs




# """
# Returns a list o docs annotated with NER information
# Each doc_i is a list of tuples (textid,[(word,tag)])
# """
# def NER(docs):
# 
#     print ("NER... (it might take some seconds/minutes)")
#     st = StanfordNERTagger('/home/david.vilares/Descargas/stanford-ner-2012-11-11-nodistsim/conll.closed.iob2.crf.ser.gz',
#                            '/home/david.vilares/Descargas/stanford-ner-2015-12-09/stanford-ner.jar',
#                            encoding='utf-8')
#     
#     new_docs = OrderedDict({})
#     #We append all docs not to be calling ther NER jar for every single document
#     aux_docs = []
#     docs_id = []
#     for doc_id, doc in docs.items():
#         aux_docs.append(doc)
#         docs_id.append(doc_id)
#     ner_docs = st.tag_sents(aux_docs)
#     
#     if len(docs_id) != len(ner_docs): raise ValueError
#     #We can do this zip because we assumed docs is an ordered dict!
#     for doc_id,ner_doc in zip(docs_id,ner_docs):
#         composed_ner = []
#         aux = []
#         for word, ner in ner_doc:
#             if len(word) > 0:
#                 if ner == 'O':
#                     #If we finished computing a multiword NER 
#                     #we needed to put it in the list first
#                     if composed_ner != []:
#                         aux.append('_'.join(composed_ner))
#                         composed_ner = []
#                     aux.append(word)
#                 else:
#                     if ner.startswith('B-') and composed_ner != []:
#                         aux.append('_'.join(composed_ner))
#                         composed_ner = [word]
#                     else:
#                         composed_ner.append(word)
#             new_docs[doc_id] = aux
#     return new_docs 
Example 60
Project: music-genre-classification   Author: sergiooramas   File: genre_classification.py    MIT License
def get_sentiment_count_data(train,test):
	sent_count_train = []
	sent_count_test = []
	v = DictVectorizer(sparse=False)
	for id in test:
		dist = nltk.FreqDist(products[id]['all_pos'].split())
		new_dist = Counter()
		for tag, count in dist.iteritems():
			new_dist[map_tag('en-ptb', 'universal', tag)] += count
		Fscore = 0.5 * ((new_dist['NOUN']+new_dist['ADJ']+new_dist['ADP']+new_dist['DET']) - (dist['UH']+new_dist['VERB']+new_dist['ADV']+new_dist['PRON']) + 100)
		neg_count = 0
		pos_count = 0
		suma = 0
		emotion_words = 0
		for review in products[id]['reviews']:        
			for feature,adjective,score in review['opinions']:
				if score is not None:
					if score < 0:
						neg_count += 1
					else:
						pos_count += 1
					suma += score
					emotion_words += 1
		nwords = len(products[id]['all_text'].split())
		eRatio = emotion_words*1.0/nwords
		posToAllRatio = pos_count*1.0/(pos_count+neg_count)
		emotionFeatures = {'Fscore':Fscore,'eStrength':suma*1.0/emotion_words,'eRatio':eRatio,'posToAllRatio':posToAllRatio}
		sent_count_test.append(emotionFeatures)
	for id in train:
		dist = nltk.FreqDist(products[id]['all_pos'].split())
		new_dist = Counter()
		for tag, count in dist.iteritems():
			new_dist[map_tag('en-ptb', 'universal', tag)] += count
		Fscore = 0.5 * ((new_dist['NOUN']+new_dist['ADJ']+new_dist['ADP']+new_dist['DET']) - (dist['UH']+new_dist['VERB']+new_dist['ADV']+new_dist['PRON']) + 100)
		neg_count = 0
		pos_count = 0
		suma = 0
		emotion_words = 0
		for review in products[id]['reviews']:
			for feature,adjective,score in review['opinions']:
				if score is not None:
					if score < 0:
						neg_count += 1
					else:
						pos_count += 1
					suma += score
					emotion_words += 1
		nwords = len(products[id]['all_text'].split())
		eRatio = emotion_words*1.0/nwords
		posToAllRatio = pos_count*1.0/(pos_count+neg_count)
		emotionFeatures = {'Fscore':Fscore,'eStrength':suma*1.0/emotion_words,'eRatio':eRatio,'posToAllRatio':posToAllRatio}
		sent_count_train.append(emotionFeatures)

	X_sent_train = v.fit_transform(sent_count_train)
	X_sent_test = v.transform(sent_count_test)
	scaler = preprocessing.StandardScaler().fit(X_sent_train)
	X_train = scaler.transform(X_sent_train)
	X_test = scaler.transform(X_sent_test)

	return sent_count_train, sent_count_test, X_train, X_test