# Python source code of stanford-pos-tagger

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
#
# Copyright (C) 2001-2016 NLTK Project
# Author: Nitin Madnani <nmadnani@ets.org>
#         Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A module for interfacing with the Stanford taggers.

Tagger models need to be downloaded from http://nlp.stanford.edu/software
and the STANFORD_MODELS environment variable set (a colon-separated
list of paths).

For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
"""

import os
import tempfile
from subprocess import PIPE
import warnings

from nltk.internals import find_file, find_jar, config_java, java, _java_options, find_jars_within_path
from nltk.tag.api import TaggerI
from nltk import compat

_stanford_url = 'http://nlp.stanford.edu/software'


class StanfordTagger(TaggerI):
    """
    Base interface to the Stanford taggers.

    Concrete subclasses are expected to provide:

    - ``_cmd``: a property returning the command to be executed.
    - ``_SEPARATOR``: class constant holding the character that separates
      a token from its tag in the tagger output.
    - ``_JAR``: class constant holding the name of the jar file.

    :param model_filename: name of the model file, resolved with
        ``find_file`` using the ``STANFORD_MODELS`` environment variable.
    :param path_to_jar: optional explicit jar location handed to ``find_jar``.
    :param encoding: encoding stored on the instance as ``_encoding``.
    :param verbose: forwarded to ``find_jar`` and ``find_file``.
    :param java_options: Java options string stored on the instance.
    """

    _SEPARATOR = ''
    _JAR = ''

    def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'):
        # An empty ``_JAR`` means the base class itself is being instantiated.
        if not self._JAR:
            warnings.warn('The StanfordTagger class is not meant to be '
                          'instantiated directly. Did you mean StanfordPOSTagger or StanfordNERTagger?')

        self._stanford_jar = find_jar(self._JAR, path_to_jar, searchpath=(),
                                      url=_stanford_url, verbose=verbose)
        self._stanford_model = find_file(model_filename, env_vars=('STANFORD_MODELS',),
                                         verbose=verbose)

        # Replace the single jar path with every jar found in its directory,
        # so the logging jars end up on the classpath as well.
        jar_dir = os.path.dirname(self._stanford_jar)
        self._stanford_jar = tuple(find_jars_within_path(jar_dir))

        self._encoding = encoding
        self.java_options = java_options

    @property
    def _cmd(self):
        # Subclasses must override this with the actual command line.
        raise NotImplementedError


def tag(self, tokens):
    """
    Tag a single tokenized sentence.

    The tokens are wrapped in a one-element batch and handed to
    ``self.tag_sents``; the per-sentence lists that come back are
    concatenated, so the caller gets one flat list of tagged items
    rather than a list of lists.
    """
    tagged_batch = self.tag_sents([tokens])
    return [pair for tagged_sentence in tagged_batch for pair in tagged_sentence]


def tag_sents(self, sentences):
    """
    Tag a batch of tokenized sentences by running the Java tagger.

    The sentences are written to a temporary file (tokens joined by a
    space, sentences joined by a newline), the command from ``self._cmd``
    is run through ``java``, and its standard output is decoded and
    passed to ``self.parse_output``.

    The temporary file is removed and the Java options that were active
    before the call are restored even when building the command, writing
    the input or running the tagger raises.

    :param sentences: iterable of sentences, each an iterable of string tokens.
    :return: whatever ``self.parse_output`` returns for the tagger output.
    """
    encoding = self._encoding
    # Remember the current Java options so they can be restored afterwards.
    default_options = ' '.join(_java_options)
    config_java(options=self.java_options, verbose=False)

    try:
        # Create a temporary input file. The path is kept on the instance;
        # presumably ``_cmd`` implementations read it (not visible here).
        input_fd, self._input_file_path = tempfile.mkstemp(text=True)
        try:
            # Write the actual sentences to the temporary input file; the
            # ``with`` block closes the handle even if the write fails.
            with os.fdopen(input_fd, 'wb') as input_fh:
                payload = '\n'.join(' '.join(x) for x in sentences)
                if isinstance(payload, compat.text_type) and encoding:
                    payload = payload.encode(encoding)
                input_fh.write(payload)

            cmd = list(self._cmd)
            cmd.extend(['-encoding', encoding])

            # Run the tagger and get the output
            stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
                                           stdout=PIPE, stderr=PIPE)
            stanpos_output = stanpos_output.decode(encoding)
        finally:
            # Delete the temporary file, whether or not the run succeeded.
            os.unlink(self._input_file_path)
    finally:
        # Return java configurations to their default values
        config_java(options=default_options, verbose=False)

    return self.parse_output(stanpos_output, sentences)


def parse_output(self, text, sentences=None):
    """
    Convert raw tagger output into lists of ``(word, tag)`` tuples.

    Each line of ``text`` is one tagged sentence and each
    whitespace-separated item on it is a word joined to its tag by
    ``self._SEPARATOR``. Only the last separator in an item splits word
    from tag, so separator characters inside the word are preserved
    (``1/2/O`` with separator ``/`` gives ``('1/2', 'O')``). An item with
    no separator gives an empty word and the whole item as the tag.

    :param text: decoded tagger output.
    :param sentences: unused by this implementation; accepted so that the
        caller can always pass the original input along.
    :return: a list with one list of ``(word, tag)`` tuples per output line.
    :raises ValueError: if ``self._SEPARATOR`` is the empty string.
    """
    separator = self._SEPARATOR
    tagged_sentences = []
    for tagged_sentence in text.strip().split("\n"):
        sentence = []
        # split() with no argument already discards surrounding whitespace.
        for tagged_word in tagged_sentence.split():
            word, _, tag = tagged_word.rpartition(separator)
            sentence.append((word, tag))
        tagged_sentences.append(sentence)
    return tagged_sentences

class StanfordNERTagger(StanfordTagger):
    """
    A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:

    - a model trained on training data
    - (optionally) the path to the stanford tagger jar file. If not specified here,
      then this jar file must be specified in the CLASSPATH environment variable.
    - (optionally) the encoding of the training data (default: UTF-8)

    Example:

        >>> from nltk.tag import StanfordNERTagger
        >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
        >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
        [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
         ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
         ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
    """

    # Character separating a token from its tag in the tagger output.
    _SEPARATOR = '/'
    # Name of the jar file looked up by StanfordTagger.__init__.
    _JAR = 'stanford-ner.jar'
    # NOTE(review): presumably the output format requested from the tagger;
    # its use is not visible in this part of the file -- confirm.
    _FORMAT = 'slashTags'

    def __init__(self, *args, **kwargs):
        # Forward every argument unchanged to StanfordTagger.__init__.
        super(StanfordNERTagger, self).__init__(*args, **kwargs)