from __future__ import print_function, unicode_literals, division

from pyrouge.utils import log
from pyrouge.utils.string_utils import cleanup
from pyrouge.utils.file_utils import DirectoryProcessor


class PunktSentenceSplitter:
    """
    Splits sentences using the NLTK Punkt sentence tokenizer. If installed,
    PunktSentenceSplitter can use the default NLTK data for English, otherwise
    custom trained data has to be provided.

    """

    def __init__(self, language="en", punkt_data_path=None):
        self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
        self.log = log.get_global_console_logger()
        try:
            import nltk.data
        except ImportError:
            self.log.error(
                "Cannot import NLTK data for the sentence splitter. Please "
                "check if the 'punkt' NLTK-package is installed correctly.")
        try:
            if not punkt_data_path:
                punkt_data_path = self.lang2datapath[language]
            self.sent_detector = nltk.data.load(punkt_data_path)
        except KeyError:
            self.log.error(
                "No sentence splitter data for language {}.".format(language))
        except:
            self.log.error(
                "Could not load sentence splitter data: {}".format(
                    self.lang2datapath[language]))

    def split(self, text):
        """Splits text and returns a list of the resulting sentences."""
        text = cleanup(text)
        return self.sent_detector.tokenize(text.strip())

    @staticmethod
    def split_files(input_dir, output_dir, lang="en", punkt_data_path=None):
        ss = PunktSentenceSplitter(lang, punkt_data_path)
        DirectoryProcessor.process(input_dir, output_dir, ss.split)

if __name__ == '__main__':
    text = "Punkt knows that the periods in Mr. Smith and Johann S. Bach do "
    "not mark sentence boundaries.  And sometimes sentences can start with "
    "non-capitalized words. i is a good variable name."
    ss = PunktSentenceSplitter()
    print(ss.split(text))