from __future__ import print_function, unicode_literals, division

from pyrouge.utils import log
from pyrouge.utils.string_utils import cleanup
from pyrouge.utils.file_utils import DirectoryProcessor


class PunktSentenceSplitter:
    """
    Splits sentences using the NLTK Punkt sentence tokenizer. If installed,
    PunktSentenceSplitter can use the default NLTK data for English, otherwise
    custom trained data has to be provided.

    """

    def __init__(self, language="en", punkt_data_path=None):
        """
        Load the Punkt model used by split().

        language:           Key into the built-in language -> data path
                            table. Only consulted when punkt_data_path is
                            not given.
        punkt_data_path:    Optional path handed to nltk.data.load(); when
                            set, it takes precedence over the language
                            table.

        Failures (NLTK missing, unknown language, unloadable data) are
        logged through the global console logger rather than raised. In
        that case self.sent_detector is None and split() will fail.

        """
        self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
        self.log = log.get_global_console_logger()
        # Always define the attribute so a failed load leaves the instance
        # in a well-defined state.
        self.sent_detector = None

        try:
            import nltk.data
        except ImportError:
            self.log.error(
                "Cannot import NLTK data for the sentence splitter. Please "
                "check if the 'punkt' NLTK-package is installed correctly.")
            # Without NLTK nothing below can work; stop here instead of
            # logging a second, misleading "could not load" error.
            return

        if not punkt_data_path:
            try:
                punkt_data_path = self.lang2datapath[language]
            except KeyError:
                self.log.error(
                    "No sentence splitter data for language {}.".format(
                        language))
                return

        try:
            self.sent_detector = nltk.data.load(punkt_data_path)
        except Exception:
            # Report the path that was actually attempted. Looking up
            # lang2datapath[language] here could itself raise KeyError when
            # a custom path is combined with an unlisted language.
            self.log.error(
                "Could not load sentence splitter data: {}".format(
                    punkt_data_path))

    def split(self, text):
        """
        Splits text and returns a list of the resulting sentences.

        Raises AttributeError if the sentence splitter data could not be
        loaded when the instance was created.

        """
        if self.sent_detector is None:
            raise AttributeError(
                "Sentence splitter data was not loaded; cannot split text.")
        text = cleanup(text)
        return self.sent_detector.tokenize(text.strip())

    @staticmethod
    def split_files(input_dir, output_dir, lang="en", punkt_data_path=None):
        """
        Run the sentence splitter over a directory.

        Builds a PunktSentenceSplitter for lang / punkt_data_path and passes
        its split() method to DirectoryProcessor.process() together with
        input_dir and output_dir.

        """
        ss = PunktSentenceSplitter(lang, punkt_data_path)
        DirectoryProcessor.process(input_dir, output_dir, ss.split)


if __name__ == '__main__':
    # Parentheses make the adjacent literals a single concatenated string.
    text = (
        "Punkt knows that the periods in Mr. Smith and Johann S. Bach do "
        "not mark sentence boundaries. And sometimes sentences can start with "
        "non-capitalized words. i is a good variable name.")
    ss = PunktSentenceSplitter()
    print(ss.split(text))