java source code of DocSplitterOpenNLP15Impl

/*
 * Copyright 2008 Novamente LLC
 * Copyright 2010, 2013 Linas Vepstas <[email protected]>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package relex.corpus;

import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.util.Span;
import relex.utils.ResourceUtils;

import java.io.InputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;

/**
 * DocSplitterOpenNLP15Impl -- split document into sentences.
 *
 * This is a convenience wrapper around the opennlp-1.5.x toolkit.
 */
// @SuppressWarnings({"CallToPrintStackTrace", "UseOfSystemOutOrSystemErr"})
public class DocSplitterOpenNLP15Impl implements DocSplitter
{
	private static final int DEBUG = 0;
	private static final String DEFAULT_ENGLISH_DIR =
	        "data/opennlp/models-1.5";
	private static final String DEFAULT_ENGLISH_FILENAME =
	        "en-sent.bin";
	private static final String DEFAULT_ENGLISH_FILEPATH =
			DEFAULT_ENGLISH_DIR + "/" + DEFAULT_ENGLISH_FILENAME;

	private static HashSet<String> unacceptableSentenceEnds;

	static
	{
		// The NLP toolkit seems to work fine with Dr., Mrs. etc.
		// but chokes on Ms.
		// Unfortunately, MS can mean "Multiple Sclerosis", and
		// sentences can end with MS.
		unacceptableSentenceEnds = new HashSet<String>();
		unacceptableSentenceEnds.add("Ms.");
		// unacceptableSentenceEnds.add("MS.");
		unacceptableSentenceEnds.add("Mr.");
	}

	private static SentenceDetectorME detector;

	// Returned values
	private ArrayList<TextInterval> lst;
	private ArrayList<String> snl;

	// Buffered text, for FIFO mode.
	private String buffer;

	// parameters
	private String englishModelFilename;

	/* --------------------------------------------------------------- */
	public DocSplitterOpenNLP15Impl()
	{
		buffer = null;
		initialize();
	}

	public boolean operational()
	{
		if (detector == null) return false;
		return true;
	}

	private void initialize()
	{
		if (detector == null)
		{
			englishModelFilename = System.getProperty("EnglishModelFilename");
			if (englishModelFilename == null || englishModelFilename.isEmpty())
				englishModelFilename = DEFAULT_ENGLISH_FILEPATH;

			try
			{
				InputStream modelIn = ResourceUtils.getResource(
						"EnglishModelFilename",
						DEFAULT_ENGLISH_FILENAME,
						DEFAULT_ENGLISH_DIR
				);
				SentenceModel model = new SentenceModel(modelIn);
				modelIn.close();
				detector = new SentenceDetectorME(model);
			}
			catch (IOException e)
			{
				System.err.println(e.getMessage());
			}
		}
	}

	/* --------------------------------------------------------------- */

	public void setEnglishModelFilename(String emf)
	{
		englishModelFilename = emf;
	}

	public String getEnglishModelFilename()
	{
		return englishModelFilename;
	}

	/* --------------------------------------------------------------- */
	/**
	 * Returns false if break is unacceptable. Used to prevent overzelous
	 * sentence detectors which have recognizable idiosyncracies
	 */
	private boolean acceptableBreak(String s, int start, int end)
	{
		// If the string ends with "Ms." preceeded by whitespace
		for (String endString : unacceptableSentenceEnds)
		{
			int len = endString.length();
			if (end >= start + len &&
			    s.substring(end - len, end).equals(endString) &&
			    (end == start + len || Character.isWhitespace(s.charAt(end - len - 1))))
			{
				return false;
			}
		}

		// OpenNLP-1.5 will fail on sentences split between multiple
		// lines. Therefore, make sure we actually have punctuation
		// at the end! Otherwise, its probably only half-a-sentence.
		char e = s.charAt(end-1);
		if(e == '.') return true;
		if(e == '?') return true;
		if(e == '!') return true;
		return false;
	}

	/* --------------------------------------------------------------- */
	/**
	 * Add more text to the buffer.
	 * This allows this class to be used in FIFO mode: text is added with
	 * this call, and sentences are extracted with the getNextSentence() call.
	 */
	public void addText(String newText)
	{
		if (null == buffer) buffer = "";
		buffer += newText;
	}

	/**
	 * Clear out the sentence buffer.
	 */
	public void clearBuffer()
	{
		buffer = null;
	}

	/**
	 * Get the next sentence out of the buffered text.
	 * Return null if there are no complete sentences in the buffer.
	 */
	public String getNextSentence()
	{
		// punt if no sentence detector
		if (detector == null)
		{
			String rc = buffer;
			buffer = null;
			return rc;
		}

		Span spans[] = detector.sentPosDetect(buffer);
		if (0 == spans.length) return null;

		start = 0;
		for (Span span : spans)
		{
			end = span.getEnd();
			if (foundSentence(buffer)) break;
		}
		if (!foundSentence(buffer)) return null;

		buffer = buffer.substring(trimmedEnd);
		return trimmedSentence;
	}

	public String getRemainder()
	{
		String s = buffer;
		buffer = null;
		return s;
	}

	/* --------------------------------------------------------------- */
	private int start;
	private int end;
	private int trimmedStart;
	private int trimmedEnd;
	private String trimmedSentence;

	private Boolean foundSentence(String docText)
	{
		trimmedSentence = docText.substring(start, end).trim();
		if (trimmedSentence == null || trimmedSentence.length() == 0)
			return false;

		trimmedStart = docText.indexOf(trimmedSentence.charAt(0), start);
		trimmedEnd = trimmedStart + trimmedSentence.length();
		return acceptableBreak(docText, trimmedStart, trimmedEnd);
	}

	/* --------------------------------------------------------------- */
	/**
	 * Split a document text string into sentences.
	 * Returns a list of sentence start and end-points.
	 */
	public ArrayList<TextInterval> process(String docText)
	{
		_process(docText);
		return lst;
	}

	/**
	 * Split a document text string into sentences.
	 * Returns a list of sentence strings.
	 */
	public ArrayList<String> split(String docText)
	{
		_process(docText);
		return snl;
	}

	private void _process(String docText)
	{
		lst = new ArrayList<TextInterval>();
		snl = new ArrayList<String>();
		if (docText == null) return;

		Span spans[] = detector.sentPosDetect(buffer);

		start = 0;
		end = 0;
		for (Span span : spans)
		{
			int sentenceEnd = span.getEnd();
			int prevstart = start;
			start = end; // from previous loop iteration
			end = sentenceEnd;
			if (!foundSentence(docText))
			{
				// go back to previous start
				start = prevstart;
				end = prevstart;
				continue;
			}

			if (DEBUG > 0) System.out.println(start + "," + end + ": " + trimmedSentence);
			lst.add(new TextInterval(trimmedStart, trimmedEnd));
			snl.add(trimmedSentence);
		}
	}
}