/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package chapter2; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.text.BreakIterator; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import opennlp.tools.sentdetect.SentenceDetector; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; /** * * @author ashish */ public class StemmingLemaEx { public static void main(String args[]){ String words[] = {"bank", "banking", "banks", "banker", "banked", "bankart"}; PorterStemmer ps = new PorterStemmer(); for(String w : words){ String stem = ps.stem(w); System.out.println("Word : " + w + " Stem : " + stem); } String paragraph = "When determining the end of sentences " + "we need to consider several factors. Sentences may end with " + "exclamation marks! Or possibly questions marks? Within " + "sentences we may find numbers like 3.14159, abbreviations " + "such as found in Mr. Smith, and possibly ellipses either " + "within a sentence …, or at the end of a sentence…"; String simple = "[.?!]"; String[] splitString = (paragraph.split(simple)); for (String string : splitString) { System.out.println(string); } System.out.println("-------------Using Pattern and Matcher-------------"); Pattern sentencePattern = Pattern.compile( "# Match a sentence ending in punctuation or EOS.\n" + "[^.!?\\s] # First char is non-punct, non-ws\n" + "[^.!?]* # Greedily consume up to punctuation.\n" + "(?: # Group for unrolling the loop.\n" + " [.!?] # (special) inner punctuation ok if\n" + " (?!['\"]?\\s|$) # not followed by ws or EOS.\n" + " [^.!?]* # Greedily consume up to punctuation.\n" + ")* # Zero or more (special normal*)\n" + "[.!?]? # Optional ending punctuation.\n" + "['\"]? # Optional closing quote.\n" + "(?=\\s|$)", Pattern.MULTILINE | Pattern.COMMENTS); Matcher matcher = sentencePattern.matcher(paragraph); while (matcher.find()) { System.out.println(matcher.group()); } System.out.println("-------------Using BreakIterator-------------"); BreakIterator si = BreakIterator.getSentenceInstance(); Locale cl = new Locale("en", "US"); si.setText(paragraph); int boundary = si.first(); while(boundary!=BreakIterator.DONE){ int begin = boundary; System.out.println(boundary + " - "); boundary = si.next(); int end = boundary; if(end == BreakIterator.DONE){ break; } System.out.println(boundary + " [ " + paragraph.substring(begin,end) + " ] "); } System.out.println("-------------Using SentenceDetectorME-------------"); try{ InputStream is = new FileInputStream(new File("/home/ashish/Downloads/" + "en-sent.bin")); SentenceModel sm = new SentenceModel(is); SentenceDetectorME detector = new SentenceDetectorME(sm); String sentences [] = detector.sentDetect(paragraph); for(String s : sentences){ System.out.println(s); } } catch(IOException e){ System.out.println("Error Detected" + e); e.printStackTrace(); } } }