// Java source code of Chapter4

package packt;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunker;
import com.aliasi.chunk.Chunking;
import com.aliasi.dict.DictionaryEntry;
import com.aliasi.dict.ExactDictionaryChunker;
import com.aliasi.dict.MapDictionary;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.util.AbstractExternalizable;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderEvaluator;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.eval.FMeasure;

public class Chapter4 {

    private static final String sentences[] = {"Joe was the last person to see Fred. ",
        "He saw him in Boston at McKenzie's pub at 3:00 where he paid "
        + "$2.45 for an ale. ",
        "Joe wanted to go to Vermont for the day to visit a cousin who "
        + "works at IBM, but Sally and he had to look for Fred"};

    private static String regularExpressionText
            = "He left his email address ([email protected]) and his "
            + "phone number,800-555-1234. We believe his current address "
            + "is 100 Washington Place, Seattle, CO 12345-1234. I "
            + "understand you can also call at 123-555-1234 between "
            + "8:00 AM and 4:30 most days. His URL is http://example.com "
            + "and he was born on February 25, 1954 or 2/25/1954.";

    private static MapDictionary<String> dictionary;

    /**
     * Entry point for the Chapter 4 examples. Only the regular-expression
     * example is executed; the remaining examples are commented out and can
     * be enabled by uncommenting the corresponding call.
     *
     * @param args command-line arguments (not read by this method)
     */
    public static void main(String[] args) {
        usingRegularExpressions();
//        usingOpenNLP();
//        usingStanfordNER();
//        usingLingPipeNER();
//        trainingOpenNLPNERModel();
    }

    /**
     * Returns the directory that holds the pre-trained model files used by
     * the examples.
     *
     * <p>The location can be overridden without recompiling by setting the
     * {@code nlp.models.dir} system property (for example
     * {@code -Dnlp.models.dir=/opt/models}). When the property is absent or
     * blank, the original hard-coded location is returned.
     *
     * @return the model directory; never {@code null}. Whether the directory
     *         actually exists is not checked here.
     */
    public static File getModelDir() {
        String configured = System.getProperty("nlp.models.dir");
        if (configured != null && !configured.trim().isEmpty()) {
            return new File(configured.trim());
        }
        return new File("C:\\Current Books in Progress\\NLP and Java\\Models");
    }

    /**
     * Runs the regular-expression examples. Only the plain
     * {@code java.util.regex} example is executed; the two LingPipe variants
     * are commented out and can be enabled by uncommenting their calls.
     */
    private static void usingRegularExpressions() {
        usingJavaRegularExpressions();
//        usingLingPipeRegExChunker();
//        usingLingPipeRegularExpressions();
    }

    /**
     * Searches {@code regularExpressionText} with plain
     * {@code java.util.regex} and prints every match followed by its
     * {@code [start:end]} character offsets.
     *
     * <p>The active pattern is the alternation of the phone-number, time and
     * e-mail expressions. The URL, ZIP-code and date expressions are declared
     * only as ready-made alternatives for experimentation; they are not part
     * of the compiled pattern.
     */
    private static void usingJavaRegularExpressions() {
        // Expressions that make up the active pattern.
        String phoneNumberRE = "\\d{3}-\\d{3}-\\d{4}";
        String timeRE = "([01]?[0-9]|2[0-3]):[0-5][0-9]";
        String emailRegEx = "[a-zA-Z0-9'._%+-]+@"
                + "(?:[a-zA-Z0-9-]+\\.)"
                + "+[a-zA-Z]{2,4}";

        // Alternatives that are not used below; add them to the alternation
        // passed to Pattern.compile to try them out.
        String urlRegex = "\\b(https?|ftp|file|ldap)://"
                + "[-A-Za-z0-9+&@#/%?=~_|!:,.;]"
                + "*[-A-Za-z0-9+&@#/%=~_|]";
        String zipCodeRegEx = "[0-9]{5}(\\-?[0-9]{4})?";
        // Month/day/year separated by '-' or '/'. An earlier day/month/year
        // variant was removed: it was overwritten before use and escaped \d
        // twice, so it could only ever match a literal backslash.
        String dateRE = "((0?[13578]|10|12)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[01]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1}))|(0?[2469]|11)(-|\\/)(([1-9])|(0[1-9])|([12])([0-9]?)|(3[0]?))(-|\\/)((19)([2-9])(\\d{1})|(20)([01])(\\d{1})|([8901])(\\d{1})))";

        Pattern pattern = Pattern.compile(phoneNumberRE + "|" + timeRE + "|" + emailRegEx);
//        regularExpressionText = "(888)555-1234 888-SEL-HIGH 888-555-1234-J88-W3S";
        Matcher matcher = pattern.matcher(regularExpressionText);
        System.out.println("---Searching ...");
        while (matcher.find()) {
            System.out.println(matcher.group() + " [" + matcher.start()
                    + ":" + matcher.end() + "]");
        }
        System.out.println("---Done Searching ...");
    }

    /**
     * Chunks {@code regularExpressionText} with a {@code TimeRegexChunker}
     * and prints each chunk through {@code displayChunkSet}.
     *
     * <p>{@code TimeRegexChunker} is declared outside this file section;
     * presumably it is a regex chunker preconfigured for times - confirm
     * against its definition.
     */
    private static void usingLingPipeRegExChunker() {
        // Only referenced by the commented-out alternative below.
        String timeRE = "(([0-1]?[0-9])|([2][0-3])):([0-5]?[0-9])(:([0-5]?[0-9]))?";
        Chunker chunker = new TimeRegexChunker();
//        chunker = new RegExChunker(timeRE,"time",1.0);
        // displayChunkSet performs the chunking itself, so the text is not
        // chunked a second time here.
        displayChunkSet(chunker, regularExpressionText);
    }

    /**
     * Loads the serialized LingPipe chunker
     * {@code ne-en-news-muc6.AbstractCharLmRescoringChunker} from the model
     * directory and applies it to every sample sentence: first the raw
     * {@code Chunking} of each sentence is printed, then each sentence's
     * chunks are listed through {@code displayChunkSet}.
     *
     * <p>If the model cannot be read or deserialized, the stack trace is
     * printed and the method returns.
     */
    private static void usingLingPipeRegularExpressions() {
        try {
            File modelFile = new File(getModelDir(),
                    "ne-en-news-muc6.AbstractCharLmRescoringChunker");
            Chunker chunker = (Chunker) AbstractExternalizable.readObject(modelFile);
            for (String sentence : sentences) {
                System.out.println("Chunking=" + chunker.chunk(sentence));
            }
            for (String sentence : sentences) {
                displayChunkSet(chunker, sentence);
            }
        } catch (IOException | ClassNotFoundException ex) {
            // Report the failure rather than silently producing no output,
            // matching the other examples in this class.
            ex.printStackTrace();
        }
    }

    // ------ OpenNLP-----------------------------------
    /**
     * Runs the OpenNLP examples: prints a banner and then the
     * {@code NameFinderME} example. The multiple-model example is commented
     * out and can be enabled by uncommenting its call.
     */
    private static void usingOpenNLP() {
        System.out.println("OpenNLP Examples");
        usingOpenNLPNameFinderME();
//        usingMultipleNERModels();
    }

    /**
     * Demonstrates OpenNLP's {@code NameFinderME} with the
     * {@code en-ner-person.bin} model.
     *
     * <p>The tokenizer model ({@code en-token.bin}) and the entity model are
     * read from the model directory. The name finder is first applied to a
     * single hard-coded sentence, printing each span and its entity text;
     * it is then applied to every sample sentence, additionally printing the
     * probability reported for each span.
     *
     * <p>The entity text covers every token of the span, so multi-token
     * names are printed in full rather than only their first token.
     *
     * <p>Any exception is reported by printing its stack trace.
     */
    private static void usingOpenNLPNameFinderME() {
        System.out.println("OpenNLP NameFinderME Examples");
        try (InputStream tokenStream = new FileInputStream(
                new File(getModelDir(), "en-token.bin"));
                InputStream modelStream = new FileInputStream(
                        new File(getModelDir(), "en-ner-person.bin"))) {

            TokenizerModel tokenModel = new TokenizerModel(tokenStream);
            Tokenizer tokenizer = new TokenizerME(tokenModel);

            TokenNameFinderModel entityModel
                    = new TokenNameFinderModel(modelStream);
            NameFinderME nameFinder = new NameFinderME(entityModel);

            // Single sentence (scoped so the loop below can reuse the names)
            {
                System.out.println("Single sentence");
                String sentence = "He was the last person to see Fred.";

                String[] tokens = tokenizer.tokenize(sentence);
                Span[] nameSpans = nameFinder.find(tokens);
                String[] entities = Span.spansToStrings(nameSpans, tokens);

                for (int i = 0; i < nameSpans.length; i++) {
                    System.out.println("Span: " + nameSpans[i].toString());
                    System.out.println("Entity: " + entities[i]);
                }
            }
            System.out.println();
            for (String sentence : sentences) {
                String[] tokens = tokenizer.tokenize(sentence);
                Span[] nameSpans = nameFinder.find(tokens);
                // Probabilities are requested right after find(), as in the
                // original ordering, before any other name-finder call.
                double[] spanProbs = nameFinder.probs(nameSpans);
                String[] entities = Span.spansToStrings(nameSpans, tokens);

                for (int i = 0; i < nameSpans.length; i++) {
                    System.out.println("Span: " + nameSpans[i].toString());
                    System.out.println("Entity: " + entities[i]);
                    System.out.println("Probability: " + spanProbs[i]);
                }
                System.out.println();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Applies several OpenNLP entity models (person, location, organization)
     * to every sample sentence and prints all entities found.
     *
     * <p>For each model, every sentence is tokenized and searched; each hit
     * is recorded as the sentence index, the span and the full text of the
     * span. The collected results are printed after all models have run.
     * Model files are closed as soon as they have been loaded.
     *
     * <p>Any exception is reported by printing its stack trace, in which
     * case no results are printed.
     */
    private static void usingMultipleNERModels() {
        // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin 
        // en-ner-organization.bin en-ner-time.bin
        String[] modelNames = {"en-ner-person.bin", "en-ner-location.bin",
            "en-ner-organization.bin"};
        List<String> list = new ArrayList<>();

        try (InputStream tokenStream = new FileInputStream(
                new File(getModelDir(), "en-token.bin"))) {

            TokenizerModel tokenModel = new TokenizerModel(tokenStream);
            Tokenizer tokenizer = new TokenizerME(tokenModel);

            for (String name : modelNames) {
                NameFinderME nameFinder;
                // The model is fully loaded by the constructor, so the file
                // can be closed before the finder is used.
                try (InputStream modelStream = new FileInputStream(
                        new File(getModelDir(), name))) {
                    nameFinder = new NameFinderME(
                            new TokenNameFinderModel(modelStream));
                }
                for (int index = 0; index < sentences.length; index++) {
                    String[] tokens = tokenizer.tokenize(sentences[index]);
                    Span[] nameSpans = nameFinder.find(tokens);
                    String[] entities = Span.spansToStrings(nameSpans, tokens);
                    for (int i = 0; i < nameSpans.length; i++) {
                        list.add("Sentence: " + index
                                + " Span: " + nameSpans[i].toString()
                                + " Entity: " + entities[i]);
                    }
                }
            }
            System.out.println("Multiple Entities");
            for (String element : list) {
                System.out.println(element);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Demonstrates the Stanford NER {@code CRFClassifier} with the
     * {@code english.conll.4class.distsim.crf.ser.gz} model from the model
     * directory.
     *
     * <p>All sample sentences are concatenated into one text, the text is
     * classified, and every token whose category is not {@code "O"} is
     * printed as {@code word:category}. ({@code "O"} is presumably the
     * classifier's label for tokens that are not part of an entity.)
     */
    private static void usingStanfordNER() {
        // Build the path through File so the platform's separator is used,
        // as the other examples in this class do.
        String model = new File(getModelDir(),
                "english.conll.4class.distsim.crf.ser.gz").getPath();
        CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(model);

        StringBuilder text = new StringBuilder();
        for (String element : sentences) {
            text.append(element);
        }

        List<List<CoreLabel>> entityList = classifier.classify(text.toString());

        for (List<CoreLabel> internalList : entityList) {
            for (CoreLabel coreLabel : internalList) {
                String word = coreLabel.word();
                String category = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);
//                System.out.println(word + ":" + category);
                if (!"O".equals(category)) {
                    System.out.println(word + ":" + category);
                }
            }
        }
    }

    /**
     * Runs the LingPipe named-entity examples. Only the exact-dictionary
     * chunker example is executed; the model-based chunker example is
     * commented out and can be enabled by uncommenting its call.
     */
    private static void usingLingPipeNER() {
//        usingLingPipeRexExChunker();
        usingExactDictionaryChunker();

    }

    /**
     * Deserializes the LingPipe chunker stored in
     * {@code ne-en-news-muc6.AbstractCharLmRescoringChunker} under the model
     * directory and lists the chunks it finds in each sample sentence.
     * A failure to read or deserialize the model is reported by printing
     * the stack trace.
     */
    private static void usingLingPipeRexExChunker() {
        final String modelName = "ne-en-news-muc6.AbstractCharLmRescoringChunker";
        try {
            Chunker neChunker = (Chunker) AbstractExternalizable.readObject(
                    new File(getModelDir(), modelName));

            for (int i = 0; i < sentences.length; i++) {
                displayChunkSet(neChunker, sentences[i]);
            }
        } catch (IOException | ClassNotFoundException failure) {
            failure.printStackTrace();
        }
    }

    /**
     * Chunks {@code text} with {@code chunker} and prints one line per chunk
     * giving its type, the covered substring of {@code text} and its score.
     *
     * @param chunker the chunker to apply
     * @param text    the text to chunk; also the source of the printed
     *                entity substrings
     */
    private static void displayChunkSet(Chunker chunker, String text) {
        for (Chunk found : chunker.chunk(text).chunkSet()) {
            String type = found.type();
            String entity = text.substring(found.start(), found.end());
            double score = found.score();
            System.out.println(
                    "Type: " + type + " Entity: [" + entity + "] Score: " + score);
        }
    }

    /**
     * Replaces {@code dictionary} with a fresh {@code MapDictionary} and
     * fills it with the sample phrases, each tagged with a category and a
     * score of 1.0.
     */
    private static void initializeDictionary() {
        // {phrase, category} pairs, added in this order.
        final String[][] phraseCategoryPairs = {
            {"Joe", "PERSON"},
            {"Fred", "PERSON"},
            {"Boston", "PLACE"},
            {"pub", "PLACE"},
            {"Vermont", "PLACE"},
            {"IBM", "ORGANIZATION"},
            {"Sally", "PERSON"}
        };

        dictionary = new MapDictionary<String>();
        for (String[] pair : phraseCategoryPairs) {
            dictionary.addEntry(
                    new DictionaryEntry<String>(pair[0], pair[1], 1.0));
        }
    }

    /**
     * Demonstrates LingPipe's {@code ExactDictionaryChunker}: (re)builds the
     * sample dictionary, prints it, and then prints the dictionary matches
     * found in each sample sentence.
     */
    private static void usingExactDictionaryChunker() {
        initializeDictionary();
        System.out.println("\nDICTIONARY\n" + dictionary);

        // Flag names follow the ExactDictionaryChunker constructor.
        final boolean returnAllMatches = true;
        final boolean caseSensitive = false;
        ExactDictionaryChunker exactChunker = new ExactDictionaryChunker(
                dictionary,
                IndoEuropeanTokenizerFactory.INSTANCE,
                returnAllMatches,
                caseSensitive);

        for (int i = 0; i < sentences.length; i++) {
            String current = sentences[i];
            System.out.println("\nTEXT=" + current);
            displayChunkSet(exactChunker, current);
        }
    }

    // Training Models
    /**
     * Trains an OpenNLP person-name model from the UTF-8 training file
     * {@code en-ner-person.train} (100 iterations, cutoff 5) and serializes
     * it to the file {@code modelFile}. Both paths are relative to the
     * working directory.
     *
     * <p>The training file is closed when training finishes, and the output
     * file is only opened once training has succeeded, so a missing training
     * file or a failed training run does not leave an empty model file
     * behind. An {@code IOException} is reported by printing its stack
     * trace.
     */
    private static void trainingOpenNLPNERModel() {
        try (InputStream trainingStream
                = new FileInputStream("en-ner-person.train")) {
            ObjectStream<String> lineStream
                    = new PlainTextByLineStream(trainingStream, "UTF-8");
            ObjectStream<NameSample> sampleStream
                    = new NameSampleDataStream(lineStream);

            TokenNameFinderModel model = NameFinderME.train("en", "person", sampleStream,
                    null, 100, 5);

            try (OutputStream modelOutputStream = new BufferedOutputStream(
                    new FileOutputStream(new File("modelFile")))) {
                model.serialize(modelOutputStream);
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
}