package com.bericotech.clavin.nerd; import com.bericotech.clavin.ClavinException; import com.bericotech.clavin.GeoParser; import com.bericotech.clavin.GeoParserFactory; import com.bericotech.clavin.extractor.LocationOccurrence; import com.bericotech.clavin.gazetteer.query.LuceneGazetteer; import com.bericotech.clavin.resolver.ClavinLocationResolver; import com.bericotech.clavin.resolver.ResolvedLocation; import com.bericotech.clavin.util.TextUtils; import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.ie.crf.CRFClassifier; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Triple; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.List; import java.util.Properties; import static com.bericotech.clavin.nerd.StanfordExtractor.convertNERtoCLAVIN; /*##################################################################### * * CLAVIN-NERD * ----------- * * Copyright (C) 2012-2013 Berico Technologies * http://clavin.bericotechnologies.com * * ==================================================================== * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. * * ==================================================================== * * WorkflowDemoNERD.java * *###################################################################*/ /** * Quick example showing how to use CLAVIN's capabilities. * */ public class WorkflowDemoNERD { /** * Run this after installing and configuring CLAVIN to get a sense of * how to use it in a few different ways. * * @param args not used * @throws Exception Error by contract */ public static void main(String[] args) throws Exception { getparseArticle(); geoparseUppercaseArticle(); resolveStanfordEntities(); // And we're done... System.out.println("\n\"That's all folks!\""); } /** * Standard usage of CLAVIN. Instantiate a default GeoParser, give * it some text, check out the locations it extracts and resolves. * * @throws Exception */ private static void getparseArticle() throws Exception { // Instantiate a CLAVIN GeoParser using the StanfordExtractor GeoParser parser = GeoParserFactory.getDefault("./IndexDirectory", new StanfordExtractor(), 1, 1, false); // Unstructured text file about Somalia to be geoparsed File inputFile = new File("src/test/resources/sample-docs/Somalia-doc.txt"); // Grab the contents of the text file as a String String inputString = TextUtils.fileToString(inputFile); // Parse location names in the text into geographic entities List<ResolvedLocation> resolvedLocations = parser.parse(inputString); // Display the ResolvedLocations found for the location names for (ResolvedLocation resolvedLocation : resolvedLocations) System.out.println(resolvedLocation); } /** * Demonstrates usage of CLAVIN with non-default NER model -- in * this instance, a case-insensitive model to help us perform * geoparsing on a text document IN ALL CAPS. * * @throws Exception */ private static void geoparseUppercaseArticle() throws Exception { // Instantiate a CLAVIN GeoParser using the StanfordExtractor with "caseless" models GeoParser parser = GeoParserFactory.getDefault("./IndexDirectory", new StanfordExtractor("english.all.3class.caseless.distsim.crf.ser.gz", "english.all.3class.caseless.distsim.prop"), 1, 1, false); // Unstructured uppercase text file about Somalia to be geoparsed File inputFile = new File("src/test/resources/sample-docs/Somalia-doc-uppercase.txt"); // Grab the contents of the text file as a String String inputString = TextUtils.fileToString(inputFile); // Parse location names in the text into geographic entities List<ResolvedLocation> resolvedLocations = parser.parse(inputString); // Display the ResolvedLocations found for the location names for (ResolvedLocation resolvedLocation : resolvedLocations) System.out.println(resolvedLocation); } /** * Sometimes, you might already be using Stanford NER elsewhere in * your application, and you'd like to just pass the output from * Stanford NER directly into CLAVIN, without having to re-run the * input through Stanford NER just to use CLAVIN. This example * shows you how to very easily do exactly that. * * @throws IOException * @throws ClavinException */ private static void resolveStanfordEntities() throws IOException, ClavinException { /*##################################################################### * * Start with Stanford NER -- no need to get CLAVIN involved for now. * *###################################################################*/ // instantiate Stanford NER entity extractor InputStream mpis = WorkflowDemoNERD.class.getClassLoader().getResourceAsStream("models/english.all.3class.distsim.prop"); Properties mp = new Properties(); mp.load(mpis); AbstractSequenceClassifier<CoreMap> namedEntityRecognizer = CRFClassifier.getJarClassifier("/models/english.all.3class.distsim.crf.ser.gz", mp); // Unstructured text file about Somalia to be geoparsed File inputFile = new File("src/test/resources/sample-docs/Somalia-doc.txt"); // Grab the contents of the text file as a String String inputString = TextUtils.fileToString(inputFile); // extract entities from input text using Stanford NER List<Triple<String, Integer, Integer>> entitiesFromNER = namedEntityRecognizer.classifyToCharacterOffsets(inputString); /*##################################################################### * * Now, CLAVIN comes into play... * *###################################################################*/ // convert Stanford NER output to ClavinLocationResolver input List<LocationOccurrence> locationsForCLAVIN = convertNERtoCLAVIN(entitiesFromNER, inputString); // instantiate the CLAVIN location resolver ClavinLocationResolver clavinLocationResolver = new ClavinLocationResolver(new LuceneGazetteer(new File("./IndexDirectory"))); // resolve location entities extracted from input text List<ResolvedLocation> resolvedLocations = clavinLocationResolver.resolveLocations(locationsForCLAVIN, 1, 1, false); // Display the ResolvedLocations found for the location names for (ResolvedLocation resolvedLocation : resolvedLocations) System.out.println(resolvedLocation); } }