/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package it.cnr.iac; import org.apache.ctakes.assertion.medfacts.cleartk.*; import org.apache.ctakes.chunker.ae.Chunker; import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster; import org.apache.ctakes.constituency.parser.ae.ConstituencyParser; import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator; import org.apache.ctakes.core.ae.SentenceDetector; import org.apache.ctakes.core.ae.SimpleSegmentAnnotator; import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB; import org.apache.ctakes.core.resource.FileLocator; import org.apache.ctakes.core.resource.FileResourceImpl; import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE; import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator; import org.apache.ctakes.dictionary.lookup2.ae.AbstractJCasTermAnnotator; import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator; import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator; import org.apache.ctakes.lvg.ae.LvgAnnotator; import org.apache.ctakes.postagger.POSTagger; import org.apache.ctakes.typesystem.type.constants.CONST; import org.apache.ctakes.typesystem.type.refsem.UmlsConcept; import org.apache.ctakes.typesystem.type.syntax.Chunk; import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.impl.XmiCasSerializer; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.factory.AggregateBuilder; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.ExternalResourceFactory; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.XMLSerializer; import org.xml.sax.SAXException; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.OutputStream; import java.util.*; /** * This class runs the complete pipeline of Apache cTAKES for annotating * clinical documents in plain text format. * */ final public class CTAKESClinicalPipelineFactory { public static AnalysisEngineDescription getDefaultPipeline() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); builder.add(getTokenProcessingPipeline()); builder.add(getNpChunkerPipeline()); builder.add(AnalysisEngineFactory .createEngineDescription(ConstituencyParser.class)); builder.add(UmlsDictionaryLookupAnnotator.createAnnotatorDescription()); builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription()); builder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription()); builder.add(UncertaintyCleartkAnalysisEngine .createAnnotatorDescription()); builder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription()); builder.add(ConditionalCleartkAnalysisEngine .createAnnotatorDescription()); builder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription()); builder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription()); return builder.createAggregateDescription(); } public static AnalysisEngineDescription getFastPipeline() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); builder.add(getTokenProcessingPipeline()); try { builder.add(AnalysisEngineFactory .createEngineDescription( DefaultJCasTermAnnotator.class, AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP, "org.apache.ctakes.typesystem.type.textspan.Sentence", JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY, ExternalResourceFactory.createExternalResourceDescription( FileResourceImpl.class, FileLocator .locateFile("org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml")))); } catch (FileNotFoundException e) { e.printStackTrace(); throw new ResourceInitializationException(e); } builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription()); builder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription()); builder.add(UncertaintyCleartkAnalysisEngine .createAnnotatorDescription()); builder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription()); builder.add(ConditionalCleartkAnalysisEngine .createAnnotatorDescription()); builder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription()); builder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription()); return builder.createAggregateDescription(); } public static AnalysisEngineDescription getParsingPipeline() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); builder.add(getTokenProcessingPipeline()); builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription()); builder.add(AnalysisEngineFactory .createEngineDescription(ConstituencyParser.class)); return builder.createAggregateDescription(); } public static AnalysisEngineDescription getTokenProcessingPipeline() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); builder.add(SimpleSegmentAnnotator.createAnnotatorDescription()); builder.add(SentenceDetector.createAnnotatorDescription()); builder.add(TokenizerAnnotatorPTB.createAnnotatorDescription()); builder.add(LvgAnnotator.createAnnotatorDescription()); builder.add(ContextDependentTokenizerAnnotator .createAnnotatorDescription()); builder.add(POSTagger.createAnnotatorDescription()); return builder.createAggregateDescription(); } public static AnalysisEngineDescription getNpChunkerPipeline() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); builder.add(Chunker.createAnnotatorDescription()); builder.add(getStandardChunkAdjusterAnnotator()); builder.add(AnalysisEngineFactory .createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class)); builder.add(AnalysisEngineFactory .createEngineDescription(RemoveEnclosedLookupWindows.class)); return builder.createAggregateDescription(); } public static AnalysisEngineDescription getStandardChunkAdjusterAnnotator() throws ResourceInitializationException { AggregateBuilder builder = new AggregateBuilder(); // adjust NP in NP NP to span both builder.add(ChunkAdjuster.createAnnotatorDescription(new String[] { "NP", "NP" }, 1)); // adjust NP in NP PP NP to span all three builder.add(ChunkAdjuster.createAnnotatorDescription(new String[] { "NP", "PP", "NP" }, 2)); return builder.createAggregateDescription(); } private static AnalysisEngineDescription getPipelineForProperty(CTAKESClinicalPipelineEnum pipeline) throws ResourceInitializationException { switch (pipeline) { case DEFAULT: return getDefaultPipeline(); case FAST: return getFastPipeline(); case PARSING: return getParsingPipeline(); case TOKEN_PROCESSING: return getTokenProcessingPipeline(); case NP_CHUNKER: return getStandardChunkAdjusterAnnotator(); } return getDefaultPipeline(); } public static void main(final String... args) throws IOException, UIMAException, SAXException { if (args.length < 2) { System.err.println("Usage: " + CTAKESClinicalPipelineFactory.class.getName() + " /path/to/input /path/to/output"); System.exit(1); } String input = args[0]; String output = args[1]; File inputFile = new File(input); File outputFile = new File(output); if (!inputFile.isFile() || !inputFile.canRead()) { System.err.println("Error: " + input + " is not a file or cannot be read!"); System.exit(1); } final JCas jcas = JCasFactory.createJCas(); String note = readFile(inputFile); jcas.setDocumentText(note); //final AnalysisEngineDescription aed = getDefaultPipeline(); Properties properties = new Properties(); properties.load(CTAKESClinicalPipelineFactory.class.getClassLoader().getResourceAsStream("ctakes.properties")); String pipelineType = properties.getProperty("ctakes.pipeline", "DEFAULT"); final AnalysisEngineDescription aed = getPipelineForProperty(CTAKESClinicalPipelineEnum.valueOf(pipelineType)); // Outputs from default and fast pipelines are identical // final AnalysisEngineDescription aed = getFastPipeline(); SimplePipeline.runPipeline(jcas, aed); serialize(jcas, outputFile); final boolean printCuis = Arrays.asList(args).contains("cuis"); final Collection<String> codes = new ArrayList<>(); for (IdentifiedAnnotation entity : JCasUtil.select(jcas, IdentifiedAnnotation.class)) { System.out.println("Entity: " + entity.getCoveredText() + " === Polarity: " + entity.getPolarity() + " === Uncertain? " + (entity.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT) + " === Subject: " + entity.getSubject() + " === Generic? " + (entity.getGeneric() == CONST.NE_GENERIC_TRUE) + " === Conditional? " + (entity.getConditional() == CONST.NE_CONDITIONAL_TRUE) + " === History? " + (entity.getHistoryOf() == CONST.NE_HISTORY_OF_PRESENT)); if (printCuis) { codes.clear(); codes.addAll(getCUIs(entity)); for (String cui : codes) { System.out.print(cui + " "); } System.out.println(); } } } /** * @param identifiedAnnotation * - * @return list of all cuis */ static private Collection<String> getCUIs( final IdentifiedAnnotation identifiedAnnotation) { final FSArray fsArray = identifiedAnnotation.getOntologyConceptArr(); if (fsArray == null) { return Collections.emptySet(); } final FeatureStructure[] featureStructures = fsArray.toArray(); final Collection<String> cuis = new ArrayList<>( featureStructures.length); for (FeatureStructure featureStructure : featureStructures) { if (featureStructure instanceof UmlsConcept) { final UmlsConcept umlsConcept = (UmlsConcept) featureStructure; final String cui = umlsConcept.getCui(); final String tui = umlsConcept.getTui(); if (tui != null && !tui.isEmpty()) { cuis.add(cui + "_" + tui); } else { cuis.add(cui); } } } return cuis; } public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase { @Override public void process(JCas jCas) throws AnalysisEngineProcessException { for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) { if (chunk.getChunkType().equals("NP")) { new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes(); } } } } public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase { @Override public void process(JCas jCas) throws AnalysisEngineProcessException { List<LookupWindowAnnotation> lws = new ArrayList<>(JCasUtil.select( jCas, LookupWindowAnnotation.class)); // we'll navigate backwards so that as we delete things we shorten // the list from the back for (int i = lws.size() - 2; i >= 0; i--) { LookupWindowAnnotation lw1 = lws.get(i); LookupWindowAnnotation lw2 = lws.get(i + 1); if (lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd()) { // / lw1 envelops or encloses lw2 lws.remove(i + 1); lw2.removeFromIndexes(); } } } } private static String readFile(File file) throws IOException { BufferedReader br = new BufferedReader(new FileReader(file)); try { StringBuilder sb = new StringBuilder(); String line = br.readLine(); while (line != null) { sb.append(line); sb.append("\n"); line = br.readLine(); } return sb.toString(); } finally { br.close(); } } private static void serialize(JCas jcas, File file) throws SAXException, IOException { OutputStream outputStream = null; try { outputStream = new BufferedOutputStream(new FileOutputStream(file)); XmiCasSerializer xmiSerializer = new XmiCasSerializer( jcas.getTypeSystem()); XMLSerializer xmlSerializer = new XMLSerializer(outputStream, true); xmiSerializer.serialize(jcas.getCas(), xmlSerializer.getContentHandler()); } catch (FileNotFoundException fnfe) { throw new FileNotFoundException(fnfe.getMessage()); } catch (SAXException saxe) { throw new SAXException(saxe.getMessage()); } finally { try { outputStream.close(); } catch (IOException ioe) { throw new IOException(ioe.getMessage()); } } } }