package ch.epfl.bbp.uima.laucher; import static com.google.common.collect.Lists.newArrayList; import static com.google.common.collect.Sets.newHashSet; import static java.lang.System.currentTimeMillis; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import static org.slf4j.LoggerFactory.getLogger; import java.io.IOException; import java.util.List; import java.util.Set; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.TypeOrFeature; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.collection.metadata.CpeDescriptorException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.metadata.Capability; import org.apache.uima.resource.metadata.ResourceMetaData; import org.apache.uima.util.InvalidXMLException; import org.slf4j.Logger; import org.xml.sax.SAXException; import ch.epfl.bbp.uima.uimafit.CpeBuilder; import ch.epfl.bbp.uima.uimafit.JcasPipelineBuilder; import ch.epfl.bbp.uima.uimafit.PipelineBuilder; /** * Abstraction to encapsulate a {@link CollectionReaderDescription} and several * {@link AnalysisEngineDescription}s, and run them * * @author [email protected] */ public class Pipeline { private static Logger LOG = getLogger(Pipeline.class); CollectionReaderDescription crd = null; List<AnalysisEngineDescription> aeds = newArrayList(); private int threads = 2; private int maxErrors = 0; /** * The output types (Annotation class names) outputted by this pipeline, as * described in the @TypeCapability of {@link AnalysisEngine}s */ List<String> outputTypes = newArrayList(); public CollectionReaderDescription getCr() { return crd; } public void setCr(CollectionReaderDescription crd) { this.crd = crd; addCapabilities(crd); } public void addAe(AnalysisEngineDescription description) { aeds.add(description); checkAndAddCapabilities(description); } public void add(Class<? extends JCasAnnotator_ImplBase> annotatorClass, Object... configurationData) throws InvalidXMLException, ResourceInitializationException, IOException, SAXException, CpeDescriptorException { addAe(createEngineDescription(annotatorClass, configurationData)); } /** Add Aes from @param p to the current pipeline */ public void addAesFrom(Pipeline p) { for (AnalysisEngineDescription aed : p.getAeds()) { addAe(aed); } } public List<AnalysisEngineDescription> getAeds() { return aeds; } public JcasPipelineBuilder getJCasBuilder() throws UIMAException, IOException, SAXException, CpeDescriptorException { return (JcasPipelineBuilder) build(new JcasPipelineBuilder()); } public void run() throws UIMAException, IOException { if (threads == 1) { // some scripts (RUTA) only work with 1 thread ATM (because of // relative paths to typesystem) ... long start = currentTimeMillis(); aeds.add(createEngineDescription(PrintNrDocs.class)); // stats SimplePipeline.runPipeline(crd, aeds.toArray(new AnalysisEngineDescription[aeds.size()])); LOG.info("Processing took " + (currentTimeMillis() - start) / 1000 + "s in total"); } else { try { CpeBuilder cpeBuilder = (CpeBuilder) build(new CpeBuilder( this.crd)// .setMaxProcessingUnitThreatCount(threads)// .setMaxErrors(maxErrors)); cpeBuilder.process(); } catch (Exception e) { throw new UIMAException(e); } } } public static class PrintNrDocs extends JCasAnnotator_ImplBase { private int nrDocs = 0; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { nrDocs++; } @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { LOG.info("Processed " + nrDocs + " documents"); } } public void run(JCas jCas) throws UIMAException, IOException { try { JcasPipelineBuilder builder = new JcasPipelineBuilder(jCas); build(builder); builder.process(); } catch (Exception e) { throw new UIMAException(e); } } private PipelineBuilder build(PipelineBuilder builder) throws InvalidXMLException, IOException, SAXException, CpeDescriptorException { Set<String> cpeNames = newHashSet(); for (AnalysisEngineDescription aed : this.aeds) { // workaround to use multiple cpe with the same name String name = aed.getMetaData().getName(); if (cpeNames.contains(name)) { ResourceMetaData metaData = aed.getMetaData(); String newName = name + System.currentTimeMillis(); metaData.setName(newName); aed.setMetaData(metaData); cpeNames.add(newName); } else { cpeNames.add(name); } builder.add(aed); } return builder; } public Pipeline setThreads(int threads) { this.threads = threads; return this; } public Pipeline setMaxErrors(int maxErrors) { this.maxErrors = maxErrors; return this; } /** Add this crd's capabilities for other downstream aeds. */ private void addCapabilities(CollectionReaderDescription crd) { for (Capability capability : crd.getCollectionReaderMetaData() .getCapabilities()) { for (TypeOrFeature output : capability.getOutputs()) { // LOG.info("add @TypeCapability: " + output.getName()); outputTypes.add(output.getName()); } } } /** * Checks that this @param aed is provided with the right {@link Annotation} * s in the upstream of this pipeline, and prints an error log otherwise. * Add this aed's capabilities for other downstream aeds. */ private void checkAndAddCapabilities(AnalysisEngineDescription aed) { for (Capability capability : aed.getAnalysisEngineMetaData() .getCapabilities()) { for (TypeOrFeature input : capability.getInputs()) if (!outputTypes.contains(input.getName())) LOG.error("AnalysisEngine " + aed.getAnnotatorImplementationName() + " is missing input @TypeCapability: " + input.getName()); } for (Capability capability : aed.getAnalysisEngineMetaData() .getCapabilities()) { for (TypeOrFeature output : capability.getOutputs()) { // LOG.info("add @TypeCapability: " + output.getName()); outputTypes.add(output.getName()); } } } public List<String> getOutputTypes() { return outputTypes; } public void addAesTo(PipelineBuilder builder) throws InvalidXMLException, IOException, SAXException, CpeDescriptorException { for (AnalysisEngineDescription aDesc : getAeds()) { builder.add(aDesc); } } }