package uhh_lt.newsleak.writer; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.descriptor.OperationalProperties; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; import org.apache.uima.util.Logger; import opennlp.uima.Sentence; import opennlp.uima.Token; import uhh_lt.newsleak.resources.TextLineWriterResource; import uhh_lt.newsleak.types.Metadata; /** * A simple writer for debug and development purposes only. It write fulltexts * and/or extracted entities to disk. * * This writer is not used in any production setting. */ @OperationalProperties(multipleDeploymentAllowed = true, modifiesCas = false) public class TextLineWriter extends JCasAnnotator_ImplBase { /** The sample id hash. */ private HashSet<String> sampleIdHash = new HashSet<String>(); /** The logger. */ Logger logger; /** The lang stats. */ public HashMap<String, String> langStats; /** The Constant RESOURCE_LINEWRITER. */ public static final String RESOURCE_LINEWRITER = "linewriter"; /** The linewriter. */ @ExternalResource(key = RESOURCE_LINEWRITER) private TextLineWriterResource linewriter; /* * (non-Javadoc) * * @see * org.apache.uima.fit.component.JCasAnnotator_ImplBase#initialize(org.apache. * uima.UimaContext) */ @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); langStats = new HashMap<String, String>(); logger = context.getLogger(); // restrict to samples String[] sampleIds = { "9141", "9099", "10779", "6823", "7455", "8078", "9538", "10051", "9660", "10521" }; sampleIdHash.addAll(Arrays.asList(sampleIds)); } /* * (non-Javadoc) * * @see * org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache. * uima.jcas.JCas) */ @Override public void process(JCas jcas) throws AnalysisEngineProcessException { String docText = jcas.getDocumentText(); // Language String outputText = jcas.getDocumentLanguage() + "\t"; // n sentencs Collection<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, 0, jcas.getDocumentText().length()); outputText += sentences.size() + "\t"; // n tokens Collection<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, 0, jcas.getDocumentText().length()); outputText += tokens.size() + "\t"; // pos String firstPOS = tokens.iterator().next().getPos(); outputText += firstPOS + "\t"; // text outputText += docText.replaceAll("\n", " "); // linewriter.append(outputText); Metadata metadata = (Metadata) jcas.getAnnotationIndex(Metadata.type).iterator().next(); langStats.put(metadata.getDocId(), jcas.getDocumentLanguage()); if (sampleIdHash.contains(metadata.getDocId())) { int i = 0; for (Sentence s : sentences) { i++; String sOut = metadata.getDocId() + "\t" + i + "\t"; String tOut = ""; for (Token t : JCasUtil.selectCovered(jcas, Token.class, s.getBegin(), s.getEnd())) { tOut += t.getCoveredText() + " "; } sOut += tOut.trim(); linewriter.append(sOut); } } } /* * (non-Javadoc) * * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase# * collectionProcessComplete() */ @Override public void collectionProcessComplete() throws AnalysisEngineProcessException { logger.log(Level.INFO, langStats.toString()); } }