package de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.uima.custom.readers; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.config.settings.DisambiguationSettings; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.graph.similarity.exception.MissingSettingException; import de.mpg.mpi_inf.ambiversenlu.nlu.entitylinking.uima.type.Entity; import de.mpg.mpi_inf.ambiversenlu.nlu.model.Document; import de.mpg.mpi_inf.ambiversenlu.nlu.ner.uima.type.PositionInEntity; import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.ViewCreatorAnnotator; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Scanner; @TypeCapability(outputs = {"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", "mpi.aida.uima.type.Entity"}) public class Conll2003AidaReader extends ResourceCollectionReaderBase { /** * Character encoding of the input data. */ public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = false, defaultValue = "UTF-8") private String encoding; /** * The language. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) private String language; public static final String PARAM_MANUAL_TOKENS_NER = "evaluateRecognition"; @ConfigurationParameter(name = PARAM_MANUAL_TOKENS_NER, mandatory = false) private boolean evaluateNER; public static final String PARAM_SINGLE_FILE = "isOneFile"; @ConfigurationParameter(name = PARAM_SINGLE_FILE, mandatory = false) private boolean isOneFile; public static final String PARAM_GREEDY = "greedy"; @ConfigurationParameter(name = PARAM_GREEDY, mandatory = false, defaultValue = "false") private boolean greedy; /** * Inclusive */ public static final String PARAM_FIRSTDOCUMENT = "firstDocument"; @ConfigurationParameter(name = PARAM_FIRSTDOCUMENT, mandatory = false) private int begin = 0; /** * Inclusive */ public static final String PARAM_LASTDOCUMENT = "lastDocument"; @ConfigurationParameter(name = PARAM_LASTDOCUMENT, mandatory = false) private int end = Integer.MAX_VALUE; public static final String PARAM_ORDER = "order"; @ConfigurationParameter(name = PARAM_ORDER, defaultValue = "DEFAULT") private OrderType orderType; public static final String PARAM_SENTENCE_END = "sentenceEnd"; @ConfigurationParameter(name = PARAM_SENTENCE_END, mandatory = true, defaultValue = "NEWLINE") private SentenceEndType sentenceEnd; public static final String PARAM_NAMED_ENTITY_PER_TOKEN = "namedEntityPerToken"; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_PER_TOKEN, mandatory = false, defaultValue = "false") private boolean namedEntityPerToken; private List<String> goldStandardList; private boolean goldStandardFlush = false; public enum SentenceEndType {NEWLINE, DOT} private Scanner reader; private String nextDocId; private int current = 0; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { if (begin > end) { throw new ResourceInitializationException(); } super.initialize(aContext); } private void convert(CAS aCAS) throws IOException, CollectionException, AnalysisEngineProcessException, CASException, NoSuchMethodException, MissingSettingException, ClassNotFoundException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } JCasBuilder doc = new JCasBuilder(jcas); JCas goldView = ViewCreatorAnnotator.createViewSafely(jcas, "gold"); List<String[]> words; if (reader.hasNext()) { String docId; // the constrained range over documents is available only when isOneFile = true if (!isOneFile) { docId = parseDocId(reader.nextLine()); } else if (nextDocId == null) { String line = null; while (current <= begin && reader.hasNext() && (line = reader.findWithinHorizon("-DOCSTART- \\(.*\\)", 0)) != null) { current++; } if (current <= begin) { throw new RuntimeException("Begin " + begin + " is out of range of the jcas (" + current + ")"); } docId = parseDocId(line); } else { current++; docId = nextDocId; } DocumentMetaData documentMetaData = JCasUtil.select(jcas, DocumentMetaData.class).iterator().next(); documentMetaData.setDocumentId(docId); documentMetaData.setDocumentTitle(docId); } while ((words = readSentence()) != null) { if (words.isEmpty()) { continue; } int sentenceBegin = doc.getPosition(); int sentenceEnd = sentenceBegin; NamedEntity ne = null; Entity e = null; String prevType = "OTH"; for (String[] word : words) { // this hint is necessary to let crfsuite version 0.12 run properly String word1 = orderType.getWord(word) .replace(":", "/") .replace("\\", "/"); Token token = doc.add(word1, Token.class); sentenceEnd = token.getEnd(); if (language == null || !language.equals("zh")) { doc.add(" "); } token.addToIndexes(); if (orderType == OrderType.DEFAULT) { if(orderType.isEntity(word)) { if (orderType.getPosition(word).equals("B")) { ne = new NamedEntity(jcas); ne.setBegin(token.getBegin()); if(word.length > 4) { ne.setValue(orderType.getType(word)); } ne.setEnd(token.getEnd()); if(!evaluateNER) { ne.addToIndexes(); } e = new Entity(goldView); e.setID(orderType.getEntity(word)); e.setBegin(token.getBegin()); e.setEnd(token.getEnd()); e.addToIndexes(); } else { ne.setEnd(token.getEnd()); e.setEnd(token.getEnd()); } } } else { if (!greedy) { if (orderType.hasLemma()) { Lemma lemma = new Lemma(jcas, token.getBegin(), token.getEnd()); lemma.setValue(orderType.getLemma(word)); lemma.addToIndexes(); } if (orderType.hasPOS()) { POS pos = new POS(jcas, token.getBegin(), token.getEnd()); pos.setPosValue(orderType.getPOS(word)); pos.addToIndexes(); } } String type; if (orderType.isEntity(word)) { String position = orderType.getPosition(word); type = orderType.getType(word); PositionInEntity positionInEntity = new PositionInEntity(jcas, token.getBegin(), token.getEnd()); positionInEntity.setPositionInEntity(position); positionInEntity.addToIndexes(); if (position.equals("B") || "OTH".equals(prevType) || !type.equals(prevType)) { ne = new NamedEntity(jcas); ne.setBegin(token.getBegin()); ne.setValue((namedEntityPerToken? position + "-" : "") + type); ne.setEnd(token.getEnd()); if (!evaluateNER) { ne.addToIndexes(); } if (!greedy && orderType.hasEntity()) { e = new Entity(goldView); e.setID(orderType.getEntity(word)); e.setBegin(token.getBegin()); e.setEnd(token.getEnd()); e.addToIndexes(); } } else { if (!greedy && orderType.hasEntity()) { e.setEnd(token.getEnd()); } if (namedEntityPerToken) { ne = new NamedEntity(jcas); ne.setBegin(token.getBegin()); ne.setValue(position + "-" + type); if (!evaluateNER) { ne.addToIndexes(); } } else if (!ne.getValue().equals(type)) { throw new RuntimeException("Wrong chunks order detected! " + Arrays.toString(word)); } ne.setEnd(token.getEnd()); } if (goldStandardList != null) { goldStandardList.add(position + "-" + type); } } else { type = "OTH"; if (goldStandardList != null) { goldStandardList.add(type); } } prevType = type; } } Sentence sentence = new Sentence(jcas, sentenceBegin, sentenceEnd); sentence.addToIndexes(); doc.add("\n"); } doc.close(); if (!greedy && orderType != OrderType.DEFAULT) { Document.Builder dbuilder = new Document.Builder(); dbuilder.withText(jcas.getDocumentText()).withDisambiguationSettings(new DisambiguationSettings.Builder().build()); Document ds = dbuilder.build(); ds.addSettingstoJcas(jcas); } } /** * Read a single sentence. */ private List<String[]> readSentence() throws IOException, CollectionException { if (!reader.hasNextLine()) { return null; } List<String[]> words = new ArrayList<>(); String line; while (reader.hasNextLine()) { line = reader.nextLine(); if (line.contains("DOCSTART")) { if (isOneFile) { nextDocId = parseDocId(line); return null; } else { throw new RuntimeException("There are more than DOCSTART in one document!"); } } if (StringUtils.isBlank(line)) { break; // End of sentence } String[] fields = line.split("\t"); words.add(fields); if (sentenceEnd == SentenceEndType.DOT && ".".equals(fields[0]) && !"dummy".equals(fields[1])) { break; } } return words; } // should be called only if there is DOCSTART in the line private static String parseDocId(String line) { if (line == null || !line.contains("DOCSTART")) { throw new RuntimeException(""); } return line.replaceAll("-DOCSTART- \\(", "").replaceAll("\\)", ""); } @Override public void getNext(CAS aCAS) throws IOException, CollectionException { if (!isOneFile || reader == null) { Resource res = nextFile(); reader = new Scanner(new InputStreamReader(res.getInputStream(), encoding)); } initCas(aCAS, null); try { convert(aCAS); } catch (AnalysisEngineProcessException | NoSuchMethodException | MissingSettingException | ClassNotFoundException e) { throw new RuntimeException(e); } catch (CASException e) { e.printStackTrace(); } } @Override protected void initCas(CAS aCas, Resource aResource) { try { // Set the document metadata DocumentMetaData docMetaData = DocumentMetaData.create(aCas); docMetaData.setLanguage(language); // docMetaData.setDocumentTitle(new File(aResource.getPath()).getName()); // docMetaData.setDocumentUri(aResource.getResolvedUri().toString() + qualifier); // docMetaData.setDocumentId("doc id"); // if (aResource.getBase() != null) { // docMetaData.setDocumentBaseUri(aResource.getResolvedBase()); // docMetaData.setCollectionId(aResource.getResolvedBase()); // } // Set the document language aCas.setDocumentLanguage(language); } catch (CASException e) { // This should not happen. throw new RuntimeException(e); } } @Override public boolean hasNext() throws IOException, CollectionException { if (isOneFile && reader != null && reader.hasNextLine()) { if (end != Integer.MAX_VALUE && current > end) { return false; } return true; } else { reader = null; return super.hasNext(); } } @Override public void destroy() { IOUtils.closeQuietly(reader); } @Override public Progress[] getProgress() { // Auto-generated method stub return null; } }