de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData Java Examples

The following examples show how to use de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CasMergeSuiteTest.java    From webanno with Apache License 2.0 8 votes vote down vote up
/**
 * Writes the given CAS with {@code WebannoTsv3XWriter} into the per-test output folder and
 * compares the produced {@code curator.tsv} with the reference file of the same name.
 * The comparison is skipped via a JUnit assumption when the reference file does not exist.
 *
 * @param curatorCas
 *            the CAS to serialize; its document ID is overwritten with {@code "curator"}.
 * @throws Exception
 *             if the pipeline or reading either file fails.
 */
private void writeAndAssertEquals(JCas curatorCas)
    throws Exception
{
    String outputFolder = "target/test-output/" + testContext.getClassName() + "/"
            + referenceFolder.getName();

    // The files compared below are named "curator.tsv", matching this document ID.
    DocumentMetaData.get(curatorCas).setDocumentId("curator");

    runPipeline(curatorCas,
            createEngineDescription(WebannoTsv3XWriter.class,
                    WebannoTsv3XWriter.PARAM_TARGET_LOCATION, outputFolder,
                    WebannoTsv3XWriter.PARAM_OVERWRITE, true));

    File expectedFile = new File(referenceFolder, "curator.tsv");
    assumeTrue("No reference data available for this test.", expectedFile.exists());

    File producedFile = new File(outputFolder, "curator.tsv");

    String expected = FileUtils.readFileToString(expectedFile, "UTF-8");
    String produced = FileUtils.readFileToString(producedFile, "UTF-8");

    assertEquals(expected, produced);
}
 
Example #2
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Document starts with a blank; the named entity begins on that blank (offset 0) while the
 * first token begins at offset 1.
 */
@Test
public void testAnnotationWithLeadingWhitespaceAtStart() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText(" one two");

    Token first = new Token(cas, 1, 4);
    first.addToIndexes();
    Token second = new Token(cas, 5, 8);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 1, 8);
    sentence.addToIndexes();

    // The entity span [0,4) includes the leading blank, which the export is expected to
    // drop silently.
    NamedEntity entity = new NamedEntity(cas, 0, 4);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #3
Source File: CasPersistenceUtils.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Loads a Java-serialized {@code CASCompleteSerializer} from the given file into the given CAS.
 *
 * @param aCas
 *            the CAS to load into; the data is deserialized into the CAS returned by
 *            {@code getRealCas(aCas)}.
 * @param aFile
 *            the file holding the serialized CAS.
 * @throws IOException
 *             if the file cannot be read, or if a class needed for deserialization is missing
 *             (the {@code ClassNotFoundException} is wrapped).
 */
public static void readSerializedCas(CAS aCas, File aFile)
    throws IOException
{
    CAS unwrapped = getRealCas(aCas);
    // UIMA-6162 Workaround: synchronize CAS during de/serialization
    synchronized (((CASImpl) unwrapped).getBaseCAS()) {
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(aFile))) {
            CASCompleteSerializer payload = (CASCompleteSerializer) in.readObject();
            deserializeCASComplete(payload, (CASImpl) unwrapped);

            // Workaround for UIMA adding back deleted DocumentAnnotations
            // https://issues.apache.org/jira/browse/UIMA-6199
            // When a DocumentMetaData is present, every other feature structure selected as
            // a UIMA DocumentAnnotation is removed from the indexes; the DocumentMetaData
            // itself is kept.
            boolean hasDocumentMetaData = !aCas.select(DocumentMetaData.class.getName())
                    .isEmpty();
            if (hasDocumentMetaData) {
                aCas.select(CAS.TYPE_NAME_DOCUMENT_ANNOTATION)
                    .filter(fs -> !DocumentMetaData.class.getName()
                            .equals(fs.getType().getName()))
                    .forEach(aCas::removeFsFromIndexes);
            }
        }
        catch (ClassNotFoundException e) {
            throw new IOException(e);
        }
    }
}
 
Example #4
Source File: ConstraintsGeneratorTest.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a JCas whose type system merges the default type system description with the test
 * types loaded from {@code webannoTestTypes.xml}, sets the document ID to {@code "doc"} and
 * lets a {@code TokenBuilder} populate it from the text {@code "This is a test ."}.
 *
 * @return the prepared JCas.
 * @throws UIMAException
 *             if the type systems cannot be loaded/merged or the JCas cannot be created.
 */
private JCas makeJCasOneSentence() throws UIMAException
{
    TypeSystemDescription defaultTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescriptionFromPath(
                    "src/test/resources/desc/types/webannoTestTypes.xml");

    TypeSystemDescription combined = CasCreationUtils
            .mergeTypeSystems(asList(defaultTypes, testTypes));

    JCas result = JCasFactory.createJCas(combined);

    DocumentMetaData metadata = DocumentMetaData.create(result);
    metadata.setDocumentId("doc");

    TokenBuilder<Token, Sentence> builder = new TokenBuilder<>(Token.class, Sentence.class);
    builder.buildTokens(result, "This is a test .");

    return result;
}
 
Example #5
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Single-token document whose token carries a {@code POS_NOUN} part-of-speech annotation.
 */
@Test
public void testElevatedType() throws Exception {
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("John");

    // Add an elevated type which is not a direct subtype of Annotation. This type will not
    // be picked up by the schema analyzer but should still be serialized as the POS type,
    // which is in fact picked up.
    POS_NOUN noun = new POS_NOUN(cas, 0, 4);
    noun.setPosValue("NN");
    noun.setCoarseValue("NOUN");
    noun.addToIndexes();

    Token token = new Token(cas, 0, 4);
    token.setPos(noun);
    token.addToIndexes();

    Sentence sentence = new Sentence(cas, 0, 4);
    sentence.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #6
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Document starts with two blanks; a zero-width named entity sits at offset 1, i.e. before the
 * first token, which begins at offset 2.
 */
@Test
public void testZeroWidthAnnotationBeforeFirstTokenIsMovedToBeginOfFirstToken() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("  one two");

    Token first = new Token(cas, 2, 5);
    first.addToIndexes();
    Token second = new Token(cas, 6, 9);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 2, 9);
    sentence.addToIndexes();

    // NE is before the begin of the first token and should be moved to the begin of the
    // first token, otherwise it could not be represented in the TSV3 format.
    NamedEntity entity = new NamedEntity(cas, 1, 1);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #7
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Document ends with two blanks; a zero-width named entity sits at offset 8, i.e. after the
 * last token, which ends at offset 7.
 */
@Test
public void testZeroWidthAnnotationBeyondLastTokenIsMovedToEndOfLastToken() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("one two  ");

    Token first = new Token(cas, 0, 3);
    first.addToIndexes();
    Token second = new Token(cas, 4, 7);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 0, 7);
    sentence.addToIndexes();

    // NE lies beyond the end of the last token and should be moved to the end of the last
    // token, otherwise it could not be represented in the TSV3 format.
    NamedEntity entity = new NamedEntity(cas, 8, 8);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #8
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Two tokens separated by two blanks; a zero-width named entity sits at offset 4, i.e. in the
 * gap between the first token (ends at 3) and the second token (begins at 5).
 */
@Test
public void testZeroWidthAnnotationBetweenTokenIsMovedToEndOfPreviousToken() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("one  two");

    Token first = new Token(cas, 0, 3);
    first.addToIndexes();
    Token second = new Token(cas, 5, 8);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 0, 8);
    sentence.addToIndexes();

    // NE lies between two tokens and should be moved to the end of the previous token,
    // otherwise it could not be represented in the TSV3 format.
    NamedEntity entity = new NamedEntity(cas, 4, 4);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #9
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Two tokens separated by two blanks; the named entity begins on the second blank (offset 4)
 * and ends with the second token.
 */
@Test
public void testAnnotationWithLeadingWhitespace() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("one  two");

    Token first = new Token(cas, 0, 3);
    first.addToIndexes();
    Token second = new Token(cas, 5, 8);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 0, 8);
    sentence.addToIndexes();

    // The entity span [4,8) starts with a blank, which the export is expected to drop
    // silently.
    NamedEntity entity = new NamedEntity(cas, 4, 8);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #10
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Document ends with a blank; the named entity covers the last token plus that trailing blank.
 */
@Test
public void testAnnotationWithTrailingWhitespaceAtEnd() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("one two ");

    Token first = new Token(cas, 0, 3);
    first.addToIndexes();
    Token second = new Token(cas, 4, 7);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 0, 7);
    sentence.addToIndexes();

    // The entity span [4,8) ends with a blank, which the export is expected to drop
    // silently.
    NamedEntity entity = new NamedEntity(cas, 4, 8);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #11
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Two tokens separated by two blanks; the named entity covers the first token plus the first
 * blank after it.
 */
@Test
public void testAnnotationWithTrailingWhitespace() throws Exception
{
    JCas cas = JCasFactory.createJCas();

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("one  two");

    Token first = new Token(cas, 0, 3);
    first.addToIndexes();
    Token second = new Token(cas, 5, 8);
    second.addToIndexes();
    Sentence sentence = new Sentence(cas, 0, 8);
    sentence.addToIndexes();

    // The entity span [0,4) ends with a blank, which the export is expected to drop
    // silently.
    NamedEntity entity = new NamedEntity(cas, 0, 4);
    entity.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #12
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Document {@code "onetwo"} split into two directly adjacent single-token sentences
 * ([0,3) and [3,6)), using a type system merged from the default description and the test
 * types in {@code webannoTestTypes.xml}.
 */
@Test
public void testTwoSentencesWithNoSpaceInBetween() throws Exception
{
    TypeSystemDescription defaultTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescriptionFromPath(
                    "src/test/resources/desc/type/webannoTestTypes.xml");

    TypeSystemDescription combined = CasCreationUtils
            .mergeTypeSystems(asList(defaultTypes, testTypes));

    JCas cas = JCasFactory.createJCas(combined);

    DocumentMetaData meta = DocumentMetaData.create(cas);
    meta.setDocumentId("doc");
    cas.setDocumentText("onetwo");

    // First sentence: "one"
    Token firstToken = new Token(cas, 0, 3);
    firstToken.addToIndexes();
    Sentence firstSentence = new Sentence(cas, 0, 3);
    firstSentence.addToIndexes();

    // Second sentence: "two", starting exactly where the first one ends
    Token secondToken = new Token(cas, 3, 6);
    secondToken.addToIndexes();
    Sentence secondSentence = new Sentence(cas, 3, 6);
    secondSentence.addToIndexes();

    writeAndAssertEquals(cas);
}
 
Example #13
Source File: Conll2003AidaReader.java    From ambiverse-nlu with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the {@code DocumentMetaData} for the given CAS and sets the configured language on
 * both the metadata and the CAS itself.
 * <p>
 * Only the language is set here; title, URI, ID, base URI and collection ID are left unset.
 *
 * @param aCas
 *            the CAS to initialize.
 * @param aResource
 *            the resource being read; not used by this implementation.
 * @throws RuntimeException
 *             wrapping a {@code CASException} raised while creating the metadata.
 */
@Override
protected void initCas(CAS aCas, Resource aResource) {
    try {
        // Document metadata: language only
        DocumentMetaData metadata = DocumentMetaData.create(aCas);
        metadata.setLanguage(language);

        // Keep the language of the CAS in line with the metadata
        aCas.setDocumentLanguage(language);
    } catch (CASException e) {
        // This should not happen.
        throw new RuntimeException(e);
    }
}
 
Example #14
Source File: CompressedXmiWriter.java    From argument-reasoning-comprehension-task with Apache License 2.0 5 votes vote down vote up
/**
 * Serializes the given JCas as XMI into memory and appends it to the tar archive under the
 * entry name {@code <documentId>.xmi}. On the first call the type system is written too.
 *
 * @param aJCas
 *            the JCas to serialize.
 * @throws AnalysisEngineProcessException
 *             wrapping any {@code IOException} or {@code SAXException} raised while
 *             serializing or writing.
 */
@Override
public void process(JCas aJCas)
        throws AnalysisEngineProcessException
{
    try {
        // Serialize into a memory buffer first
        java.io.ByteArrayOutputStream xmiBuffer = new java.io.ByteArrayOutputStream();
        XmiCasSerializer.serialize(aJCas.getCas(), xmiBuffer);

        // Entry name = document id + ".xmi"
        String documentId = DocumentMetaData.get(aJCas).getDocumentId();
        String entryName = documentId + ".xmi";

        // Hand the raw bytes to the tar writer
        addSingleEntryToTar(xmiBuffer.toByteArray(), entryName);

        // The type system is written only once
        if (!typeSystemWritten) {
            writeTypeSystem(aJCas);
            typeSystemWritten = true;
        }

        counter++;
    }
    catch (IOException | SAXException ex) {
        throw new AnalysisEngineProcessException(ex);
    }
}
 
Example #15
Source File: CasMerge.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Resets the given CAS in place and restores only the document metadata, language, text and
 * the token and sentence boundaries. A full copy of the CAS is taken beforehand and used as
 * the source for the restored information.
 *
 * @param aCas
 *            the CAS to clear.
 * @throws UIMAException
 *             if the temporary copy cannot be created.
 */
private static void clearAnnotations(CAS aCas)
    throws UIMAException
{
    CAS snapshot = CasFactory.createCas((TypeSystemDescription) null);

    // Copy the CAS - basically we do this just to keep the full type system information
    CASCompleteSerializer copy = serializeCASComplete((CASImpl) getRealCas(aCas));
    deserializeCASComplete(copy, (CASImpl) getRealCas(snapshot));

    // Remove all annotations from the target CAS but we keep the type system!
    aCas.reset();

    // Restore the document metadata, or create a new one if the snapshot had none
    boolean snapshotHasMetadata = exists(snapshot, getType(snapshot, DocumentMetaData.class));
    if (snapshotHasMetadata) {
        copyDocumentMetadata(snapshot, aCas);
    }
    else {
        WebAnnoCasUtil.createDocumentMetadata(aCas);
    }
    aCas.setDocumentLanguage(snapshot.getDocumentLanguage()); // DKPro Core Issue 435
    aCas.setDocumentText(snapshot.getDocumentText());

    // Restore token boundaries
    for (AnnotationFS token : selectTokens(snapshot)) {
        AnnotationFS restoredToken = createToken(aCas, token.getBegin(), token.getEnd());
        aCas.addFsToIndexes(restoredToken);
    }

    // Restore sentence boundaries
    for (AnnotationFS sentence : selectSentences(snapshot)) {
        AnnotationFS restoredSentence = createSentence(aCas, sentence.getBegin(),
                sentence.getEnd());
        aCas.addFsToIndexes(restoredSentence);
    }
}
 
Example #16
Source File: SparkUimaUtils.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the NYT collection reader and writes every document that carries a
 * {@code DocumentMetaData} annotation into a Hadoop sequence file, keyed by document ID.
 * Documents without {@code DocumentMetaData} are skipped. Every JCas is released after it
 * has been handled.
 * <p>
 * The writer is closed in a {@code finally} block so it is not leaked when the reader setup,
 * the pipeline or an {@code append} call fails.
 *
 * @param params
 *            parameters passed on to the collection reader description.
 * @param uri
 *            location of the sequence file to create.
 * @throws IOException
 *             if writing fails or a document yields no metadata instance.
 */
public static void createSequenceFile(Object[] params, String uri)
    throws URISyntaxException, IOException, UIMAException, NoSuchMethodException, MissingSettingException, ClassNotFoundException {
  Configuration conf = new Configuration();
  Path path = new Path(uri);
  Writer writer =
      SequenceFile.createWriter(
          conf, Writer.file(path),
          Writer.keyClass(Text.class),
          Writer.valueClass(SCAS.class));

  try {
    int count = 0;

    CollectionReaderDescription readerDescription = Reader.getCollectionReaderDescription(Reader.COLLECTION_FORMAT.NYT, params);
    for (JCas jCas : SimplePipelineCasPoolIterator.iteratePipeline(20, readerDescription)) {
        if(JCasUtil.exists(jCas, DocumentMetaData.class)) {
          ++count;
          // Get the ID.
          DocumentMetaData dmd = JCasUtil.selectSingle(jCas, DocumentMetaData.class);
          if (dmd == null) {
            throw new IOException("No Document ID for xml: " + jCas.getView("xml").getDocumentText());
          }
          Text docIdText = new Text(dmd.getDocumentId());
          SCAS scas = new SCAS(jCas.getCas());
          writer.append(docIdText, scas);
        }
        jCas.release();
    }
    logger.info("Wrote " + count + " documents to " + uri);
  } finally {
    // Always close the writer, including on failure
    IOUtils.closeStream(writer);
  }
}
 
Example #17
Source File: TeiReaderTest.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the TEI documents matched by the configured pattern and checks the annotation counts
 * and the text of the first sentence of each document. Currently ignored.
 */
@Test
 @Ignore("No TEI yet to opensource ")
public void testTeiReader()
    throws Exception
{
    String[] patterns = new String[] { "[+]*.xml" };
    CollectionReaderDescription teiReader = createReaderDescription(TeiReader.class,
            TeiReader.PARAM_LANGUAGE, "en",
            TeiReader.PARAM_SOURCE_LOCATION, "classpath:/local/",
            TeiReader.PARAM_PATTERNS, patterns);

    String expectedFirstSentence = "70 I DAG.";

    for (JCas document : new JCasIterable(teiReader)) {
        // Print document id, text length and language
        DocumentMetaData metadata = DocumentMetaData.get(document);
        String documentText = document.getDocumentText();
        System.out.printf("%s - %d%n", metadata.getDocumentId(), documentText.length());
        System.out.println(document.getDocumentLanguage());

        // Expected annotation counts
        assertEquals(2235, JCasUtil.select(document, Token.class).size());
        assertEquals(745, JCasUtil.select(document, POS.class).size());
        assertEquals(745, JCasUtil.select(document, Lemma.class).size());
        assertEquals(0, JCasUtil.select(document, NamedEntity.class).size());
        assertEquals(30, JCasUtil.select(document, Sentence.class).size());

        // Text of the first sentence
        Sentence first = JCasUtil.select(document, Sentence.class).iterator().next();
        assertEquals(expectedFirstSentence, first.getCoveredText());
    }

}
 
Example #18
Source File: NYTEntitySalienceFeatureExtractor.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Builds training instances for all entity instances of the document whose entity ID also
 * appears among the document's {@code SalientEntity} annotations. The label is taken from the
 * matching {@code SalientEntity}. An instance whose label equals {@code 1.0} is added
 * {@code positiveInstanceScalingFactor} times in total (at least once); all others once.
 *
 * @param jCas
 *            the document.
 * @param featureExtractor
 *            passed on to {@code getEntityInstances}.
 * @param positiveInstanceScalingFactor
 *            total number of copies added for an instance labelled {@code 1.0}.
 * @return the training instances, possibly containing repeated references.
 */
@Override
public List<TrainingInstance> getTrainingInstances(JCas jCas, TrainingSettings.FeatureExtractor featureExtractor, int positiveInstanceScalingFactor) throws Exception {
  List<TrainingInstance> result = new ArrayList<>();

  // Index the salient entities by ID.
  // NOTE(review): an earlier comment here stated that salient entities do not have IDs at
  // this point and that IDs should rather come from the AIDA entities - confirm that
  // getID() is populated.
  Collection<SalientEntity> salientAnnotations = JCasUtil.select(jCas, SalientEntity.class);
  Map<String, SalientEntity> salientById = new HashMap<>();
  for (SalientEntity salient : salientAnnotations) {
      salientById.put(salient.getID(), salient);
  }

  Logger logger = LoggerFactory.getLogger(NYTEntitySalienceFeatureExtractor.class);
  String docId = JCasUtil.selectSingle(jCas, DocumentMetaData.class).getDocumentId();
  logger.info("[{}] Document entities: {}.", docId,  salientById.size());

  List<EntityInstance> candidates = getEntityInstances(jCas, featureExtractor);

  for (EntityInstance candidate : candidates) {
    String entityId = candidate.getEntityId();
    if (!salientById.containsKey(entityId)) {
      // No label available for this entity
      continue;
    }

    Double label = salientById.get(entityId).getLabel();

    // One instance per labelled entity
    TrainingInstance instance = new TrainingInstance(label, candidate.getFeatureValues(), entityId, docId);
    logger.debug("[{}] for entity {} ti: {}.", docId, entityId, instance);
    result.add(instance);

    // Positive examples are repeated until the requested total is reached
    int totalCopies = (label == 1.0) ? positiveInstanceScalingFactor : 1;
    for (int copy = 1; copy < totalCopies; ++copy) {
      result.add(instance);
    }
  }

  return result;
}
 
Example #19
Source File: WebAnnoCasUtilTest.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code WebAnnoCasUtil.createDocumentMetadata} replaces the plain
 * {@code DocumentAnnotation}, which appears once the document language is set, by a
 * {@code DocumentMetaData}, and that the language value is retained.
 */
@Test
public void thatCreateDocumentMetadataUpgradesExistingDocumentAnnotation() throws Exception
{
    TypeSystemDescription typeSystem = createTypeSystemDescription();

    CAS cas = getRealCas(createCas(typeSystem));

    // Step 1: a fresh CAS holds no DocumentAnnotation
    assertThat(cas.select(DocumentAnnotation.class).asList())
            .as("CAS has no DocumentAnnotation")
            .isEmpty();

    // Step 2: after setting the language there is exactly one plain DocumentAnnotation
    cas.setDocumentLanguage("en");

    assertThat(cas.select(DocumentAnnotation.class).asList())
            .as("CAS initialized with DocumentAnnotation")
            .extracting(fs -> fs.getType().getName())
            .containsExactly(TYPE_NAME_DOCUMENT_ANNOTATION);
    assertThat(cas.select(DocumentAnnotation.class).asList())
            .as("Language has been set")
            .extracting(DocumentAnnotation::getLanguage)
            .containsExactly("en");

    // Step 3: after creating the metadata there is exactly one DocumentMetaData instead,
    // still carrying the language
    WebAnnoCasUtil.createDocumentMetadata(cas);

    assertThat(cas.select(DocumentAnnotation.class).asList())
            .as("DocumentAnnotation has been upgraded to DocumentMetaData")
            .extracting(fs -> fs.getType().getName())
            .containsExactly(DocumentMetaData.class.getName());
    assertThat(cas.select(DocumentAnnotation.class).asList())
            .as("Language survived upgrade")
            .extracting(DocumentAnnotation::getLanguage)
            .containsExactly("en");
}
 
Example #20
Source File: WebAnnoCasUtil.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the {@code documentTitle} feature of the single {@code DocumentMetaData} in the CAS.
 *
 * @param aCas
 *            the CAS to read from.
 * @return the feature value, or {@code null} if the lookup raises an
 *         {@code IllegalArgumentException}.
 */
public static String getDocumentTitle(CAS aCas)
{
    try {
        FeatureStructure metadata = selectSingle(aCas, getType(aCas, DocumentMetaData.class));
        return FSUtil.getFeature(metadata, "documentTitle", String.class);
    }
    catch (IllegalArgumentException e) {
        return null;
    }
}
 
Example #21
Source File: WebAnnoCasUtil.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the {@code documentUri} feature of the single {@code DocumentMetaData} in the CAS.
 *
 * @param aCas
 *            the CAS to read from.
 * @return the feature value, or {@code null} if the lookup raises an
 *         {@code IllegalArgumentException}.
 */
public static String getDocumentUri(CAS aCas)
{
    try {
        FeatureStructure metadata = selectSingle(aCas, getType(aCas, DocumentMetaData.class));
        return FSUtil.getFeature(metadata, "documentUri", String.class);
    }
    catch (IllegalArgumentException e) {
        return null;
    }
}
 
Example #22
Source File: WebAnnoCasUtil.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the {@code documentId} feature of the single {@code DocumentMetaData} in the CAS.
 *
 * @param aCas
 *            the CAS to read from.
 * @return the feature value, or {@code null} if the lookup raises an
 *         {@code IllegalArgumentException}.
 */
public static String getDocumentId(CAS aCas)
{
    try {
        FeatureStructure metadata = selectSingle(aCas, getType(aCas, DocumentMetaData.class));
        return FSUtil.getFeature(metadata, "documentId", String.class);
    }
    catch (IllegalArgumentException e) {
        return null;
    }
}
 
Example #23
Source File: WebAnnoCasUtil.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the single {@code DocumentMetaData} of the CAS. If selecting it raises an
 * {@code IllegalArgumentException}, a new one is created via
 * {@code createDocumentMetadata} and returned instead.
 *
 * @param aCas
 *            the CAS to read from (and possibly add the metadata to).
 * @return the existing or newly created metadata feature structure.
 */
public static FeatureStructure getDocumentMetadata(CAS aCas)
{
    Type metadataType = getType(aCas, DocumentMetaData.class);
    try {
        return selectSingle(aCas, metadataType);
    }
    catch (IllegalArgumentException e) {
        // Selection failed - fall back to creating the metadata
        return createDocumentMetadata(aCas);
    }
}
 
Example #24
Source File: WebAnnoCasUtil.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@code DocumentMetaData} annotation and adds it to the indexes of the CAS. The
 * annotation initially spans the whole document text, or is {@code [0,0)} when no text is
 * set. If {@code aCas.getDocumentAnnotation()} returns an annotation, its language, begin and
 * end are copied over and it is removed from the indexes.
 *
 * @param aCas
 *            the CAS to add the metadata to.
 * @return the new metadata feature structure.
 */
public static FeatureStructure createDocumentMetadata(CAS aCas)
{
    Type metadataType = getType(aCas, DocumentMetaData.class);

    int initialEnd = 0;
    if (aCas.getDocumentText() != null) {
        initialEnd = aCas.getDocumentText().length();
    }
    FeatureStructure dmd = aCas.createAnnotation(metadataType, 0, initialEnd);

    // If there is already a DocumentAnnotation, copy its information and delete it
    FeatureStructure existing = aCas.getDocumentAnnotation();
    if (existing != null) {
        FSUtil.setFeature(dmd, FEATURE_BASE_NAME_LANGUAGE,
                FSUtil.getFeature(existing, FEATURE_BASE_NAME_LANGUAGE, String.class));
        FSUtil.setFeature(dmd, FEATURE_BASE_NAME_BEGIN,
                FSUtil.getFeature(existing, FEATURE_BASE_NAME_BEGIN, Integer.class));
        FSUtil.setFeature(dmd, FEATURE_BASE_NAME_END,
                FSUtil.getFeature(existing, FEATURE_BASE_NAME_END, Integer.class));
        aCas.removeFsFromIndexes(existing);
    }
    else if (aCas.getDocumentText() != null) {
        // No previous annotation: span the whole text
        FSUtil.setFeature(dmd, FEATURE_BASE_NAME_BEGIN, 0);
        FSUtil.setFeature(dmd, FEATURE_BASE_NAME_END, aCas.getDocumentText().length());
    }

    aCas.addFsToIndexes(dmd);
    return dmd;
}
 
Example #25
Source File: BratAnnotatorUtility.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new CAS that has the same type system as the given CAS but carries only its
 * document metadata, language, text and token/sentence boundaries.
 * <p>
 * If the source CAS has no {@code DocumentMetaData}, a new one is created on the
 * <em>returned</em> CAS. The source CAS is not given a metadata annotation as a side effect.
 *
 * @param aCas
 *            the source CAS.
 * @return the new CAS.
 * @throws IOException
 *             wrapping a {@code UIMAException} raised while creating the new CAS.
 */
public static CAS clearAnnotations(CAS aCas)
    throws IOException
{
    CAS target;
    try {
        target = CasFactory.createCas((TypeSystemDescription) null);
    }
    catch (UIMAException e) {
        throw new IOException(e);
    }
    
    // Copy the CAS - basically we do this just to keep the full type system information
    CASCompleteSerializer serializer = serializeCASComplete((CASImpl) getRealCas(aCas));
    deserializeCASComplete(serializer, (CASImpl) getRealCas(target));

    // Remove all annotations from the target CAS but we keep the type system!
    target.reset();
    
    // Copy over essential information
    if (exists(aCas, getType(aCas, DocumentMetaData.class))) {
        copyDocumentMetadata(aCas, target);
    }
    else {
        // The metadata belongs on the CAS being built, not on the source CAS
        WebAnnoCasUtil.createDocumentMetadata(target);
    }
    target.setDocumentLanguage(aCas.getDocumentLanguage()); // DKPro Core Issue 435
    target.setDocumentText(aCas.getDocumentText());
    
    // Transfer token boundaries
    for (AnnotationFS t : selectTokens(aCas)) {
        target.addFsToIndexes(createToken(target, t.getBegin(), t.getEnd()));
    }

    // Transfer sentence boundaries
    for (AnnotationFS s : selectSentences(aCas)) {
        target.addFsToIndexes(createSentence(target, s.getBegin(), s.getEnd()));
    }

    return target;
}
 
Example #26
Source File: Tsv3XSerializerTest.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a JCas (default plus test type system) with document ID {@code "doc"}, lets a
 * {@code TokenBuilder} populate it from the given text, and then replaces all sentences by a
 * single sentence spanning the whole document text.
 *
 * @param aText
 *            the text handed to the token builder.
 * @return the prepared JCas.
 * @throws UIMAException
 *             if the type systems cannot be loaded/merged or the JCas cannot be created.
 */
private JCas makeJCasOneSentence(String aText) throws UIMAException
{
    TypeSystemDescription defaultTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescriptionFromPath(
                    "src/test/resources/desc/type/webannoTestTypes.xml");

    TypeSystemDescription combined = CasCreationUtils
            .mergeTypeSystems(asList(defaultTypes, testTypes));

    JCas result = JCasFactory.createJCas(combined);

    DocumentMetaData metadata = DocumentMetaData.create(result);
    metadata.setDocumentId("doc");

    TokenBuilder<Token, Sentence> builder = new TokenBuilder<>(Token.class, Sentence.class);
    builder.buildTokens(result, aText);

    // Discard the sentences produced by the token builder, which treats the line break as a
    // sentence break
    for (Sentence generated : select(result, Sentence.class)) {
        generated.removeFromIndexes();
    }

    // One sentence over the complete text instead
    int textLength = result.getDocumentText().length();
    new Sentence(result, 0, textLength).addToIndexes();

    return result;
}
 
Example #27
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an empty JCas whose type system merges the default type system description with the
 * test types from {@code webannoTestTypes.xml}, and sets its document ID to {@code "doc"}.
 *
 * @return the prepared JCas.
 * @throws UIMAException
 *             if the type systems cannot be loaded/merged or the JCas cannot be created.
 */
private static JCas makeJCas() throws UIMAException
{
    TypeSystemDescription defaultTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescription();
    TypeSystemDescription testTypes = TypeSystemDescriptionFactory
            .createTypeSystemDescriptionFromPath(
                    "src/test/resources/desc/type/webannoTestTypes.xml");

    TypeSystemDescription combined = CasCreationUtils
            .mergeTypeSystems(asList(defaultTypes, testTypes));

    JCas result = JCasFactory.createJCas(combined);

    DocumentMetaData metadata = DocumentMetaData.create(result);
    metadata.setDocumentId("doc");

    return result;
}
 
Example #28
Source File: NYTEntitySalienceFeatureExtractor.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Groups the {@code AidaEntity} annotations of the document by entity ID and creates one
 * {@code EntityInstance} per distinct ID, with feature values computed from all annotations
 * sharing that ID.
 *
 * @param jCas
 *            the document.
 * @param featureExtractor
 *            passed on to {@code getEntityFeatureValues}.
 * @return one entity instance per distinct entity ID.
 */
@Override
public List<EntityInstance> getEntityInstances(JCas jCas, TrainingSettings.FeatureExtractor featureExtractor) throws Exception {
  Collection<AidaEntity> aidaEntities = JCasUtil.select(jCas, AidaEntity.class);
  ListMultimap<String, AidaEntity> entitiesMentions = ArrayListMultimap.create();

  // Group by actual entity (uima.Entity is a mention).
  for (AidaEntity aidaEntity : aidaEntities) {
    entitiesMentions.put(aidaEntity.getID(), aidaEntity);
  }

  Logger logger = LoggerFactory.getLogger(NYTEntitySalienceFeatureExtractor.class);
  String docId = JCasUtil.selectSingle(jCas, DocumentMetaData.class).getDocumentId();
  // Placeholders avoid building the message when debug logging is disabled
  logger.debug("[{}] AIDA entities: {}", docId, entitiesMentions.keySet());

  // One instance per distinct entity ID, so size by the number of keys, not of mentions
  List<EntityInstance> entityInstances = new ArrayList<>(entitiesMentions.keySet().size());

  // Extract features for entities.
  for (Map.Entry<String, Collection<AidaEntity>> entry : entitiesMentions.asMap().entrySet()) {
    String entityId = entry.getKey();
    Collection<AidaEntity> entityMentions = entry.getValue();

    // Compute the feature values from all mentions of this entity.
    Map<Integer, Double> entityFeatureValues = getEntityFeatureValues(jCas, entityMentions, featureExtractor);
    EntityInstance ei = new EntityInstance(entityId, entityFeatureValues);
    entityInstances.add(ei);
  }

  return entityInstances;
}
 
Example #29
Source File: SynchronizedTcuLookUpTable.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Compares the document ID of the given view with the ID stored in
 * {@code lastSeenDocumentIdTL}, then stores the current ID there.
 *
 * @param aView
 *            the view whose single {@code DocumentMetaData} supplies the document ID.
 * @return {@code true} if the current ID equals the previously stored one.
 */
private boolean isTheSameDocument(JCas aView) {
    String currentId = JCasUtil.selectSingle(aView, DocumentMetaData.class).getDocumentId();

    // Compare first, then remember the current ID for the next call
    boolean unchanged = currentId.equals(lastSeenDocumentIdTL.get());
    lastSeenDocumentIdTL.set(currentId);

    return unchanged;
}
 
Example #30
Source File: LoadFactAnnotations.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the document ID of the CAS to the decimal string of {@code fact} and, through a
 * {@code JCasBuilder}, adds the text {@code "fact"} and closes the builder.
 *
 * @param aCAS
 *            the CAS to populate.
 * @param fact
 *            the number used as document ID.
 * @throws CollectionException
 *             wrapping a {@code CASException} raised while obtaining the JCas.
 */
private void convert(CAS aCAS, int fact) throws CollectionException, SQLException {
  JCas target;
  try {
    target = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  JCasBuilder builder = new JCasBuilder(target);

  // The fact number serves as document ID
  DocumentMetaData metadata = JCasUtil.selectSingle(target, DocumentMetaData.class);
  metadata.setDocumentId(Integer.toString(fact));

  builder.add("fact");
  builder.close();
}