Java Code Examples for org.apache.uima.jcas.JCas#setDocumentText()

The following examples show how to use org.apache.uima.jcas.JCas#setDocumentText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: Annotator2.java    From uima-uimafit with Apache License 2.0 6 votes vote down vote up
/**
 * Fills two derived views with sorted text: the sorted view receives the sorted, trimmed
 * characters of the default view, and the sorted-parentheses view receives the sorted, trimmed
 * characters of the parentheses view.
 *
 * @param jCas the CAS from which the default and parentheses views are obtained
 * @throws AnalysisEngineProcessException wrapping any {@code CASException} raised while
 *         creating or fetching a view
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  try {
    JCas sortedTarget = ViewCreatorAnnotator.createViewSafely(jCas, ViewNames.SORTED_VIEW);
    JCas defaultView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
    sortedTarget.setDocumentText(sortedAndTrimmed(defaultView.getDocumentText()));

    // The second target view is created before the parentheses view is looked up.
    JCas sortedParenthesesTarget =
        ViewCreatorAnnotator.createViewSafely(defaultView, ViewNames.SORTED_PARENTHESES_VIEW);
    JCas parenthesesSource = defaultView.getView(ViewNames.PARENTHESES_VIEW);
    sortedParenthesesTarget.setDocumentText(
        sortedAndTrimmed(parenthesesSource.getDocumentText()));
  } catch (CASException e) {
    throw new AnalysisEngineProcessException(e);
  }
}

/** Returns the characters of {@code text} in ascending order with surrounding whitespace trimmed. */
private static String sortedAndTrimmed(String text) {
  char[] characters = text.toCharArray();
  Arrays.sort(characters);
  return new String(characters).trim();
}
 
Example 2
Source File: ConsumerUtilsTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the external id computed for a set of entities is repeatable, equals the expected
 * literal, and differs when the set contains a different number of entities.
 */
@Test
public void testEntityExternalId() throws UIMAException, BaleenException {
  JCas cas = JCasSingleton.getJCasInstance();
  cas.setDocumentText("Hello World");

  Person fullName = new Person(cas);
  fullName.setGender("female");
  fullName.setValue("Jane Doe");
  fullName.addToIndexes(cas);

  Person abbreviated = new Person(cas);
  abbreviated.setGender("female");
  abbreviated.setValue("J. Doe");
  abbreviated.addToIndexes(cas);

  // Two independent computations over the same entities must agree.
  assertEquals(
      ConsumerUtils.getExternalId(ImmutableSet.of(fullName, abbreviated)),
      ConsumerUtils.getExternalId(ImmutableSet.of(fullName, abbreviated)));

  // The id for this pair of entities is pinned to a fixed value.
  assertEquals(
      "d3c514ea1fb3367430959255917ee4de12468004897d683d60114b475d37264a",
      ConsumerUtils.getExternalId(ImmutableSet.of(fullName, abbreviated)));

  // A single entity must not share its id with the pair.
  assertNotEquals(
      ConsumerUtils.getExternalId(ImmutableSet.of(fullName)),
      ConsumerUtils.getExternalId(ImmutableSet.of(fullName, abbreviated)));
}
 
Example 3
Source File: WebAnnoTsv3WriterTestBase.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a document whose two sentences (one token each) are directly adjacent, with no
 * whitespace between them, and compares the output through {@code writeAndAssertEquals}.
 */
@Test
public void testTwoSentencesWithNoSpaceInBetween() throws Exception
{
    // Merge the automatically detected type system with the test-specific types.
    TypeSystemDescription typeSystem = CasCreationUtils.mergeTypeSystems(asList(
            TypeSystemDescriptionFactory.createTypeSystemDescription(),
            TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath(
                    "src/test/resources/desc/type/webannoTestTypes.xml")));

    JCas document = JCasFactory.createJCas(typeSystem);
    DocumentMetaData.create(document).setDocumentId("doc");
    document.setDocumentText("onetwo");

    // First unit covers [0, 3), second unit covers [3, 6).
    new Token(document, 0, 3).addToIndexes();
    new Sentence(document, 0, 3).addToIndexes();
    new Token(document, 3, 6).addToIndexes();
    new Sentence(document, 3, 6).addToIndexes();

    writeAndAssertEquals(document);
}
 
Example 4
Source File: DocumentConverterTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a mocked document containing two sentences and checks that two sentence annotations
 * with the expected begin/end offsets end up in the JCas.
 */
@Test
public void canConvertSentence() throws UIMAException {
  JCas cas = JCasFactory.createJCas();
  cas.setDocumentText("This is a test. This is another test.");

  String[] tokens = {"This", "is", "another", "test", "."};
  int[] begins = {16, 21, 24, 31, 35};
  int[] ends = {20, 23, 30, 34, 36};
  Sentence secondSentence = new Sentence(tokens, begins, ends, tokens);
  when(document.sentences()).thenReturn(new Sentence[] {sentence, secondSentence});

  new DocumentConverter(cas, document).convert();

  Collection<uk.gov.dstl.baleen.types.language.Sentence> converted =
      JCasUtil.select(cas, uk.gov.dstl.baleen.types.language.Sentence.class);
  assertEquals(2, converted.size());

  Iterator<uk.gov.dstl.baleen.types.language.Sentence> it = converted.iterator();

  uk.gov.dstl.baleen.types.language.Sentence current = it.next();
  assertEquals(0, current.getBegin());
  assertEquals(15, current.getEnd());

  current = it.next();
  assertEquals(16, current.getBegin());
  assertEquals(36, current.getEnd());
}
 
Example 5
Source File: PubmedArchiveCollectionReader2.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Populates the given JCas from the next citation of the iterator: the abstract (when present)
 * becomes the document text, and a header plus a creation date annotation are indexed.
 *
 * @param jcas the JCas to fill
 */
public void getNext(JCas jcas) throws IOException, CollectionException {

        MedlineCitation citation = articlesIt.next();

        // Document text: only set when the article has an abstract.
        Abstract articleAbstract = citation.getArticle().getAbstract();
        if (articleAbstract != null) {
            jcas.setDocumentText(articleAbstract.getAbstractText());
        }

        // Header metadata: document id, title and producing component.
        String articleTitle = citation.getArticle().getArticleTitle().getvalue();
        Header docHeader = new Header(jcas);
        docHeader.setDocId(citation.getPMID().getvalue().toString());
        docHeader.setTitle(articleTitle);
        docHeader.setComponentId(PubmedArchiveCollectionReader2.class.getName());
        docHeader.addToIndexes();

        // Creation date annotation. FIXME use dateCompleted
        DateCreated created = citation.getDateCreated();
        Date creationDate = new Date(jcas);
        creationDate.setDay(parseInt(created.getDay().getvalue()));
        creationDate.setMonth(parseInt(created.getMonth().getvalue()));
        creationDate.setYear(parseInt(created.getYear().getvalue()));
        creationDate.addToIndexes();
    }
 
Example 6
Source File: JCasDeserialiser.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Populates the given JCas from a JSON-derived map: document text and language first, then the
 * document annotation, then all other annotations, and finally the references between
 * annotations.
 *
 * @param jCas the JCas to populate
 * @param input the map to deserialise; missing text or language entries default to the empty
 *     string
 */
public void deseralize(final JCas jCas, final Map<String, Object> input) {

  // Top-level document properties
  final String text = (String) input.getOrDefault(JsonJCas.DOCUMENT_TEXT, "");
  jCas.setDocumentText(text);
  final String language = (String) input.getOrDefault(JsonJCas.DOCUMENT_LANGUAGE, "");
  jCas.setDocumentLanguage(language);

  // Document annotation
  final DocumentAnnotation docAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  final Map<String, Object> docAnnotationNode =
      (Map<String, Object>) input.get(JsonJCas.DOCUMENT_ANNOTATION);
  processDocumentAnnotation(jCas, docAnnotation, docAnnotationNode);

  // Remaining annotations; features that refer to other annotations are collected for later
  final List<Map<String, Object>> annotationNodes =
      (List<Map<String, Object>>) input.get(JsonJCas.ANNOTATIONS);
  final List<ReferencedFeatures> pendingReferences = processAnnotations(jCas, annotationNodes);

  // Resolve the collected references now that the annotation index can be built
  final Map<Long, BaleenAnnotation> annotationsById = buildAnnotationIndex(jCas);
  for (final ReferencedFeatures pending : pendingReferences) {
    pending.rehydrate(jCas, annotationsById);
  }
}
 
Example 7
Source File: RecordStructureManagerTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the shared JCas with the test text and annotations, then builds the
 * {@code RecordStructureManager} under test from the resulting structure hierarchy.
 */
@Before
public void setUp() throws Exception {
  JCas cas = JCasSingleton.getJCasInstance();
  cas.setDocumentText(TEXT);
  addAnnotations(cas);

  recordStructureManager =
      new RecordStructureManager(
          StructureHierarchy.build(cas, StructureUtil.getStructureClasses()));
}
 
Example 8
Source File: ComparableEntitySpanTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Exercises {@code ComparableEntitySpan}: accessors, the value following the entity's value
 * once one is set, and equals/hashCode against spans that differ in end, begin or entity.
 */
@Test
public void test() throws UIMAException {
  JCas cas = JCasSingleton.getJCasInstance();
  cas.setDocumentText("Hello world");
  final Entity entity = new Entity(cas, 0, 5);

  final ComparableEntitySpan reference = new ComparableEntitySpan(entity, 0, 5);

  // Plain accessors
  assertEquals(0, reference.getBegin());
  assertEquals(5, reference.getEnd());
  assertSame(entity, reference.getEntity());
  assertSame(entity.getClass(), reference.getClazz());

  // Value before and after the entity has an explicit value
  assertEquals("Hello", reference.getValue());
  entity.setValue("Howdy");
  assertEquals("Howdy", reference.getValue());

  final ComparableEntitySpan sameSpan = new ComparableEntitySpan(entity, 0, 5);
  final ComparableEntitySpan longerEnd = new ComparableEntitySpan(entity, 0, 6);
  final ComparableEntitySpan laterBegin = new ComparableEntitySpan(entity, 1, 5);
  final ComparableEntitySpan otherEntity = new ComparableEntitySpan(new Person(cas), 1, 5);

  // Equality and hash codes
  assertEquals(reference, sameSpan);
  assertEquals(reference.hashCode(), sameSpan.hashCode());
  assertNotEquals(reference, longerEnd);
  assertNotEquals(reference.hashCode(), longerEnd.hashCode());
  assertNotEquals(reference, otherEntity);
  assertNotEquals(reference.hashCode(), otherEntity.hashCode());
  assertNotEquals(reference, laterBegin);
  assertNotEquals(reference.hashCode(), laterBegin.hashCode());

  // Reflexivity, null and foreign types
  assertEquals(reference, reference);
  assertNotEquals(reference, null);
  assertNotEquals(reference, "Hello");

  // toString must simply not throw
  reference.toString();
}
 
Example 9
Source File: Annotator1.java    From uima-uimafit with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the parentheses view and sets its text to the default view's text with every run of
 * lower-case vowels wrapped in parentheses.
 *
 * @param jCas the CAS from which the default view is read
 * @throws AnalysisEngineProcessException wrapping any {@code CASException} raised while
 *         creating or fetching a view
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  try {
    JCas target =
        ViewCreatorAnnotator.createViewSafely(jCas, ViewNames.PARENTHESES_VIEW);
    JCas defaultView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
    String sourceText = defaultView.getDocumentText();
    // "$0" re-inserts the matched vowel run between the parentheses.
    target.setDocumentText(sourceText.replaceAll("[aeiou]+", "($0)"));
  } catch (CASException e) {
    throw new AnalysisEngineProcessException(e);
  }
}
 
Example 10
Source File: OffsetTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code OffsetUtil.getText} returns the document substring for an empty offset, a
 * leading word, an inner span and the full document.
 */
@Test
public void testGetOffsetText() throws UIMAException {
  JCas cas = JCasFactory.createJCas();
  cas.setDocumentText("This is a test.");

  // Empty span
  assertEquals("", OffsetUtil.getText(cas, new Offset(0, 0)));
  // Leading word
  assertEquals("This", OffsetUtil.getText(cas, new Offset(0, 4)));
  // Inner span including surrounding spaces
  assertEquals(" is a ", OffsetUtil.getText(cas, new Offset(4, 10)));
  // Whole document
  assertEquals("This is a test.", OffsetUtil.getText(cas, new Offset(0, 15)));
}
 
Example 11
Source File: ViewCopier.java    From biomedicus with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the document text from {@code source} to {@code target}, then enqueues every indexed
 * feature structure of the source on a copying queue and runs that queue.
 *
 * @param source the view to copy from
 * @param target the view to copy into
 */
@Override
public void migrate(JCas source, JCas target) {
  target.setDocumentText(source.getDocumentText());

  FeatureStructureCopyingQueue copyQueue =
      new FeatureStructureCopyingQueue(source.getCas(), target.getCas());

  // Enqueue everything indexed under the TOP type before running the queue.
  for (FSIterator<FeatureStructure> indexed =
          source.getIndexRepository().getAllIndexedFS(source.getCasType(TOP.type));
      indexed.hasNext(); ) {
    copyQueue.enqueue(indexed.next());
  }
  copyQueue.run();
}
 
Example 12
Source File: MistAnalysisEngine.java    From ctakes-docker with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the decoder over the escaped document text, replaces every match of {@code xmlPatt} in
 * the decoder output with its first capture group in square brackets, and stores the result as
 * the text of the de-identification view.
 *
 * @param jCas the CAS whose document text is processed
 * @throws AnalysisEngineProcessException wrapping any exception raised during processing
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  try{
    JCas deidView = CasUtil.getView(jCas.getCas(), DEID_VIEW_NAME, true).getJCas();

    copyDocIdToView(jCas, deidView);

    // Escape angle brackets before handing the text to the decoder.
    String escaped = jCas.getDocumentText().replace("<","&lt;").replace(">","&gt;");
    String text = forceXmlSerializable(escaped);

    String redacted = decoder.decodeString(text);

    // The string is rescanned from the start after every replacement, so each iteration
    // needs a fresh matcher over the updated text.
    for (Matcher m = xmlPatt.matcher(redacted); m.find(); m = xmlPatt.matcher(redacted)) {
      String prefix = redacted.substring(0, m.start());
      String suffix = redacted.substring(m.end());
      redacted = prefix + "[" + m.group(1) + "]" + suffix;
    }

    deidView.setDocumentText(redacted);
  }catch(Exception e){
    System.err.println("Error trying to run mist!");
    throw new AnalysisEngineProcessException(e);
  }
}
 
Example 13
Source File: TearlineContentExtractor.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the stream, keeps only the content before the first tearline match (or all content if
 * there is none), removes boilerplate, and sets the trimmed result as the document text. Every
 * metadata entry found by the parser is then added to the JCas.
 *
 * @param stream the content to parse
 * @param source the name of the source, used in the warning message
 * @param jCas the JCas to populate
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);

    new AutoDetectParser().parse(stream, bodyHandler, metadata, parseContext);

    String fullContent = bodyHandler.toString();
    Matcher tearline = tearlinePattern.matcher(fullContent);
    // Keep only what precedes the tearline when one is present.
    String retained = tearline.find() ? fullContent.substring(0, tearline.start()) : fullContent;
    jCas.setDocumentText(removeBoilerplate(retained).trim());

    for (String key : metadata.names()) {
      addMetadata(jCas, key, metadata.get(key));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}
 
Example 14
Source File: Tcf2DKPro.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the document text and language of the CAS from the TCF document. The text is assembled
 * from the {@link eu.clarin.weblicht.wlfxb.tc.api.Token} layer rather than taken from the
 * corpus text.
 * <p>
 * If the CAS already contains a document text, it is kept.
 * <p>
 * The document language is always set to the language of the TCF document.
 * 
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 */
public void convertText(JCas aJCas, TextCorpus aCorpusData)
{
    if (aJCas.getDocumentText() == null) {
        aJCas.setDocumentText(buildTextFromTokens(aCorpusData));
    }
    
    aJCas.setDocumentLanguage(aCorpusData.getLanguage());
}

/**
 * Concatenates the strings of all tokens in the token layer. A token with both start and end
 * offsets is preceded by as many spaces as needed to reach its start offset; any other token
 * is preceded by a single space unless it is the first token.
 */
private static String buildTextFromTokens(TextCorpus aCorpusData)
{
    StringBuilder buffer = new StringBuilder();

    for (int index = 0; index < aCorpusData.getTokensLayer().size(); index++) {
        eu.clarin.weblicht.wlfxb.tc.api.Token current = aCorpusData.getTokensLayer()
                .getToken(index);

        boolean hasOffsets = current.getStart() != null && current.getEnd() != null;
        if (hasOffsets) {
            // Pad with spaces up to the token's start offset.
            while (buffer.length() < current.getStart()) {
                buffer.append(" ");
            }
        }
        else if (index > 0) {
            // No offsets available: separate tokens with a single space.
            buffer.append(" ");
        }

        buffer.append(current.getString());
    }

    return buffer.toString();
}
 
Example 15
Source File: RemoveDanglingRelationsRepairTest.java    From webanno with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a dependency whose dependent token is not added to the indexes, runs the CAS doctor
 * analysis followed by the repair, and expects the analysis to report a failure.
 */
@Test
public void test()
    throws Exception
{
    JCas cas = JCasFactory.createJCas();
    cas.setDocumentText("This is a test.");

    Token governor = new Token(cas, 0, 4);
    governor.addToIndexes();

    // This token is referenced by the dependency but never added to the indexes.
    Token dependent = new Token(cas, 6, 8);

    Dependency relation = new Dependency(cas, 0, 8);
    relation.setGovernor(governor);
    relation.setDependent(dependent);
    relation.addToIndexes();

    List<LogMessage> log = new ArrayList<>();
    CasDoctor doctor = new CasDoctor(RemoveDanglingRelationsRepair.class,
            AllFeatureStructuresIndexedCheck.class);

    // Neither the check nor the repair needs a project, hence the null argument.
    boolean analysisPassed = doctor.analyze(null, cas.getCas(), log);
    doctor.repair(null, cas.getCas(), log);

    assertFalse(analysisPassed);

    for (LogMessage entry : log) {
        System.out.println(entry);
    }
}
 
Example 16
Source File: SimpleTextSegmenter.java    From uima-uimaj with Apache License 2.0 5 votes vote down vote up
/**
 * Produces the next segment of the document as a new JCas. The segment starts at the current
 * position and extends at least {@code mSegmentSize} characters (capped at the document end),
 * then continues until just after the next newline or the end of the document.
 *
 * @return the JCas holding the segment text
 * @throws AnalysisEngineProcessException if populating the segment fails; the JCas is released
 *         before the exception is thrown
 */
public AbstractCas next() throws AnalysisEngineProcessException {
  int segmentEnd = Math.min(mPos + mSegmentSize, mDoc.length());
  // Extend the segment until the character before its end is a newline. This example
  // segmenter assumes the document contains many newlines; on a document without any, it
  // produces a single segment holding the entire text. A better implementation might specify
  // a maximum segment size as well as a minimum.
  while (segmentEnd < mDoc.length() && mDoc.charAt(segmentEnd - 1) != '\n') {
    segmentEnd++;
  }

  JCas segment = getEmptyJCas();
  try {
    segment.setDocumentText(mDoc.substring(mPos, segmentEnd));
    // When a source URI is known, attach source document information to the segment as well.
    if (mDocUri != null) {
      SourceDocumentInformation info = new SourceDocumentInformation(segment);
      info.setUri(mDocUri);
      info.setOffsetInSource(mPos);
      info.setDocumentSize(segmentEnd - mPos);
      info.addToIndexes();

      boolean reachedEnd = segmentEnd == mDoc.length();
      if (reachedEnd) {
        info.setLastSegment(true);
      }
    }

    mPos = segmentEnd;
    return segment;
  } catch (Exception e) {
    segment.release();
    throw new AnalysisEngineProcessException(e);
  }
}
 
Example 17
Source File: MboxReader.java    From baleen with Apache License 2.0 4 votes vote down vote up
/** Reads the whole text body and sets its trimmed content as the document text. */
private void processTextBody(JCas jCas, TextBody textBody) throws IOException {
  jCas.setDocumentText(CharStreams.toString(textBody.getReader()).trim());
}
 
Example 18
Source File: DummyCollectionReader.java    From baleen with Apache License 2.0 4 votes vote down vote up
/** Removes the first pending document and uses it as the document text of the given JCas. */
@Override
public void doGetNext(JCas jCas) throws IOException, CollectionException {
  String nextDocument = documents.remove(0);
  jCas.setDocumentText(nextDocument);
}
 
Example 19
Source File: ConstraintsGeneratorTest.java    From webanno with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the rules in {@code 10.rules}, builds a small CAS with two tokens, their POS tags and
 * a dependency between them, and checks that the values generated for the dependency's
 * {@code DependencyType} feature consist of the single value {@code "det"}.
 */
@Test
public void testSimplePath()
    throws Exception
{
    // Close the rules file once parsing is done; the original left the stream open.
    ParsedConstraints constraints;
    try (FileInputStream rules = new FileInputStream("src/test/resources/rules/10.rules")) {
        ConstraintsGrammar parser = new ConstraintsGrammar(rules);
        Parse p = parser.Parse();
        constraints = p.accept(new ParserVisitor());
    }

    JCas jcas = JCasFactory.createJCas();
    jcas.setDocumentText("The sun.");

    // Add token annotations
    Token t_the = new Token(jcas, 0, 3);
    t_the.addToIndexes();
    // NOTE(review): same offsets as t_the; "sun" would be (4, 7) in this text. Left unchanged
    // because it is not visible here whether the evaluated rules depend on offsets - confirm.
    Token t_sun = new Token(jcas, 0, 3);
    t_sun.addToIndexes();

    // Add POS annotations and link them to the tokens
    POS p_the = new POS(jcas, t_the.getBegin(), t_the.getEnd());
    p_the.setPosValue("DET");
    p_the.addToIndexes();
    t_the.setPos(p_the);
    POS p_sun = new POS(jcas, t_sun.getBegin(), t_sun.getEnd());
    p_sun.setPosValue("NN");
    p_sun.addToIndexes();
    t_sun.setPos(p_sun);

    // Add dependency annotations; the dependency span is taken from its governor
    Dependency dep_the_sun = new Dependency(jcas);
    dep_the_sun.setGovernor(t_sun);
    dep_the_sun.setDependent(t_the);
    dep_the_sun.setDependencyType("det");
    dep_the_sun.setBegin(dep_the_sun.getGovernor().getBegin());
    dep_the_sun.setEnd(dep_the_sun.getGovernor().getEnd());
    dep_the_sun.addToIndexes();

    Evaluator constraintsEvaluator = new ValuesGenerator();

    List<PossibleValue> possibleValues = constraintsEvaluator.generatePossibleValues(
            dep_the_sun, "DependencyType", constraints);

    List<PossibleValue> expectedOutput = new LinkedList<>();
    expectedOutput.add(new PossibleValue("det", false));

    assertEquals(expectedOutput, possibleValues);
}
 
Example 20
Source File: ProperNounInformationCollectorTest.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Sets up sentences, word tokens, three person mentions and a shared reference target, then
 * checks that the collector returns exactly one entity information carrying that reference
 * target, the first and third mentions, and all created sentences.
 */
@Test
public void testCanCollectInformation() throws UIMAException {
  JCas jCas = JCasFactory.createJCas();

  jCas.setDocumentText(
      "Sir John Major was Prime Minister of the United Kingdom. Major became Prime Minister after Thatcher resigned.");

  List<Sentence> sentences = Annotations.createSentences(jCas);

  // Word tokens with their part-of-speech tags
  indexedWordToken(jCas, 4, 8, "NNP");
  indexedWordToken(jCas, 9, 14, "NNP");
  indexedWordToken(jCas, 19, 33, "NN");
  indexedWordToken(jCas, 59, 64, "NNP");

  // Three mentions tied together by one reference target
  Person firstMention = Annotations.createPerson(jCas, 0, 14, "Sir John Major");
  Person secondMention = Annotations.createPerson(jCas, 19, 33, "Prime Minister");
  Person thirdMention = Annotations.createPerson(jCas, 59, 64, "Major");
  ReferenceTarget target =
      Annotations.createReferenceTarget(jCas, firstMention, secondMention, thirdMention);

  Set<EntityInformation<Person>> collected =
      new ProperNounInformationCollector().getEntityInformation(jCas, Person.class);

  assertEquals(1, collected.size());
  EntityInformation<Person> information = collected.iterator().next();

  assertEquals(target, information.getReferenceTarget());
  assertTrue(
      CollectionUtils.isEqualCollection(
          ImmutableSet.of(firstMention, thirdMention), information.getMentions()));
  assertTrue(CollectionUtils.isEqualCollection(sentences, information.getSentences()));
}

/** Creates a word token with the given span and part of speech and adds it to the indexes. */
private static WordToken indexedWordToken(JCas jCas, int begin, int end, String partOfSpeech) {
  WordToken token = new WordToken(jCas);
  token.setBegin(begin);
  token.setEnd(end);
  token.setPartOfSpeech(partOfSpeech);
  token.addToIndexes(jCas);
  return token;
}