Java Code Examples for org.apache.uima.jcas.JCas#getDocumentText()

The following examples show how to use org.apache.uima.jcas.JCas#getDocumentText(). Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
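Before the project examples, here is a minimal sketch of the canonical pattern (not taken from any of the projects below; the class name is illustrative): getDocumentText() is usually called at the start of an annotator's process() method, and it returns null when no document text (Sofa data) has been set on the view, so defensive code checks for that.

import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.util.Level;

/**
 * Minimal sketch: reads the document text of the current view and logs its length.
 */
public class DocumentTextLengthAnnotator extends JCasAnnotator_ImplBase {

  @Override
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    String text = aJCas.getDocumentText();
    if (text == null || text.isEmpty()) {
      getContext().getLogger().log(Level.WARNING, "CAS contains no document text");
      return;
    }
    getContext().getLogger().log(Level.INFO,
        "Document text has " + text.length() + " characters");
  }
}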
Example 1
Source File: StanfordTokenizer.java    From ambiverse-nlu with Apache License 2.0
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
  String text = aJCas.getDocumentText();
  Annotation document = new Annotation(text);
  StanfordCoreNLP stanfordCoreNLP;

  if(!languageMap.containsKey(aJCas.getDocumentLanguage())) {
    throw new AnalysisEngineProcessException(new LanguageNotSupportedException("Language Not Supported"));
  }

  stanfordCoreNLP = stanfordCoreNLPs[languageMap.get(aJCas.getDocumentLanguage())];

  stanfordCoreNLP.annotate(document);
  List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
  for (CoreMap sentence : sentences) {
    int sstart = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int ssend = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    Sentence jsentence = new Sentence(aJCas, sstart, ssend);
    jsentence.addToIndexes();

    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      Token casToken = new Token(aJCas, token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
      casToken.addToIndexes();
    }
  }
}
 
Example 2
Source File: DocumentTextWriter.java    From bluima with Apache License 2.0
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {

	String docId = BlueCasUtil.getHeaderDocId(jCas);

	if (jCas.getDocumentText() != null
			&& jCas.getDocumentText().length() > 0) {
		try {
			TextFileWriter writer = new TextFileWriter(outputDir
					+ File.separatorChar + docId + ".txt");
			writer.addLine(jCas.getDocumentText());
			writer.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
 
Example 3
Source File: TildeTokenizer.java    From termsuite-core with Apache License 2.0
@Override
public void process(JCas cas) throws AnalysisEngineProcessException {
	try {
		String text = cas.getDocumentText();
		Scanner scanner = new Scanner(text);
		String delimiter = System.getProperty("line.separator");
		scanner.useDelimiter(delimiter);
		while (scanner.hasNext()) {
			String line = scanner.next();
			String[] items = line.split("\t");
			if (items.length == 4) {
				String word = items[0].trim();
				// String tag = items[1];
				String lemma = items[2].trim();
				String tag = items[3].trim();
				Token token = new Token(word, tag, lemma);
				this.getTokens().add(token);
			} 
		}
		scanner.close();
		this.cas = cas;
		this.enableHasNext(true);
	} catch (Exception e) {
		throw new AnalysisEngineProcessException(e);
	}
}
 
Example 4
Source File: PdfCollectionReaderTest.java    From bluima with Apache License 2.0
@Test
public void testAbbrevs() throws Exception {

    final String abbrevs[][] = { { "PMF", "MLT" }, {}, { "ICC", "HVA" } };

    for (JCas pdf : asList(createReader(PdfCollectionReader.class,
             PARAM_INPUT_DIRECTORY, "pdf", PARAM_EXTRACT_TABLES,
            true, PARAM_EXPAND_ABBREVIATIONS, true))) {

        int id = getHeaderIntDocId(pdf) - 1;
        String pdfText = pdf.getDocumentText();
        System.out.println(pdfText);
        for (String abbrev : abbrevs[id]) {
            assertTrue("all abbreviations '" + abbrev
                    + "' should be removed in:: " + pdfText,
                    pdfText.indexOf(abbrev) == -1);
        }
    }
}
 
Example 5
Source File: Html5.java    From baleen with Apache License 2.0
@Override
protected void writeBody(JCas jCas, Element body) {
  // Entities
  Map<Integer, String> insertPositions = getEntityInsertPositions(jCas);

  Element div = body.appendElement("div");
  div.attr("style", "white-space: pre-line");

  String text = jCas.getDocumentText();
  Integer offset = 0;
  for (Entry<Integer, String> pos : insertPositions.entrySet()) {
    String insert = pos.getValue();
    text =
        text.substring(0, pos.getKey() + offset) + insert + text.substring(pos.getKey() + offset);
    offset += insert.length();
  }

  div.append(text);
}
 
Example 6
Source File: LucasHelperAnnotator.java    From bluima with Apache License 2.0
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {

	for (int i = 0; i < annots.size(); i++) {

		String insert = before + inserts[i].toUpperCase() + after;
		Collection<? extends DictTerm> selects = select(jcas, annots.get(i));
		for (DictTerm d : selects) {
			d.setAnnotType(insert);
			d.setDictCanon("⊂" + d.getDictCanon() + "⊃");
		}
		// add a MultipleProteins annotation when more than one protein (molecule) term was matched
		if (inserts[i].equals("molecule") && selects.size() > 1) {
			MultipleProteins mp = new MultipleProteins(jcas);
			mp.setPresent(1);
			mp.addToIndexes();
		}
	}

	String text = jcas.getDocumentText();
	DocumentTextHolder dth = new DocumentTextHolder(jcas, 0, text.length());
	dth.setText(text);
	dth.addToIndexes();
}
 
Example 7
Source File: Annotator2.java    From uima-uimafit with Apache License 2.0
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  try {
    JCas sortedView = ViewCreatorAnnotator.createViewSafely(jCas, ViewNames.SORTED_VIEW);
    jCas = jCas.getView(CAS.NAME_DEFAULT_SOFA);
    String initialText = jCas.getDocumentText();
    char[] chars = initialText.toCharArray();
    Arrays.sort(chars);
    String sortedText = new String(chars).trim();
    sortedView.setDocumentText(sortedText);

    sortedView = ViewCreatorAnnotator.createViewSafely(jCas, ViewNames.SORTED_PARENTHESES_VIEW);
    JCas parenthesesView = jCas.getView(ViewNames.PARENTHESES_VIEW);
    String parenthesesText = parenthesesView.getDocumentText();
    chars = parenthesesText.toCharArray();
    Arrays.sort(chars);
    sortedText = new String(chars).trim();
    sortedView.setDocumentText(sortedText);

  } catch (CASException e) {
    throw new AnalysisEngineProcessException(e);
  }

}
 
Example 8
Source File: DictionaryExtractor.java    From newsleak with GNU Affero General Public License v3.0
/**
 * Annotate regex patterns (URLs, IPs, email addresses and Phone numbers)
 *
 * @param jcas
 *            the jcas
 * @param pattern
 *            the pattern
 * @param type
 *            the type
 * @return the array list
 */
public ArrayList<DictTerm> annotateRegex(JCas jcas, Pattern pattern, String type) {
	String docText = jcas.getDocumentText();
	ArrayList<DictTerm> regexMatches = new ArrayList<DictTerm>();
	Matcher matcher = pattern.matcher(docText);
	// Check all occurrences
	while (matcher.find()) {
		DictTerm dictTerm = new DictTerm(jcas);
		dictTerm.setBegin(matcher.start());
		dictTerm.setEnd(matcher.end());
		StringList typeList = new StringList(jcas);
		StringList baseFormList = new StringList(jcas);
		typeList = typeList.push(type);
		baseFormList = baseFormList.push(matcher.group());
		dictTerm.setDictType(typeList);
		dictTerm.setDictTerm(baseFormList);
		dictTerm.addToIndexes();
		regexMatches.add(dictTerm);
	}
	return regexMatches;
}
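For illustration, a hypothetical call site for the helper above (not part of the newsleak code; the email pattern and the "EMAIL" type label are made up for this example). Note that annotateRegex already adds each match to the CAS indexes, so the returned list is only needed for further bookkeeping:

public void annotateEmails(JCas jcas) {
    // Hypothetical pattern and type label, for illustration only.
    Pattern emailPattern = Pattern.compile("[\\w.+-]+@[\\w-]+\\.[\\w.-]+");
    ArrayList<DictTerm> emailMatches = annotateRegex(jcas, emailPattern, "EMAIL");
    // emailMatches could be passed on for logging or deduplication; the
    // annotations themselves are already in the CAS indexes.
}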
 
Example 9
Source File: UimaAcronymAnnotator.java    From uima-uimaj with Apache License 2.0
/**
 * @see JCasAnnotator_ImplBase#process(JCas)
 */
public void process(JCas aJCas) {
  // go through document word-by-word
  String text = aJCas.getDocumentText();
  int pos = 0;
  StringTokenizer tokenizer = new StringTokenizer(text, " \t\n\r.<.>/?\";:[{]}\\|=+()!", true);
  while (tokenizer.hasMoreTokens()) {
    String token = tokenizer.nextToken();
    // look up token in map to see if it is an acronym
    String expandedForm = mMap.get(token);
    if (expandedForm != null) {
      // create annotation
      UimaAcronym annot = new UimaAcronym(aJCas, pos, pos + token.length(), expandedForm);
      annot.addToIndexes();
    }
    // increment pos and go to next token
    pos += token.length();
  }
}
 
Example 10
Source File: FullDocument.java    From baleen with Apache License 2.0
@Override
public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  String text = jCas.getDocumentText();

  if (text == null) {
    getMonitor().info("Didn't annotate the document as it contained no content");
    return;
  }

  try {
    Entity ret = et.getConstructor(JCas.class).newInstance(jCas);

    ret.setBegin(0);
    ret.setEnd(text.length());
    ret.setConfidence(1.0f);

    addToJCasIndex(ret);

    getMonitor().info("Annotated full document as {}", type);
  } catch (Exception e) {
    throw new AnalysisEngineProcessException(e);
  }
}
 
Example 11
Source File: Tcf2DKPro.java    From inception with Apache License 2.0
/**
 * This method builds the text from the {@link eu.clarin.weblicht.wlfxb.tc.api.Token} annotation
 * layer. The getText method of {@link TextCorpusStreamed} is not used because some tokens, such
 * as special characters, are represented differently than in the original text.
 * <p>
 * If the CAS already contains a document text, it is kept.
 * <p>
 * If the CAS already contains a document language, it is kept.
 * 
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 */
public void convertText(JCas aJCas, TextCorpus aCorpusData)
{
    if (aJCas.getDocumentText() == null) {
        StringBuilder text = new StringBuilder();

        for (int i = 0; i < aCorpusData.getTokensLayer().size(); i++) {
            eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer()
                    .getToken(i);
            
            if (token.getStart() != null && token.getEnd() != null) {
                // Assuming all of the tokens have offset information...
                while (text.length() < token.getStart()) {
                    text.append(" ");
                }
            }
            else {
                // Assuming none of the tokens has offset information...
                if (i > 0) {
                    text.append(" ");
                }
            }
            
            text.append(token.getString());
        }
        aJCas.setDocumentText(text.toString());
    }
    
    aJCas.setDocumentLanguage(aCorpusData.getLanguage());
}
 
Example 12
Source File: LineOrientedTextReader.java    From webanno with Apache License 2.0
@Override
public void getNext(JCas aJCas)
    throws IOException, CollectionException
{
    Resource res = nextFile();
    initCas(aJCas, res);

    try (InputStream is = new BufferedInputStream(res.getInputStream())) {
        aJCas.setDocumentText(IOUtils.toString(is, "UTF-8"));
    }

    String t = aJCas.getDocumentText();
    int start = 0;
    int end = t.indexOf('\n');
    while (end >= 0) {
        createSentence(aJCas, start, end);
        start = end + 1;
        if (start < t.length()) {
            end = t.indexOf('\n', start);
        }
        else {
            end = -1;
        }
    }

    if (start < t.length()) {
        createSentence(aJCas, start, t.length());
    }
}
 
Example 13
Source File: XmiTest.java    From uima-uimafit with Apache License 2.0
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  String text = jCas.getDocumentText();
  for (int i = 0; i < text.length() - 3; i += 3) {
    new Token(jCas, i, i + 3).addToIndexes();
  }
}
 
Example 14
Source File: ReferencesFinderAnnotator.java    From bluima with Apache License 2.0
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    if (!isEmptyText(jCas)) {
        String text = jCas.getDocumentText();
        int pmid = getHeaderIntDocId(jCas);

        List<Section> sections = newArrayList();

        Matcher m = REFS.matcher(text);
        while (m.find()) {
            int end = jCas.getDocumentText().length();// m.end()
            // LATER section's end is set to document's end. This should be
            // improved, e.g. in the case of additional material located
            // after the reference section
            Section section = new Section(jCas, m.start(), end);
            section.setSectionType(BlueUima.SECTION_TYPE_REFERENCES);
            sections.add(section);
        }

        if (!sections.isEmpty()) {
            // add last occurrence
            sections.get(sections.size() - 1).addToIndexes();
            LOG.trace(pmid + "\t{}REFERENCES\t", sections.size());

        } else
            LOG.trace(pmid + "\tNO REFERENCES");
    }
}
 
Example 15
Source File: SectionBasedCoocConfidenceAnnotator.java    From bluima with Apache License 2.0
private void printSectionStart(String desc, JCas cas, int begin) {
    String str = cas.getDocumentText();
    if (begin < 0) {
        System.out.println("AAAA - " + desc + " - -1 - " + BlueCasUtil.getHeaderDocId(cas));
    } else if (begin < str.length()) {
        int end = begin + 50;
        if (end > str.length()) {
            end = str.length() - 1;
        }

        System.out.println("AAAA - "+desc+" ("+ BlueCasUtil.getHeaderDocId(cas)+":"+begin+"): "+str.substring(begin, end));
    }
}
 
Example 16
Source File: BlueCasUtil.java    From bluima with Apache License 2.0
/**
 * If this cas has no Sentence annotation, creates one with the whole cas
 * text
 */
public static void fixNoSentences(JCas jCas) {
    Collection<Sentence> sentences = select(jCas, Sentence.class);
    if (sentences.size() == 0) {
        String text = jCas.getDocumentText();
        Sentence sentence = new Sentence(jCas, 0, text.length());
        sentence.addToIndexes();
    }
}
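A hypothetical call site (not part of bluima): the utility might be invoked at the start of an annotator's process() method so that downstream, sentence-based components always see at least one Sentence.

public void process(JCas jCas) throws AnalysisEngineProcessException {
    // Ensure at least one Sentence annotation covers the whole document text.
    BlueCasUtil.fixNoSentences(jCas);
    // ... sentence-based processing ...
}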
 
Example 17
Source File: StatsTextAnnotator.java    From bluima with Apache License 2.0
public void process(JCas jCas) throws AnalysisEngineProcessException {

    String text = jCas.getDocumentText();
    if (text == null || text.length() == 0) {
        emptyDocs++;
    } else {
        int l = (text.length() / 100) * 100;
        histogram.add(l);
    }
}
 
Example 18
Source File: PatternExtractor.java    From baleen with Apache License 2.0
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {

  final Set<WordToken> wordsCoveredByEntites =
      JCasUtil.indexCovered(jCas, Entity.class, WordToken.class).values().stream()
          .flatMap(l -> l.stream())
          .collect(Collectors.toSet());

  for (final Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {

    final List<Entity> entities = JCasUtil.selectCovered(jCas, Entity.class, sentence);

    final List<WordToken> words = JCasUtil.selectCovered(jCas, WordToken.class, sentence);

    // We discard any punctuation in our word list since this appears to be unpredictable
    // output from OpenNLP parsing and we just want to count word distance.
    // If we have "hello world" then we might get "hello, world, " with varying POS
    // tags. This filter is a little bit of a mess as a result.
    final List<WordToken> wordIndexes =
        words.stream()
            .filter(
                w ->
                    Character.isAlphabetic(w.getPartOfSpeech().charAt(0))
                        && w.getCoveredText().length() > 1)
            .collect(Collectors.toList());

    // Find entities within (windowSize) words of one another

    final String text = jCas.getDocumentText();
    final String lowerText = text.toLowerCase();
    final List<PatternExtract> patterns = new ArrayList<PatternExtract>();
    for (int i = 0; i < entities.size(); i++) {
      for (int j = i + 1; j < entities.size(); j++) {
        addPattern(entities.get(i), entities.get(j), patterns);
      }
    }

    // Filter out patterns which are too far away
    // Filter out patterns which contain no, not or neither

    patterns.stream()
        .filter(
            p -> {
              final int count = countWordsBetween(p, wordIndexes);
              return count >= 0 && count < windowSize;
            })
        .filter(
            p -> {
              String covered = p.getCoveredText(lowerText);
              return !negationRegex.matcher(covered).find();
            })
        .forEach(
            p -> {
              // Remove any other entities from the pattern
              // Remove stop words from the pattern

              // TODO: I question this in the paper. Whilst it is true we don't want stop
              // words I think we want
              // to extract a phrase. Their example is "play a role" which becomes
              // "play,role"
              p.setWordTokens(
                  removeAdditionalWords(words, p, wordsCoveredByEntites)
                      .collect(Collectors.toList()));

              if (!p.isEmpty()) {
                outputPattern(jCas, p);
              }
            });
  }
}
 
Example 19
Source File: BannerMAnnotator.java    From bluima with Apache License 2.0
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {

    JCas view;
    try {
        view = jCas.createView(VIEW_ID);
    } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
    }

    /*
     * One can observe that Banner does not react well when
     * proteins co-occurs closely with measures. It results
     * in wrong tokenization of protein entity mentions
     * like 'ml leupeptin'. The approach used below is to
     * replace the already-annotated measure entity mentions
     * by spaces before feeding Banner with the text.
     */
    String newDocumentText = jCas.getDocumentText();
    for (Measure measure : select(jCas, Measure.class)) {
        if ((min(measure.getBegin(), measure.getEnd()) > 0)
                && (measure.getBegin() < newDocumentText.length())
                && (measure.getEnd() < newDocumentText.length())) {
            String beforeText = newDocumentText.substring(0,
                    measure.getBegin());
            String afterText = newDocumentText.substring(measure.getEnd());

            StringBuilder sb = new StringBuilder(beforeText);
            for (int i = 0; i < abs(measure.getEnd() - measure.getBegin()); i++) {
                sb.append(REPLACEMENT_CHAR);
            }
            sb.append(afterText);
            newDocumentText = sb.toString();
        }
    }

    view.setDocumentText(newDocumentText);

    sentenceAnnotationCopyEngine.process(jCas);
    bannerEngine.process(jCas);

    for (Protein protein : select(view, Protein.class)) {

        //If a max length is set we check that the text covering the entity mention
        //is not too big.
        if ((maxLength == -1) ||
                (protein.getEnd() - protein.getBegin() <= maxLength)) {
            Protein proteinCopy = new Protein(jCas, protein.getBegin(),
                    protein.getEnd());
            proteinCopy.setName(protein.getName());
            proteinCopy.addToIndexes();
        }
    }
}
 
Example 20
Source File: TextLineWriter.java    From newsleak with GNU Affero General Public License v3.0
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {

	String docText = jcas.getDocumentText();
	// Language
	String outputText = jcas.getDocumentLanguage() + "\t";

	// n sentences
	Collection<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, 0,
			jcas.getDocumentText().length());
	outputText += sentences.size() + "\t";

	// n tokens
	Collection<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, 0, jcas.getDocumentText().length());
	outputText += tokens.size() + "\t";

	// pos
	String firstPOS = tokens.iterator().next().getPos();
	outputText += firstPOS + "\t";

	// text
	outputText += docText.replaceAll("\n", " ");

	// linewriter.append(outputText);

	Metadata metadata = (Metadata) jcas.getAnnotationIndex(Metadata.type).iterator().next();
	langStats.put(metadata.getDocId(), jcas.getDocumentLanguage());

	if (sampleIdHash.contains(metadata.getDocId())) {
		int i = 0;
		for (Sentence s : sentences) {
			i++;
			String sOut = metadata.getDocId() + "\t" + i + "\t";
			String tOut = "";
			for (Token t : JCasUtil.selectCovered(jcas, Token.class, s.getBegin(), s.getEnd())) {
				tOut += t.getCoveredText() + " ";
			}
			sOut += tOut.trim();
			linewriter.append(sOut);
		}
	}

}