Java Code Examples for org.apache.uima.fit.util.JCasUtil#selectCovered()

The following examples show how to use org.apache.uima.fit.util.JCasUtil#selectCovered(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: SentenceCleaner.java    From newsleak with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * ICU sentence separator separates German (and other language?) constructions
 * with dates such as "25. Oktober". This function merges sentences falsely
 * separated at this date punctuation mark.
 * <p>
 * A sentence is merged into its predecessor when its first token is contained
 * in {@code monthNames} and the predecessor's last two tokens are a one- or
 * two-digit number followed by a single ".". Merging extends the predecessor
 * to the end of the current sentence and removes the current sentence from the
 * indexes. Sentences without any covered token are never merged.
 *
 * @param jcas
 *            the jcas whose Sentence annotations are repaired in place
 */
private void repairSentenceBreaks(JCas jcas) {

	// Work on a snapshot: sentences are removed from and re-added to the
	// indexes inside the loop, which must not happen while iterating the live
	// index view.
	List<Sentence> sentences = new ArrayList<Sentence>(JCasUtil.select(jcas, Sentence.class));

	// merge falsely separated sentences
	List<Token> lastSentenceTokens = new ArrayList<Token>();
	Sentence lastSentence = null;
	for (Sentence sentence : sentences) {
		List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, sentence.getBegin(), sentence.getEnd());
		int lastCount = lastSentenceTokens.size();

		// check pattern (guarding against sentences that cover no token)
		boolean mergeWithLast = lastSentence != null && !tokens.isEmpty()
				&& monthNames.contains(tokens.get(0).getCoveredText()) && lastCount > 1
				&& lastSentenceTokens.get(lastCount - 2).getCoveredText().matches("\\d{1,2}")
				&& lastSentenceTokens.get(lastCount - 1).getCoveredText().matches("\\.");

		if (mergeWithLast) {
			// Offsets normally take part in the annotation index ordering, so
			// take the annotation out of the indexes before changing its end.
			lastSentence.removeFromIndexes();
			lastSentence.setEnd(sentence.getEnd());
			lastSentence.addToIndexes();
			sentence.removeFromIndexes();

			// The merged sentence stays the predecessor for the next iteration;
			// the removed sentence must not be extended or re-indexed later.
			List<Token> mergedTokens = new ArrayList<Token>(lastSentenceTokens);
			mergedTokens.addAll(tokens);
			lastSentenceTokens = mergedTokens;
		} else {
			lastSentenceTokens = tokens;
			lastSentence = sentence;
		}
	}

}
 
Example 2
Source File: SentenceRelationshipAnnotator.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up the shortest dependency-graph path between the word tokens covered by
 * two entities.
 *
 * <p>The tokens covered by the source and the target themselves are removed from the
 * returned path.
 *
 * @param offset the span whose length ({@code end - begin}) is passed to the graph search
 * @param source the entity whose covered tokens are the start nodes
 * @param target the entity whose covered tokens are the end nodes
 * @return the remaining path tokens, or empty if there is no dependency graph or the graph
 *     search returned an empty path
 */
private Optional<List<WordToken>> getShortestPath(Offset offset, Entity source, Entity target) {
  if (dependencyGraph != null) {
    final List<WordToken> fromTokens = JCasUtil.selectCovered(WordToken.class, source);
    final List<WordToken> toTokens = JCasUtil.selectCovered(WordToken.class, target);
    final int span = offset.getEnd() - offset.getBegin();
    final List<WordToken> path = dependencyGraph.shortestPath(fromTokens, toTokens, span);

    if (!path.isEmpty()) {
      // Only the tokens between the two entities are of interest.
      path.removeAll(fromTokens);
      path.removeAll(toTokens);
      return Optional.of(path);
    }
  }
  return Optional.empty();
}
 
Example 3
Source File: MaltParser.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Parses every sentence of the document with the model and hands the resulting dependency
 * graph, together with the sentence's word tokens, to {@code processDependencyNodes}.
 *
 * @param jCas the document being processed
 * @throws AnalysisEngineProcessException wrapping any exception raised while parsing a
 *     sentence or processing its dependency nodes
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  for (final Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
    final List<WordToken> sentenceTokens =
        JCasUtil.selectCovered(jCas, WordToken.class, sentence);

    // One slot per word token; processWordTokens receives the array and start index 0.
    final String[] tokenStrings = new String[sentenceTokens.size()];
    processWordTokens(sentenceTokens, tokenStrings, 0);

    try {
      final ConcurrentDependencyGraph parsed = model.parse(tokenStrings);
      processDependencyNodes(jCas, sentenceTokens, parsed);
    } catch (final Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
}
 
Example 4
Source File: DictionaryExtractor.java    From newsleak with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Collapses the tokens covered by a dictionary term into a single token whenever
 * the term spans more than one token.
 * <p>
 * The replacement token starts at the first covered token (whose POS value it
 * also takes over) and ends at the last covered token. The covered tokens are
 * removed from the indexes and the replacement is added.
 *
 * @param jcas
 *            the jcas
 * @param termsToTokenList
 *            the dictionary terms whose covered tokens are checked
 */
private void correctTokenBoundaries(JCas jcas, ArrayList<DictTerm> termsToTokenList) {
	for (DictTerm term : termsToTokenList) {
		Collection<Token> spanned = JCasUtil.selectCovered(jcas, Token.class, term);
		if (spanned.size() <= 1) {
			// zero or one token: boundaries are already fine
			continue;
		}

		Token merged = new Token(jcas);
		boolean startTaken = false;
		for (Token part : spanned) {
			if (!startTaken) {
				// begin offset and POS come from the first covered token
				merged.setBegin(part.getBegin());
				merged.setPos(part.getPos());
				startTaken = true;
			}
			// the end offset keeps moving until the last covered token
			merged.setEnd(part.getEnd());
			part.removeFromIndexes();
		}
		merged.addToIndexes();
	}
}
 
Example 5
Source File: TemplateFieldToEntityAnnotator.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an entity for every template field named {@code fieldName} that is covered by a
 * template record named {@code recordName}.
 *
 * <p>If {@code source} is non-empty, only records whose source equals it (ignoring case)
 * are considered. A failure to create one entity is logged as a warning and does not stop
 * processing.
 *
 * @param jCas the document being processed
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
  for (TemplateRecord candidate : JCasUtil.select(jCas, TemplateRecord.class)) {
    if (StringUtils.equals(recordName, candidate.getName())
        && (StringUtils.isEmpty(source) || source.equalsIgnoreCase(candidate.getSource()))) {

      for (TemplateField covered : JCasUtil.selectCovered(TemplateField.class, candidate)) {
        if (StringUtils.equals(fieldName, covered.getName())) {
          try {
            createEntity(jCas, covered);
          } catch (BaleenException e) {
            getMonitor()
                .warn("Failed to process entity for record " + recordName + " field " + fieldName, e);
          }
        }
      }
    }
  }
}
 
Example 6
Source File: TextBlock.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Selects the annotations of the given type that belong to this block.
 *
 * <p>For a whole-document block this delegates to {@code JCasUtil.select}; otherwise only
 * annotations covered by this block's begin/end offsets are returned.
 *
 * @param <T> the annotation type
 * @param type the class of the annotations to select
 * @return the matching annotations
 */
public <T extends Annotation> Collection<T> select(final Class<T> type) {
  if (!isWholeDocument()) {
    return JCasUtil.selectCovered(jCas, type, getBegin(), getEnd());
  }
  return JCasUtil.select(jCas, type);
}
 
Example 7
Source File: PosUimaTokenizer.java    From DataVec with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the analysis engine over the given text and collects one output token per
 * UIMA token, sentence by sentence.
 *
 * <p>For a token accepted by {@code valid(...)} the lemma is used if present, otherwise the
 * stem, otherwise the covered text. Every other token is replaced by the literal
 * {@code "NONE"}.
 *
 * @param tokens the text to tokenize
 * @param engine the analysis engine to install as the shared engine if none is set yet
 * @param allowedPosTags the part-of-speech tags stored on this tokenizer
 * @throws RuntimeException wrapping any exception raised while processing the text
 */
public PosUimaTokenizer(String tokens, AnalysisEngine engine, Collection<String> allowedPosTags) {
    // Install the supplied engine only when no shared engine exists yet. The check has to
    // look at the shared field: testing the parameter for null could only ever store null.
    if (PosUimaTokenizer.engine == null) {
        PosUimaTokenizer.engine = engine;
    }
    this.allowedPosTags = allowedPosTags;
    this.tokens = new ArrayList<>();
    try {
        if (cas == null) {
            // Use the shared engine so this also works when the parameter is null but an
            // engine was installed earlier.
            cas = PosUimaTokenizer.engine.newCAS();
        }

        cas.reset();
        cas.setDocumentText(tokens);
        PosUimaTokenizer.engine.process(cas);
        for (Sentence s : JCasUtil.select(cas.getJCas(), Sentence.class)) {
            for (Token t : JCasUtil.selectCovered(Token.class, s)) {
                if (!valid(t)) {
                    // add NONE for each invalid token
                    this.tokens.add("NONE");
                } else if (t.getLemma() != null) {
                    this.tokens.add(t.getLemma());
                } else if (t.getStem() != null) {
                    this.tokens.add(t.getStem());
                } else {
                    this.tokens.add(t.getCoveredText());
                }
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

}
 
Example 8
Source File: PredictionsWriter.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Appends one "gold TAB predicted" line per token to {@code out}, followed by a
 * newline entry after every sentence.
 *
 * <p>Which annotation type is treated as gold and which as prediction depends on
 * {@code knowNER}.
 *
 * @param aCAS the CAS to read sentences and tokens from
 * @throws AnalysisEngineProcessException if the JCas view cannot be obtained
 */
@Override
public void process(CAS aCAS) throws AnalysisEngineProcessException {

    final JCas jCas;
    try {
        jCas = aCAS.getJCas();
    } catch (CASException e) {
        // Report the failure to the framework instead of printing it and carrying on
        // as if the document had been written.
        throw new AnalysisEngineProcessException(e);
    }

    // The gold/prediction roles do not change per sentence, so decide once.
    final Class<? extends Annotation> goldAnnotation;
    final Class<? extends Annotation> predAnnotation;
    if (knowNER) {
        goldAnnotation = NamedEntity.class;
        predAnnotation = TextClassificationOutcome.class;
    } else {
        goldAnnotation = TextClassificationOutcome.class;
        predAnnotation = NamedEntity.class;
    }

    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
        for (Token token : JCasUtil.selectCovered(jCas, Token.class, sentence)) {
            String line = getTag(goldAnnotation, jCas, token) + "\t" + getTag(predAnnotation, jCas, token);
            out.add(line);
        }
        // sentence separator
        out.add("\n");
    }
}
 
Example 9
Source File: AnnotationUtils.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Gets the first annotation of the requested type that is covered by the provided
 * annotation.
 *
 * @param <T> the generic type of annotation
 * @param clazz the class of the annotation to look for
 * @param annotation the annotation instance which covers the desired annotation
 * @return the first covered annotation, or empty if none is covered
 */
public static <T extends Annotation> Optional<T> getSingleCovered(
    final Class<T> clazz, final Annotation annotation) {
  final List<T> covered = JCasUtil.selectCovered(clazz, annotation);
  return covered.isEmpty() ? Optional.empty() : Optional.of(covered.get(0));
}
 
Example 10
Source File: AbstractTemplateRecordConsumer.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Groups the template fields of the document into extracted records, keyed by source, and
 * passes them to {@code writeRecords}.
 *
 * <p>Each template record yields one named {@code ExtractedRecord} built from the covered
 * fields that share the record's source. Fields not claimed by any record are grouped by
 * their own source into additional unnamed records.
 *
 * @param jCas the document being processed
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
  final Multimap<String, ExtractedRecord> recordsBySource = HashMultimap.create();

  // Start with every field; fields claimed by a record are removed below.
  final HashSet<TemplateField> unclaimedFields =
      new HashSet<>(JCasUtil.select(jCas, TemplateField.class));

  for (TemplateRecord templateRecord : JCasUtil.select(jCas, TemplateRecord.class)) {
    final List<TemplateField> ownFields =
        JCasUtil.selectCovered(TemplateField.class, templateRecord).stream()
            .filter(field -> templateRecord.getSource().equals(field.getSource()))
            .collect(Collectors.toList());
    unclaimedFields.removeAll(ownFields);

    final Collection<ExtractedField> values = makeFieldValues(ownFields);
    final ExtractedRecord extracted = new ExtractedRecord(templateRecord.getName(), values);
    recordsBySource.put(templateRecord.getSource(), extracted);
  }

  // Whatever is left belongs to no record: bucket it by source.
  final Multimap<String, TemplateField> leftoverBySource = HashMultimap.create();
  unclaimedFields.forEach(field -> leftoverBySource.put(field.getSource(), field));

  for (String leftoverSource : leftoverBySource.keySet()) {
    recordsBySource.put(
        leftoverSource,
        new ExtractedRecord(makeFieldValues(leftoverBySource.get(leftoverSource))));
  }

  writeRecords(jCas, SourceUtils.getDocumentSourceBaseName(jCas), recordsBySource.asMap());
}
 
Example 11
Source File: BmeowTag.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts one boolean feature per entry of {@code allPairs}, telling whether a
 * BMEOW type with that value is covered by the classification unit, plus an
 * "_OTHER" feature.
 * <p>
 * The "_OTHER" feature is true only if the first covered BMEOW type is
 * "OTHER"; when the unit covers no BMEOW type at all it is false.
 *
 * @param view
 *            the JCas to read the annotations from
 * @param classificationUnit
 *            the unit whose covered BMEOW types are inspected
 * @return the extracted features
 * @throws TextClassificationException
 *             declared by the extractor contract
 */
public Set<Feature> extract(JCas view, TextClassificationTarget classificationUnit)
		throws TextClassificationException {
	Set<Feature> features = new HashSet<>();

	List<BmeowType> bmeowTypes = JCasUtil.selectCovered(view, BmeowType.class, classificationUnit);

	List<String> bmeowStrings = bmeowTypes.stream().map(BmeowType::getBmeowType).collect(Collectors.toList());
	for (String pair : allPairs) {
		features.add(new Feature(FEATURE_NAME + "_" + pair, bmeowStrings.contains(pair)));
	}

	// Guard the first-element lookup: a unit without any covered BMEOW type
	// must not fail with an IndexOutOfBoundsException.
	boolean firstIsOther = !bmeowStrings.isEmpty() && "OTHER".equals(bmeowStrings.get(0));
	features.add(new Feature(FEATURE_NAME + "_OTHER", firstIsOther));

	return features;
}
 
Example 12
Source File: PosSequenceMatch.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Emits a single boolean feature telling whether the classification unit covers at
 * least one {@code PosDictionaryMatch} annotation.
 *
 * @param jCas
 *            the JCas to read the annotations from
 * @param unit
 *            the unit to inspect
 * @return a one-element feature set
 * @throws TextClassificationException
 *             declared by the extractor contract
 */
@Override
public Set<Feature> extract(JCas jCas, TextClassificationTarget unit) throws TextClassificationException {
	List<PosDictionaryMatch> covered = JCasUtil.selectCovered(jCas, PosDictionaryMatch.class, unit);
	boolean hasMatch = !covered.isEmpty();
	return new Feature(FEATURE_NAME, hasMatch).asSet();
}
 
Example 13
Source File: TreeParser.java    From deeplearning4j with Apache License 2.0 5 votes vote down vote up
/**
 * Gets trees from text.
 * First a sentence segmenter is used to segment the training examples in to sentences.
 * Sentences are then turned in to trees and returned.
 *
 * <p>Both CAS instances taken from the pool are handed back to it even when
 * tokenizing or parsing fails.
 *
 * @param text the text to process
 * @return the list of trees, one per sentence
 * @throws Exception if tokenizing, parsing or tree construction fails
 */
public List<Tree> getTrees(String text) throws Exception {
    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                // Each sentence is re-tokenized and parsed on its own in the scratch CAS.
                c2.setDocumentText(sentence.getCoveredText());
                tokenizer.process(c2);
                parser.process(c2);

                //build the tree based on this
                TopTreebankNode node = JCasUtil.selectSingle(c2.getJCas(), TopTreebankNode.class);
                log.info("Tree bank parse " + node.getTreebankParse());
                for (TreebankNode node2 : JCasUtil.select(c2.getJCas(), TreebankNode.class)) {
                    log.info("Node val " + node2.getNodeValue() + " and label " + node2.getNodeType() + " and tags was "
                                    + node2.getNodeTags());
                }

                ret.add(TreeFactory.buildTree(node));
                // clear the scratch CAS for the next sentence
                c2.reset();
            }
        } finally {
            pool.releaseCas(c2);
        }
    } finally {
        pool.releaseCas(c);
    }

    return ret;
}
 
Example 14
Source File: PatternExtractor.java    From baleen with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts patterns between every pair of entities that occur in the same sentence.
 *
 * <p>A pattern is kept only if the number of words between its two entities is within
 * {@code windowSize} and its covered (lower-cased) text does not match
 * {@code negationRegex}. Surviving, non-empty patterns are passed to
 * {@code outputPattern}.
 *
 * @param jCas the document being processed
 * @throws AnalysisEngineProcessException declared by the overridden method
 */
@Override
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {

  final Set<WordToken> wordsCoveredByEntites =
      JCasUtil.indexCovered(jCas, Entity.class, WordToken.class).values().stream()
          .flatMap(l -> l.stream())
          .collect(Collectors.toSet());

  // The lower-cased document text is the same for every sentence, so compute it once
  // rather than once per sentence. A document without text is treated as empty.
  final String text = jCas.getDocumentText();
  final String lowerText = text == null ? "" : text.toLowerCase();

  for (final Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {

    final List<Entity> entities = JCasUtil.selectCovered(jCas, Entity.class, sentence);

    final List<WordToken> words = JCasUtil.selectCovered(jCas, WordToken.class, sentence);

    // We discard any punctuation in our word list since this appears to be unpredictable
    // output from OpenNLP parsing and we just want to count word distance.
    // If we have "hello world" then we might get "hello, world, " with varying POS
    // tags. This filter is a little bit of a mess as a result.
    // Tokens without a POS tag cannot be classified and are discarded as well.
    final List<WordToken> wordIndexes =
        words.stream()
            .filter(
                w -> {
                  final String pos = w.getPartOfSpeech();
                  return pos != null
                      && !pos.isEmpty()
                      && Character.isAlphabetic(pos.charAt(0))
                      && w.getCoveredText().length() > 1;
                })
            .collect(Collectors.toList());

    // Build a candidate pattern for every pair of entities in the sentence
    final List<PatternExtract> patterns = new ArrayList<>();
    for (int i = 0; i < entities.size(); i++) {
      for (int j = i + 1; j < entities.size(); j++) {
        addPattern(entities.get(i), entities.get(j), patterns);
      }
    }

    // Filter out patterns which are too far away
    // Filter out patterns which match the negation regex

    patterns.stream()
        .filter(
            p -> {
              final int count = countWordsBetween(p, wordIndexes);
              return count >= 0 && count < windowSize;
            })
        .filter(
            p -> {
              String covered = p.getCoveredText(lowerText);
              return !negationRegex.matcher(covered).find();
            })
        .forEach(
            p -> {
              // Remove any other entities from the pattern
              // Remove stop words from the pattern

              // TODO: I question this in the paper. Whilst it is true we don't want stop
              // words I think we want
              // to extract a phrase. Their example is "play a role" which becomes
              // "play,role"
              p.setWordTokens(
                  removeAdditionalWords(words, p, wordsCoveredByEntites)
                      .collect(Collectors.toList()));

              if (!p.isEmpty()) {
                outputPattern(jCas, p);
              }
            });
  }
}
 
Example 15
Source File: DeidAwareTermConsumer.java    From ctakes-docker with Apache License 2.0 4 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>Before the dictionary hits are stored, every {@code IdentifiedAnnotation} covered by
 * an occurrence of one of the de-identification strings in {@code phiArray} is removed
 * from the indexes. Null or empty de-identification strings are ignored.
 */
@Override
public void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int cTakesSemantic,
                               final CollectionMap<TextSpan, Long, ? extends Collection<Long>> textSpanCuis,
                               final CollectionMap<Long, Concept, ? extends Collection<Concept>> cuiConcepts )
      throws AnalysisEngineProcessException {
  List<IdentifiedAnnotation> toRemove = new ArrayList<>();

  // Find the spans associated with de-id strings:
  String docText = jcas.getDocumentText();
  if(docText != null){
      for(String phiString : phiArray){
          // An empty string matches at every index, including the end of the text, so the
          // search below would never terminate for it.
          if(phiString == null || phiString.isEmpty()){
              continue;
          }
          int startInd = docText.indexOf(phiString);
          while(startInd >= 0){
              int endInd = startInd + phiString.length();
              toRemove.addAll(JCasUtil.selectCovered(jcas, IdentifiedAnnotation.class, startInd, endInd));
              // Advance by one character only, so overlapping occurrences are found too.
              startInd = docText.indexOf(phiString, startInd + 1);
          }
      }
  }

  // Remove all those identified annotations that fall within de-id strings.
  for(IdentifiedAnnotation annot : toRemove){
      annot.removeFromIndexes();
  }
   // Collection of UmlsConcept objects
   final Collection<UmlsConcept> umlsConceptList = new ArrayList<>();
   try {
      for ( Map.Entry<TextSpan, ? extends Collection<Long>> spanCuis : textSpanCuis ) {
         // The list is reused for every span, so start from scratch.
         umlsConceptList.clear();
         for ( Long cuiCode : spanCuis.getValue() ) {
            umlsConceptList.addAll(
                  createUmlsConcepts( jcas, codingScheme, cTakesSemantic, cuiCode, cuiConcepts ) );
         }
         final FSArray conceptArr = new FSArray( jcas, umlsConceptList.size() );
         int arrIdx = 0;
         for ( UmlsConcept umlsConcept : umlsConceptList ) {
            conceptArr.set( arrIdx, umlsConcept );
            arrIdx++;
         }
         final IdentifiedAnnotation annotation = createSemanticAnnotation( jcas, cTakesSemantic );
         annotation.setTypeID( cTakesSemantic );
         annotation.setBegin( spanCuis.getKey().getStart() );
         annotation.setEnd( spanCuis.getKey().getEnd() );
         annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
         annotation.setOntologyConceptArr( conceptArr );
         annotation.addToIndexes();
      }
   } catch ( CASRuntimeException crtE ) {
      // What is really thrown?  The jcas "throwFeatMissing" is not a great help
      throw new AnalysisEngineProcessException( crtE );
   }
}
 
Example 16
Source File: Step0bTextSegmenterA.java    From argument-reasoning-comprehension-task with Apache License 2.0 4 votes vote down vote up
/**
 * Runs the segmentation pipeline on an argument and then, per sentence, either writes the
 * tokenized sentence for external EDU segmenting or re-annotates the argument from
 * already segmented EDUs.
 *
 * <p>Exactly one of {@code outputTXTFile} and {@code segmentedEDUs} must be non-null:
 * with {@code outputTXTFile} each sentence is printed as "sentenceID TAB text"; with
 * {@code segmentedEDUs} the EDUs stored under the sentence ID are used for re-annotation.
 * Finally the re-annotated JCas is stored back into the argument.
 *
 * @param argument the argument to process; receives the segmented JCas
 * @param outputTXTFile target for sentences to be segmented externally, or null
 * @param segmentedEDUs segmented EDUs keyed by sentence ID, or null
 * @throws IOException on I/O failure, or wrapping a UIMA failure
 * @throws IllegalStateException if not exactly one of the two modes is selected, or if no
 *     EDUs are found for a sentence
 */
private static void annotate(StandaloneArgument argument, PrintWriter outputTXTFile,
        Map<String, List<List<String>>> segmentedEDUs)
        throws IOException
{
    try {
        // original jcas, for automatic segmenting with tokens and sentences
        JCas originalJCas = initializeJCas(argument);

        SimplePipeline.runPipeline(originalJCas, getPipeline());

        // for re-annotation with manual sentences, paragraph, and tokens
        JCas segmentedJCas = initializeJCas(argument);
        copyParagraphAndTokenAnnotations(originalJCas, segmentedJCas);

        // now for each sentence collect tokens and run EDU segmenter
        for (Sentence sentence : JCasUtil.select(originalJCas, Sentence.class)) {

            List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence);
            List<String> tokenWords = new ArrayList<>();

            for (Token token : tokens) {
                tokenWords.add(token.getCoveredText());
            }

            String text = StringUtils.join(tokenWords, " ");
            String sentenceID = argument.getId() + "_" + sentence.getBegin();

            // either output for external segmenting or annotate
            if (outputTXTFile != null && segmentedEDUs == null) {
                outputTXTFile.println(sentenceID + "\t" + text);
                System.out.println("Writing " + sentenceID);
            }
            else if (outputTXTFile == null && segmentedEDUs != null) {

                List<List<String>> collectedEDUs = segmentedEDUs.get(sentenceID);

                if (collectedEDUs == null) {
                    throw new IllegalStateException(
                            "Cannot find EDUs for sentence " + sentenceID);
                }

                reAnnotatedSentencesFromEDUs(segmentedJCas, collectedEDUs, tokenWords, tokens);
            }
            else {
                throw new IllegalStateException(
                        "Exactly one of outputTXTFile and segmentedEDUs must be non-null");
            }
        }

        // save back
        argument.setJCas(segmentedJCas);
    }
    catch (UIMAException e) {
        // Only UIMA failures need translating; an IOException propagates unchanged
        // instead of being wrapped in a second IOException.
        throw new IOException(e);
    }
}
 
Example 17
Source File: TreeParser.java    From deeplearning4j with Apache License 2.0 4 votes vote down vote up
/**
 * Gets trees from text.
 * First a sentence segmenter is used to segment the training examples in to sentences.
 * Sentences are then turned in to trees and returned.
 *
 * <p>This will also process sentences with the following label format:
 * {@code <YOURLABEL> some text </YOURLABEL>}
 *
 * <p>This will allow you to iterate on and label sentences and label spans yourself.
 *
 * <p>Sentences that produce no top-level treebank node, or that contain a label which is
 * not in {@code labels} (compared in lower case), are skipped. Both CAS instances taken
 * from the pool are handed back to it even when processing fails.
 *
 * @param text the text to process
 * @param labels the allowed labels; they are lower-cased before use
 * @return the list of trees
 * @throws Exception if tokenizing, parsing or tree construction fails
 */
public List<Tree> getTreesWithLabels(String text, List<String> labels) throws Exception {
    // Normalize the labels before any pooled resource is taken.
    List<String> lowerCaseLabels = new ArrayList<>(labels.size());
    for (String s : labels) {
        lowerCaseLabels.add(s.toLowerCase());
    }

    List<Tree> ret = new ArrayList<>();
    CAS c = pool.getCas();
    try {
        c.setDocumentText(text);
        tokenizer.process(c);

        CAS c2 = pool.getCas();
        try {
            for (Sentence sentence : JCasUtil.select(c.getJCas(), Sentence.class)) {
                Pair<String, MultiDimensionalMap<Integer, Integer, String>> stringsWithLabels =
                                ContextLabelRetriever.stringWithLabels(sentence.getCoveredText(), tf);
                c2.setDocumentText(stringsWithLabels.getFirst());

                tokenizer.process(c2);
                parser.process(c2);

                //build the tree based on this
                List<TopTreebankNode> nodes = new ArrayList<>(JCasUtil.select(c2.getJCas(), TopTreebankNode.class));
                if (nodes.isEmpty()) {
                    c2.reset();
                    continue;
                }
                if (nodes.size() > 1) {
                    log.warn("More than one top level node for a treebank parse. Only accepting first input node.");
                }

                Collection<String> labels2 = stringsWithLabels.getSecond().values();
                Set<String> diff = SetUtils.difference(labels2, lowerCaseLabels);
                if (!diff.isEmpty()) {
                    log.warn("Found invalid sentence. Skipping");
                    c2.reset();
                    continue;
                }

                TopTreebankNode node = nodes.get(0);
                ret.add(TreeFactory.buildTree(node, stringsWithLabels, lowerCaseLabels));
                // clear the scratch CAS for the next sentence
                c2.reset();
            }
        } finally {
            pool.releaseCas(c2);
        }
    } finally {
        pool.releaseCas(c);
    }

    return ret;
}
 
Example 18
Source File: TextLineWriter.java    From newsleak with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Records the document language in {@code langStats} and, for documents whose id
 * is contained in {@code sampleIdHash}, appends one line per sentence to
 * {@code linewriter}.
 * <p>
 * Each line has the form "docId TAB sentenceNumber TAB tokens", where the
 * tokens of the sentence are separated by single spaces and sentence numbers
 * start at 1.
 *
 * @param jcas
 *            the jcas
 * @throws AnalysisEngineProcessException
 *             declared by the overridden method
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {

	// The per-document summary line that used to be assembled here was never
	// written anywhere, and assembling it failed on documents without tokens,
	// so it is no longer computed.

	Metadata metadata = (Metadata) jcas.getAnnotationIndex(Metadata.type).iterator().next();
	langStats.put(metadata.getDocId(), jcas.getDocumentLanguage());

	if (sampleIdHash.contains(metadata.getDocId())) {
		Collection<Sentence> sentences = JCasUtil.selectCovered(jcas, Sentence.class, 0,
				jcas.getDocumentText().length());
		int i = 0;
		for (Sentence s : sentences) {
			i++;
			StringBuilder tOut = new StringBuilder();
			for (Token t : JCasUtil.selectCovered(jcas, Token.class, s.getBegin(), s.getEnd())) {
				tOut.append(t.getCoveredText()).append(' ');
			}
			// trim() drops the separator left behind after the last token
			String sOut = metadata.getDocId() + "\t" + i + "\t" + tOut.toString().trim();
			linewriter.append(sOut);
		}
	}

}
 
Example 19
Source File: SegmenterICU.java    From newsleak with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Flags paragraphs that are unlikely to be fulltext. For every paragraph with
 * more than TTR_MIN_LENGTH tokens the type-token ratio over its non-numeric
 * tokens is computed; if the ratio is below TTR_THRESHOLD the paragraph's
 * isNotFulltext feature is set to true, otherwise to false. Shorter paragraphs
 * are always set to false.
 *
 * @param jcas
 *            the jcas
 */
private void flagDubiousParagraphs(JCas jcas) {

	for (Paragraph par : JCasUtil.select(jcas, Paragraph.class)) {

		Collection<Token> parTokens = JCasUtil.selectCovered(jcas, Token.class, par.getBegin(), par.getEnd());
		boolean flagged = false;

		if (parTokens.size() > TTR_MIN_LENGTH) {

			// count non-numeric tokens and collect their distinct forms
			HashSet<String> types = new HashSet<String>();
			int counted = 0;
			for (Token tok : parTokens) {
				String form = tok.getCoveredText();
				if (!StringUtils.isNumeric(form)) {
					counted++;
					types.add(form);
				}
			}

			// type-token ratio; a very low value marks the paragraph
			double ratio = types.size() / (double) counted;
			if (ratio < TTR_THRESHOLD) {
				flagged = true;
				String parText = par.getCoveredText();
				int previewEnd = Math.min(parText.length(), 1000);
				log.log(Level.FINEST, "Unlikely fulltext paragraph flagged:\n----------------------------\n"
						+ parText.substring(0, previewEnd));
			}

		}

		par.setIsNotFulltext(flagged);
		par.addToIndexes();

	}

}
 
Example 20
Source File: DependencyGraph.java    From baleen with Apache License 2.0 2 votes vote down vote up
/**
 * Get the head node for the given annotation.
 *
 * <p>The word tokens covered by the annotation are collected and passed to the list-based
 * {@code getHeadNode} overload, which picks the head among them.
 *
 * @param annotation the entity whose covered word tokens are considered
 * @return the head node, as returned by the list-based overload
 */
public Optional<WordToken> getHeadNode(Entity annotation) {
  final List<WordToken> coveredTokens = JCasUtil.selectCovered(WordToken.class, annotation);
  final Optional<WordToken> head = getHeadNode(coveredTokens);
  return head;
}