Java Code Examples for org.apache.uima.jcas.JCas#setDocumentLanguage()

The following examples show how to use org.apache.uima.jcas.JCas#setDocumentLanguage(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage in the sidebar.
Example 1
Source File: NewsleakCsvStreamReader.java    From newsleak with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Fills the given CAS with the next record from the CSV record iterator.
 * <p>
 * Columns read from the record: 0 = external document id, 1 = body text,
 * 2 = timestamp, 3 = document language (optional; {@code defaultLanguage}
 * is used when the record has no fourth column).
 *
 * @param cas the CAS to populate
 * @throws IOException declared by the reader contract
 * @throws CollectionException if the JCas view of the CAS cannot be obtained
 */
public void getNext(CAS cas) throws IOException, CollectionException {
	currentRecord++;

	final JCas jcas;
	try {
		jcas = cas.getJCas();
	} catch (CASException e) {
		throw new CollectionException(e);
	}

	// Document text and language
	final CSVRecord row = recordsIterator.next();
	final String externalId = row.get(0);

	jcas.setDocumentText(cleanBodyText(row.get(1)));

	final String documentLanguage;
	if (row.size() > 3) {
		documentLanguage = row.get(3);
	} else {
		documentLanguage = defaultLanguage;
	}
	jcas.setDocumentLanguage(documentLanguage);

	// Basic metadata annotation (id and timestamp)
	final Metadata meta = new Metadata(jcas);
	meta.setDocId(externalId);
	meta.setTimestamp(row.get(2));
	meta.addToIndexes();

	// Any further metadata is assumed to be provided from external
	// processing in a separate file.
}
 
Example 2
Source File: Preprocessor.java    From termsuite-core with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new JCas for the given document and attaches its source
 * document information.
 *
 * @param doc          the document providing language code, URL and size
 * @param documentText the text to store in the CAS
 * @param nbDocuments  value stored in the source document information
 * @param corpusSize   value stored in the source document information
 * @return the initialised JCas
 * @throws TermSuiteException if UIMA fails while creating or initialising the JCas
 */
public static JCas toCas(Document doc, String documentText, int nbDocuments, long corpusSize) {
	try {
		final JCas jCas = JCasFactory.createJCas();
		jCas.setDocumentLanguage(doc.getLang().getCode());
		jCas.setDocumentText(documentText);

		final SourceDocumentInformation sourceInfo = JCasUtils.initJCasSDI(
				jCas, doc.getLang().getCode(), documentText, doc.getUrl(), doc.getSize());
		sourceInfo.setCorpusSize(corpusSize);
		sourceInfo.setNbDocuments(nbDocuments);

		return jCas;
	} catch (UIMAException e) {
		final String message = "Could not initialize JCas for document " + doc.getUrl();
		throw new TermSuiteException(message, e);
	}
}
 
Example 3
Source File: BaleenScheduler.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Populates the JCas with a placeholder document and a {@link JobSettings}
 * object carrying every entry of {@code config}.
 *
 * @param jCas the JCas to populate
 * @throws IOException declared by the reader contract
 * @throws CollectionException declared by the reader contract
 */
@Override
public final void getNext(final JCas jCas) throws IOException, CollectionException {
  getMonitor().startFunction("getNext");
  MetricsFactory.getInstance().getPipelineMetrics(monitor.getPipelineName()).startDocumentProcess();

  // The document text is just the simple name of the settings class.
  final String placeholderText = JobSettings.class.getSimpleName();
  jCas.setDocumentText(placeholderText);
  jCas.setDocumentLanguage("en");

  // Copy the whole configuration into the settings object bound to this JCas.
  final JobSettings jobSettings = new JobSettings(jCas);
  for (final Map.Entry<String, String> entry : config.entrySet()) {
    final String key = entry.getKey();
    final String value = entry.getValue();
    jobSettings.set(key, value);
  }

  getMonitor().finishFunction("getNext");
}
 
Example 4
Source File: JCasDeserialiser.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Populates the given JCas from a JSON-derived map.
 *
 * <p>The document text and language are read first (each defaulting to the empty string when
 * absent), then the document annotation node, then the annotations node. References between
 * annotations are resolved last, once all annotations exist.
 *
 * @param jCas to populate
 * @param input the map to deserialise
 */
public void deseralize(final JCas jCas, final Map<String, Object> input) {

  // Top-level document properties
  final String text = (String) input.getOrDefault(JsonJCas.DOCUMENT_TEXT, "");
  jCas.setDocumentText(text);
  final String language = (String) input.getOrDefault(JsonJCas.DOCUMENT_LANGUAGE, "");
  jCas.setDocumentLanguage(language);

  // Document annotation
  final DocumentAnnotation docAnnotation = UimaSupport.getDocumentAnnotation(jCas);
  final Map<String, Object> docAnnotationNode =
      (Map<String, Object>) input.get(JsonJCas.DOCUMENT_ANNOTATION);
  processDocumentAnnotation(jCas, docAnnotation, docAnnotationNode);

  // Annotations; features that reference other annotations are collected for later
  final List<Map<String, Object>> annotationNodes =
      (List<Map<String, Object>>) input.get(JsonJCas.ANNOTATIONS);
  final List<ReferencedFeatures> pendingReferences = processAnnotations(jCas, annotationNodes);

  // Resolve the collected references against an index of the created annotations
  final Map<Long, BaleenAnnotation> annotationsById = buildAnnotationIndex(jCas);
  for (final ReferencedFeatures pending : pendingReferences) {
    pending.rehydrate(jCas, annotationsById);
  }
}
 
Example 5
Source File: LanguageDetectionAnnotator.java    From bluima with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the language of the document text and stores it as the document
 * language of the CAS.
 * <p>
 * If the text is shorter than {@code minTextLenght} and a non-empty title
 * is available, the title is prepended before detection. Detection only
 * runs when the resulting text is longer than {@code minTextLenght}. A CAS
 * without document text is left untouched. Detection failures are logged
 * and otherwise ignored.
 *
 * @param jCas the CAS whose document language is set
 * @throws AnalysisEngineProcessException declared by the annotator contract
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    String title = getTitle(jCas);
    String text = jCas.getDocumentText();

    // Nothing to detect on a CAS without text. This check must precede any
    // text.length() call, otherwise a missing text causes an NPE.
    if (text == null) {
        return;
    }

    // add title to text if too small
    if (text.length() < minTextLenght && title != null && title.length() > 0) {
        text = title + " " + text;
    }

    // only detect if text is long enough
    if (text.length() > minTextLenght) {

        // TODO maybe cut if text too long --> slower
        try {

            jCas.setDocumentLanguage(detect(text));

        } catch (LangDetectException e) {
            LOG.warn("error detecting language for {}, {}",
                    getHeaderDocId(jCas), e);
        }
    }
}
 
Example 6
Source File: Tcf2DKPro.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * This method builds the document text from the
 * {@link eu.clarin.weblicht.wlfxb.tc.api.Token} annotation layer. The getText method of
 * {@link TextCorpusStreamed} is not used, as some tokens, such as special characters, are
 * represented differently than in the original text.
 * <p>
 * If the CAS already contains a document text, it is kept.
 * <p>
 * The document language is always set to the language of the TCF document; a language
 * already present in the CAS is overwritten.
 * 
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 */
public void convertText(JCas aJCas, TextCorpus aCorpusData)
{
    if (aJCas.getDocumentText() == null) {
        StringBuilder text = new StringBuilder();

        for (int i = 0; i < aCorpusData.getTokensLayer().size(); i++) {
            eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer()
                    .getToken(i);

            if (token.getStart() != null && token.getEnd() != null) {
                // Token carries offsets (assuming all tokens do): pad with spaces until the
                // text reaches the token's start offset.
                while (text.length() < token.getStart()) {
                    text.append(' ');
                }
            }
            else if (i > 0) {
                // No offsets (assuming none of the tokens has any): separate tokens by a
                // single space.
                text.append(' ');
            }

            text.append(token.getString());
        }
        aJCas.setDocumentText(text.toString());
    }

    // Unconditional: any language previously set on the CAS is replaced.
    aJCas.setDocumentLanguage(aCorpusData.getLanguage());
}
 
Example 7
Source File: FileSystemCollectionReader.java    From uima-uimaj with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the next file of the collection into the given CAS.
 * <p>
 * The file content becomes the document text, the configured language (if
 * any) becomes the document language, and a {@code SourceDocumentInformation}
 * annotation describing the file is added to the indexes.
 *
 * @param aCAS the CAS to populate
 * @throws IOException Signals that an I/O exception has occurred.
 * @throws CollectionException if the JCas view of the CAS cannot be obtained
 * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
 */
public void getNext(CAS aCAS) throws IOException, CollectionException {
  final JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }

  // Take the current file and advance the cursor, then read it in the configured encoding.
  final File sourceFile = (File) mFiles.get(mCurrentIndex++);
  final String contents = FileUtils.file2String(sourceFile, mEncoding);
  jcas.setDocumentText(contents);

  // The language is only set when one was configured explicitly.
  if (mLanguage != null) {
    jcas.setDocumentLanguage(mLanguage);
  }

  // Record where the document came from, so that downstream consumers can locate the
  // original file (for example, to write its location into a search index).
  final SourceDocumentInformation sourceInfo = new SourceDocumentInformation(jcas);
  // NOTE(review): File.toURL() is deprecated and does not escape characters that are
  // illegal in URLs; kept as is because changing it would alter the stored URI strings.
  final String sourceUri = sourceFile.getAbsoluteFile().toURL().toString();
  sourceInfo.setUri(sourceUri);
  sourceInfo.setOffsetInSource(0);
  sourceInfo.setDocumentSize((int) sourceFile.length());
  final boolean isLastFile = mCurrentIndex == mFiles.size();
  sourceInfo.setLastSegment(isLastFile);
  sourceInfo.addToIndexes();
}
 
Example 8
Source File: NewsleakElasticsearchReader.java    From newsleak with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Fills the given CAS with the next document fetched from the Elasticsearch
 * index by id.
 * <p>
 * The "Content" field becomes the document text and the "Created" field is
 * stored both as metadata timestamp and as document creation time
 * annotation. The record cursor is advanced afterwards.
 *
 * @param cas the CAS to populate
 * @throws IOException declared by the reader contract
 * @throws CollectionException if the JCas view of the CAS cannot be obtained
 */
public void getNext(CAS cas) throws IOException, CollectionException {
	final JCas jcas;
	try {
		jcas = cas.getJCas();
	} catch (CASException e) {
		throw new CollectionException(e);
	}

	// Fetch only the two fields needed from the index
	final String docId = totalIdList.get(currentRecord);
	final GetResponse hit = client
			.prepareGet(esIndex, ElasticsearchDocumentWriter.ES_TYPE_DOCUMENT, docId)
			.setFields("Content", "Created")
			.get();

	final String content = (String) hit.getField("Content").getValue();
	jcas.setDocumentText(content);
	jcas.setDocumentLanguage(language);

	// Metadata annotation: id and creation date
	final Metadata meta = new Metadata(jcas);
	meta.setDocId(docId);
	final String created = (String) hit.getField("Created").getValue();
	meta.setTimestamp(created);
	meta.addToIndexes();

	// Document creation time annotation (used by heideltime)
	final Dct creationTime = new Dct(jcas);
	creationTime.setValue(created);
	creationTime.addToIndexes();

	currentRecord++;

	logger.log(Level.FINEST, "Document ID: " + docId);
	logger.log(Level.FINEST, "Document Length: " + jcas.getDocumentText().length());
}
 
Example 9
Source File: UimaTests.java    From bluima with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a JCas for tests, holding the given text with the document
 * language set to "en".
 *
 * @param text the document text to store in the CAS
 * @return the new JCas
 * @throws UIMAException if the JCas cannot be created
 */
public static JCas getTestCas(String text) throws UIMAException {
    // The previously created TypeSystemDescription local was never used and has been removed.
    JCas jCas = JCasFactory.createJCas();
    jCas.setDocumentText(text);
    jCas.setDocumentLanguage("en");
    return jCas;
}
 
Example 10
Source File: OptimaizeLanguageDetectorAnalysisEngine.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the language of the document text and stores its string form as
 * the document language of the CAS.
 *
 * @param jCas the CAS to analyse and update
 * @throws AnalysisEngineProcessException wrapping the detector's
 *         unsupported-language exception
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  final String detectedLanguage;
  try {
    detectedLanguage = detector.detectLanguage(jCas.getDocumentText()).toString();
  } catch (AidaUnsupportedLanguageException e) {
    throw new AnalysisEngineProcessException(e);
  }
  jCas.setDocumentLanguage(detectedLanguage);
}
 
Example 11
Source File: MucReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Copies a MUC entry into the JCas: the language is fixed to "en", the entry text becomes the
 * document text and the entry id becomes the source URI of the document annotation.
 *
 * @param entry the entry to read from
 * @param jCas the JCas to populate
 */
@Override
protected void apply(MucEntry entry, JCas jCas) {
  jCas.setDocumentLanguage("en");

  final String entryText = entry.getText();
  jCas.setDocumentText(entryText);

  final String entryId = entry.getId();
  UimaSupport.getDocumentAnnotation(jCas).setSourceUri(entryId);
}
 
Example 12
Source File: Document.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Writes this document's settings into the given JCas as an
 * {@code AidaDocumentSettings} annotation, together with the mentions (if
 * any) and a {@code DocumentMetaData} annotation when none exists yet.
 *
 * @param jcas the JCas to populate
 * @throws IOException declared for callers; may be raised by the helpers invoked here
 * @throws IllegalArgumentException if the JCas already has a specific
 *         document language that differs from the language of these settings
 */
public void addSettingstoJcas(JCas jcas) throws IOException {
  AidaDocumentSettings ads = new AidaDocumentSettings(jcas);
  if (this.getLanguage() != null) {
    String settingsLanguage = this.getLanguage().toString();
    String casLanguage = jcas.getDocumentLanguage();
    // "x-unspecified" is treated as "no language set" and may be overwritten.
    if (casLanguage != null && !casLanguage.equals("x-unspecified") && !casLanguage.equals(settingsLanguage)) {
      throw new IllegalArgumentException("Language in JCas and language in settings are different");
    }
    ads.setLanguage(settingsLanguage);
    jcas.setDocumentLanguage(ads.getLanguage());
  }
  if (this.getDocChunkStrategy() != null) {
    ads.setDocChunkStrategy(this.getDocChunkStrategy().toString());
  }
  ads.setDocumentId(this.getDocumentId());
  // Check this document's input format, not the one on the freshly created settings
  // annotation: the latter has not been set yet, so testing it skipped the copy.
  if (this.getDocumentInputFormat() != null) {
    ads.setDocumentInputFormat(this.getDocumentInputFormat().toString());
  }
  ads.setEncoding(this.getEncoding());
  if (disambiguationSettings != null) {
    disambiguationSettings.addToJCas(ads, jcas);
  }
  ads.addToIndexes();
  if (annotations != null) {
    annotations.addMentionsToJCas(jcas);
  }
  if (!exists(jcas, DocumentMetaData.class)) {
    DocumentMetaData md = new DocumentMetaData(jcas);
    md.setDocumentId(ads.getDocumentId());
    md.addToIndexes();
  }
}
 
Example 13
Source File: DocumentLanguage.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the language detector on the document text and, when a language is detected, stores it
 * as the document language. The JCas is left unchanged otherwise.
 *
 * @param aJCas the JCas to analyse and update
 * @throws AnalysisEngineProcessException declared by the annotator contract
 */
@Override
public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
  final String documentText = aJCas.getDocumentText();
  final Optional<LdLocale> detected =
      languageDetector.detect(textObjectFactory.forText(documentText));

  if (!detected.isPresent()) {
    return;
  }

  final LdLocale locale = detected.get();
  aJCas.setDocumentLanguage(locale.getLanguage());
}
 
Example 14
Source File: PreprocessorService.java    From termsuite-core with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new JCas for the given document, sets its language and text and
 * initialises its source document information.
 *
 * @param document     the document providing language code, URL and size
 * @param documentText the text to store in the CAS
 * @return the initialised JCas
 * @throws UIMAException if the JCas cannot be created or initialised
 */
public JCas createCas(Document document, String documentText) throws UIMAException {
	final JCas jCas = JCasFactory.createJCas();

	jCas.setDocumentLanguage(document.getLang().getCode());
	jCas.setDocumentText(documentText);

	// The returned source document information is not needed here.
	JCasUtils.initJCasSDI(jCas, document.getLang().getCode(), documentText,
			document.getUrl(), document.getSize());

	return jCas;
}
 
Example 15
Source File: OptimaizeLanguageDetectorAnalysisEngine.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the language of the document text and stores its string form as
 * the document language of the CAS.
 *
 * @param jCas the CAS to analyse and update
 * @throws AnalysisEngineProcessException wrapping the detector's
 *         unsupported-language exception
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
  final String detectedLanguage;
  try {
    detectedLanguage = detector.detectLanguage(jCas.getDocumentText()).toString();
  } catch (AidaUnsupportedLanguageException e) {
    throw new AnalysisEngineProcessException(e);
  }
  jCas.setDocumentLanguage(detectedLanguage);
}
 
Example 16
Source File: CpePipelineTest.java    From uima-uimafit with Apache License 2.0 4 votes vote down vote up
/**
 * Stamps the CAS by setting its document language to {@code MARKER}.
 *
 * @param jCas the CAS to mark
 * @throws AnalysisEngineProcessException declared by the annotator contract
 */
@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
	final String marker = MARKER;
	jCas.setDocumentLanguage(marker);
}
 
Example 17
Source File: OneDocPerLineReader2.java    From bluima with Apache License 2.0 4 votes vote down vote up
/**
 * Populates the JCas from the current tab-separated line, which must have
 * exactly three columns: pmid, title and text.
 * <p>
 * A {@code Header} with the document id and title is indexed, the document
 * text is the title followed by the text (joined by ". ", or by a single
 * space if the title already ends with a period), and the language is "en".
 *
 * @param jCas the JCas to populate
 * @throws IOException declared by the reader contract
 * @throws CollectionException declared by the reader contract
 */
public void getNext(JCas jCas) throws IOException, CollectionException {
    final String[] columns = nextLine.split("\t");
    checkEquals(3, columns.length, "pmid" + columns[0]);

    // Only the pmid additionally has its double quotes stripped.
    final String pmid = unescapeCsv(columns[0]).replace("\"", "");
    final String title = unescapeCsv(columns[1]);
    final String txt = unescapeCsv(columns[2]);

    final Header docHeader = new Header(jCas);
    docHeader.setDocId(pmid);
    docHeader.setTitle(title);
    docHeader.addToIndexes();

    final String separator = title.endsWith(".") ? " " : ". ";
    jCas.setDocumentText(title + separator + txt);

    jCas.setDocumentLanguage("en");
}
 
Example 18
Source File: LanguageAnnotator.java    From ambiverse-nlu with Apache License 2.0 4 votes vote down vote up
/**
 * Sets the document language of the CAS to the value of {@code language}.
 *
 * @param aJCas the CAS to update
 * @throws AnalysisEngineProcessException declared by the annotator contract
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    final String documentLanguage = language;
    aJCas.setDocumentLanguage(documentLanguage);
}
 
Example 19
Source File: WordPressXMLReader.java    From ambiverse-nlu with Apache License 2.0 4 votes vote down vote up
/**
 * Reads one sub-document from the XML stream into the given JCas.
 * <p>
 * Sets the document language when one is configured, walks the stream while
 * passing element text to {@code processText} (which receives the shared
 * {@code documentText} builder), then sets the document text and creates a
 * {@code DocumentMetaData} carrying document id, URI and collection id.
 * <p>
 * NOTE(review): the local names look like decompiler output and do not match
 * their use inside the loop: {@code fileName} holds a tag name there,
 * {@code docUri} holds collected element text, and {@code dotPlace} holds
 * either a document-id candidate or text.
 *
 * @param jcas the JCas to populate
 * @throws XMLStreamException declared for the XML stream operations used here
 * @throws IOException declared by this method
 * @throws CollectionException if more than one document id is found, a
 *         document id is empty, or {@code docIdTag} is set but no id was found
 */
private void parseSubDocument(JCas jcas) throws XMLStreamException, IOException, CollectionException {
  if (this.language != null) {
    jcas.setDocumentLanguage(this.language);
  }

  // Raw list used as a stack of open tag names via push / peek / poll.
  LinkedList openTagStack = new LinkedList();
  String docTag = this.seekSubDocumentRoot();
  StringBuilder documentText = new StringBuilder();
  String docId = null;

  String fileName;
  String docUri;
  while (this.xmlReader.hasNext() && this.xmlReader.getDepth() > 1) {
    String dotPlace;
    if (this.xmlReader.isStartElement()) {
      // Here fileName is the element's tag name, as "prefix:local" when a prefix is present.
      if (!this.xmlReader.getPrefix().isEmpty()) {
        fileName = this.xmlReader.getPrefix() + ":" + this.xmlReader.getName().getLocalPart();
      } else {
        fileName = this.xmlReader.getName().getLocalPart();
      }
      openTagStack.push(fileName);
      // Here dotPlace is the document-id candidate found on this element, if any.
      dotPlace = null;
      if (this.isDocIdElement(fileName) && this.docIdAttributeName != null) {
        // Id comes from the configured attribute of the id element.
        dotPlace = this.xmlReader.getAttributeValue((String) null, this.docIdAttributeName);
      }

      this.xmlReader.next();
      // Here docUri is the text collected after the start tag.
      docUri = this.collectText();
      if (docUri.length() > 0) {
        if (this.isDocIdElement(fileName) && this.docIdAttributeName == null) {
          // No id attribute configured: the element's text is the id.
          dotPlace = docUri;
        }
        this.processText(jcas, fileName, docUri, documentText);
      }

      if (dotPlace != null) {
        // Exactly one non-empty id is allowed per sub-document.
        if (docId != null) {
          throw new CollectionException("multiple_doc_id_error", new Object[] { this.docIdTag });
        }

        if (dotPlace.length() == 0) {
          throw new CollectionException("empty_doc_id_error", new Object[] { this.docIdTag });
        }

        docId = dotPlace;
      }
    } else if (this.xmlReader.isCharacters()) {
      // Character data is attributed to the tag on top of the stack.
      fileName = (String) openTagStack.peek();
      dotPlace = this.collectText();
      if (dotPlace.length() != 0) {
        this.processText(jcas, fileName, dotPlace, documentText);
      }
    } else if (this.xmlReader.isEndElement()) {
      fileName = this.xmlReader.getName().getLocalPart();
      // The end tag of the sub-document root ends parsing.
      if (docTag.equals(fileName)) {
        this.xmlReader.nextTag();
        break;
      }
      openTagStack.poll();
      this.xmlReader.next();
    } else if (this.xmlReader.getEventType() == XMLStreamConstants.CDATA) {
      // CDATA is handled like character data, but read directly with getText().
      fileName = (String) openTagStack.peek();
      dotPlace = this.xmlReader.getText();
      if (dotPlace.length() != 0) {
        this.processText(jcas, fileName, dotPlace, documentText);
      }
      this.xmlReader.next();
    }
  }

  jcas.setDocumentText(documentText.toString());
  // From here on fileName really is the name of the file being parsed.
  fileName = ((File) this.xmlFiles.get(this.currentParsedFile)).getName();
  // 46 is the character code of '.'; this locates the file extension.
  int dotPlace1 = fileName.lastIndexOf(46);
  if (this.docIdTag != null) {
    if (docId == null) {
      throw new CollectionException("de.tudarmstadt.ukp.dkpro.core.io.xml.XmlReader_Messages", "missing_doc_id_error",
          new Object[] { this.docIdTag });
    }
  } else if (dotPlace1 >= 0) {
    // No id tag configured: use the file name without extension plus the value of iDoc.
    docId = fileName.substring(0, dotPlace1) + "-" + this.iDoc;
  }
  // NOTE(review): with no docIdTag and no '.' in the file name, docId is not assigned here
  // and may still be null below, which would yield a URI ending in "#null" - confirm.

  docUri = ((File) this.xmlFiles.get(this.currentParsedFile)).toURI().toString();
  DocumentMetaData docMetaData = DocumentMetaData.create(jcas);
  docMetaData.setDocumentId(docId);
  docMetaData.setDocumentUri(docUri + "#" + docId);
  docMetaData.setCollectionId(this.collectionId);
}
 
Example 20
Source File: JSONReader.java    From ambiverse-nlu with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the next JSON resource and fills the given CAS with it.
 * <p>
 * The document text is the title (if a title key is configured and the
 * field is present) followed by two line separators and the content. The
 * document id is taken from the id field, falling back to the resource path
 * when no id key is configured or the field is absent. A field that is
 * missing or holds a JSON null is treated as absent and logged at debug
 * level instead of causing a {@code NullPointerException}.
 *
 * @param cas the CAS to populate
 * @throws IOException if the resource cannot be read or parsed
 * @throws CollectionException if the JCas view of the CAS cannot be obtained
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
  Resource res = nextFile();
  String name = res.getPath();
  JsonNode json = objectMapper.readTree(res.getInputStream());

  String title = fieldText(json, titleKey);
  String content = fieldText(json, contentKey);

  StringBuilder sb = new StringBuilder();
  if (title != null) {
    sb.append(title).append(System.lineSeparator()).append(System.lineSeparator());
  } else {
    logger.debug(res.getPath() + " does not have a title field.");
  }

  if (content != null) {
    sb.append(content);
  } else {
    logger.debug(res.getPath() + " does not have a content field.");
  }

  JCas jcas;
  try {
    jcas = cas.getJCas();
  }
  catch (CASException e) {
    throw new CollectionException(e);
  }

  // Set doc id, falling back to the resource path.
  String id = fieldText(json, idKey);
  if (id == null) {
    id = name;
  }
  DocumentMetaData dmd = new DocumentMetaData(jcas);
  // Populate the annotation before it is added to the indexes.
  dmd.setDocumentId(id);
  dmd.addToIndexes();
  jcas.setDocumentLanguage(getLanguage());
  jcas.setDocumentText(sb.toString());
}

/**
 * Returns the text of field {@code key} in {@code json}, or {@code null}
 * when the key is not configured, the document is empty, or the field is
 * missing or holds a JSON null.
 */
private static String fieldText(JsonNode json, String key) {
  if (json == null || key == null) {
    return null;
  }
  JsonNode node = json.get(key);
  if (node == null || node.isNull()) {
    return null;
  }
  return node.asText();
}