org.apache.uima.collection.CollectionException Java Examples

The following examples show how to use org.apache.uima.collection.CollectionException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FixedDelayTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the timing of a {@code FixedDelay} scheduler created with {@code period = 1}.
 *
 * <p>Three {@code hasNext()} calls, separated by two one-second sleeps, must all return true and
 * the whole sequence must take between 3900 and 4100 milliseconds.
 */
@Test
@SuppressWarnings("squid:S2925" /* sleep required for test */)
public void testDelay()
    throws CollectionException, IOException, ResourceInitializationException,
        InterruptedException {
  FixedDelay scheduler = create("period", "1");

  long start = System.currentTimeMillis();
  assertTrue(scheduler.hasNext());

  Thread.sleep(1000);

  assertTrue(scheduler.hasNext());

  Thread.sleep(1000);

  assertTrue(scheduler.hasNext());
  long end = System.currentTimeMillis();

  // The elapsed time is reported through the assertion message on failure, so the test does not
  // print it to the console.
  long diff = end - start;
  assertTrue(String.format("Diff was %d", diff), diff >= 3900 && diff <= 4100);
}
 
Example #2
Source File: ReNounSeedDocument.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Sets the {@code supplied} flag and uses the eight seed sentences, each separated by
 * {@code SEP}, as the document text of the given JCas.
 */
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
  supplied = true;

  StringBuilder text = new StringBuilder();
  text.append(SENTENCE_1).append(SEP);
  text.append(SENTENCE_2).append(SEP);
  text.append(SENTENCE_3).append(SEP);
  text.append(SENTENCE_4).append(SEP);
  text.append(SENTENCE_5).append(SEP);
  text.append(SENTENCE_6).append(SEP);
  text.append(SENTENCE_7).append(SEP);
  // the last sentence is not followed by a separator
  text.append(SENTENCE_8);

  jCas.setDocumentText(text.toString());
}
 
Example #3
Source File: AbstractTermSuiteCollectionReader.java    From termsuite-core with Apache License 2.0 6 votes vote down vote up
/**
 * Loads a file into the CAS and indexes a {@code SourceDocumentInformation} annotation for it.
 *
 * <p>The document text is the file content after {@code preparator.prepare}; the annotation
 * records the file URI, sizes, document index and whether this is the last file in
 * {@code mFiles}.
 *
 * @param cas the CAS to populate
 * @param file the file whose content becomes the document text
 * @throws IOException declared for the file read performed by {@code getDocumentText}
 * @throws CollectionException if the JCas cannot be obtained from the CAS
 */
protected void fillCas(CAS cas, File file) throws IOException, CollectionException {
	final String documentUri = file.toURI().toString();
	try {
		final SourceDocumentInformation info = new SourceDocumentInformation(cas.getJCas());
		info.setUri(documentUri);

		final String rawText = getDocumentText(file.getAbsolutePath(), this.mEncoding);
		cas.setDocumentLanguage(mLanguage.getCode());
		cas.setDocumentText(preparator.prepare(rawText));

		// size bookkeeping
		info.setDocumentSize((int) file.length());
		info.setCumulatedDocumentSize(this.currentFileByteSize);
		info.setCorpusSize(this.totalFileByteSize);

		// the end offset is taken from the raw text, not from the prepared text
		info.setBegin(0);
		info.setEnd(rawText.length());
		info.setOffsetInSource(0);

		// position of this document within the collection
		info.setDocumentIndex(mCurrentIndex);
		info.setNbDocuments(this.mFiles.size());
		final boolean isLast = mCurrentIndex == mFiles.size() - 1;
		info.setLastSegment(isLast);

		info.addToIndexes();
	} catch (CASException casFailure) {
		throw new CollectionException(casFailure);
	}
}
 
Example #4
Source File: WebannoTsv2Reader.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the next file into the given JCas.
 *
 * <p>The CAS is initialised from the resource returned by {@code nextFile()}, then the
 * resource's input stream is handed to {@code convertToCas}. The stream is always passed to
 * {@code closeQuietly} afterwards, whether or not the conversion succeeded.
 */
@Override
public void getNext(JCas aJCas)
    throws IOException, CollectionException
{
    final Resource resource = nextFile();
    initCas(aJCas, resource);

    InputStream stream = null;
    try {
        stream = resource.getInputStream();
        convertToCas(aJCas, stream, encoding);
    }
    finally {
        // stream is still null here if getInputStream() itself failed
        closeQuietly(stream);
    }
}
 
Example #5
Source File: ActiveMQReader.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Receives the next message from the consumer and extracts its text content into the JCas.
 *
 * <p>The source name passed to {@code extractContent} is the ActiveMQ resource name and the
 * endpoint joined with a dot.
 *
 * @param jCas the JCas to populate
 * @throws IOException if no message was received, or the message is not a {@code TextMessage}
 * @throws CollectionException if a {@code JMSException} occurs while receiving or reading the
 *     message
 */
@Override
protected void doGetNext(final JCas jCas) throws IOException, CollectionException {
  final String source = String.join(".", activeMQ.getResourceName(), endpoint);

  try {
    final Message msg = consumer.receive();
    if (msg == null) {
      // receive() returns null when the consumer is closed while waiting. Without this check the
      // else branch below would dereference msg and fail with a NullPointerException.
      throw new IOException(String.format("No message received from source %s", source));
    }

    if (msg instanceof TextMessage) {
      final String text = ((TextMessage) msg).getText();
      final InputStream is = IOUtils.toInputStream(text, Charset.defaultCharset());
      extractContent(is, source, jCas);
    } else {
      throw new IOException(
          String.format(
              "Unexpected message type for message with id %s from source %s",
              msg.getJMSMessageID(), source));
    }
  } catch (final JMSException e) {
    throw new CollectionException(e);
  }
}
 
Example #6
Source File: WebannoTsv3Reader.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the next file into the supplied JCas.
 *
 * <p>Calls {@code initCas} with the resource from {@code nextFile()} and then
 * {@code convertToCas} on that resource's input stream. {@code closeQuietly} is invoked on the
 * stream in all cases.
 */
@Override
public void getNext(JCas aJCas)
    throws IOException, CollectionException
{
    final Resource source = nextFile();
    initCas(aJCas, source);

    InputStream input = null;
    try {
        input = source.getInputStream();
        convertToCas(aJCas, input, encoding);
    }
    finally {
        // input remains null when the stream could not be opened
        closeQuietly(input);
    }
}
 
Example #7
Source File: CompressedXmiReader.java    From argument-reasoning-comprehension-task with Apache License 2.0 6 votes vote down vote up
/**
 * Deserializes the current tar entry as XMI into the given CAS and advances to the next entry.
 *
 * <p>The entry content is first copied fully into memory, then the reader moves on via
 * {@code fastForwardToNextValidEntry()}, and only then is the buffered content deserialized.
 *
 * @throws IOException if copying fails, or wrapping a {@code SAXException} from deserialization
 */
@Override
public void getNext(CAS aCAS)
        throws IOException, CollectionException
{
    // nextTarEntry cannot be null here!
    final ByteArrayOutputStream entryBytes = new ByteArrayOutputStream();
    final int byteCount = IOUtils.copy(tarArchiveInputStream, entryBytes);

    final String name = nextTarEntry.getName();
    getLogger().debug("Loaded " + byteCount + " bytes from " + name);

    // advance the archive position before parsing the buffered content
    fastForwardToNextValidEntry();

    // fill the CAS from the buffered XMI
    try {
        XmiCasDeserializer.deserialize(
                new ByteArrayInputStream(entryBytes.toByteArray()), aCAS, lenient);
    }
    catch (SAXException parseFailure) {
        throw new IOException(parseFailure);
    }
}
 
Example #8
Source File: WebannoTsv1Reader.java    From webanno with Apache License 2.0 6 votes vote down vote up
/**
 * Populates the given JCas from the next file.
 *
 * <p>The resource obtained from {@code nextFile()} is used to initialise the CAS, and its input
 * stream is converted with {@code convertToCas}. The stream is passed to {@code closeQuietly}
 * on both the success and the failure path.
 */
@Override
public void getNext(JCas aJCas)
    throws IOException, CollectionException
{
    final Resource file = nextFile();
    initCas(aJCas, file);

    InputStream content = null;
    try {
        content = file.getInputStream();
        convertToCas(aJCas, content, encoding);
    }
    finally {
        // content is null if opening the stream threw
        closeQuietly(content);
    }
}
 
Example #9
Source File: JCasPoolIterable.java    From ambiverse-nlu with Apache License 2.0 6 votes vote down vote up
/**
 * Reports whether the underlying collection reader has another document.
 *
 * <p>Returns false once this object is destroyed. If the reader's {@code hasNext()} fails for
 * any reason and {@code selfDestroy} is set, {@code destroy()} is called before the failure
 * propagates.
 *
 * @throws IllegalStateException wrapping a {@code CollectionException} or {@code IOException}
 *     thrown by the reader
 */
public boolean hasNext() {
  if (this.destroyed) {
    return false;
  }

  // stays true unless the reader call completes normally
  boolean failed = true;
  try {
    final boolean more = this.collectionReader.hasNext();
    failed = false;
    return more;
  } catch (CollectionException | IOException cause) {
    throw new IllegalStateException(cause);
  } finally {
    if (failed && this.selfDestroy) {
      this.destroy();
    }
  }
}
 
Example #10
Source File: FixedRateTest.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Checks the timing of a {@code FixedRate} scheduler created with {@code period = 1}.
 *
 * <p>Three {@code hasNext()} calls, separated by two one-second sleeps, must all return true and
 * the whole sequence must take between 1900 and 2100 milliseconds.
 */
@Test
@SuppressWarnings("squid:S2925" /* sleep required for test */)
public void testDelay()
    throws CollectionException, IOException, ResourceInitializationException,
        InterruptedException {
  final FixedRate scheduler = create("period", "1");

  final long startedAt = System.currentTimeMillis();
  assertTrue(scheduler.hasNext());
  Thread.sleep(1000);
  assertTrue(scheduler.hasNext());
  Thread.sleep(1000);
  assertTrue(scheduler.hasNext());
  final long elapsed = System.currentTimeMillis() - startedAt;

  final boolean withinTolerance = elapsed >= 1900 && elapsed <= 2100;
  assertTrue(String.format("Diff was %d", elapsed), withinTolerance);
}
 
Example #11
Source File: SqlDbCellReader.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Ensures {@code currRow} holds the cells of the next row, if there is one.
 *
 * <p>When {@code currRow} is empty, the current result set is advanced. If it is exhausted,
 * {@code getNextTable()} is called and the row counter is reset. Tables that yield no rows are
 * skipped, so the loop keeps moving on until a row is found or {@code getNextTable()} returns
 * false.
 *
 * @return true if {@code currRow} contains at least one cell after the call
 * @throws IOException wrapping any {@code SQLException} raised by the result set
 */
@Override
public boolean doHasNext() throws IOException, CollectionException {
  if (currRow.isEmpty()) {
    try {
      // Previously the result of the second next() call was ignored, so an empty table led to
      // getObject() being called on a cursor with no current row.
      // NOTE(review): assumes getNextTable() replaces rsCurrTable and columns - confirm.
      while (!rsCurrTable.next()) {
        if (!getNextTable()) {
          return false;
        }
        rowId = 0;
      }

      rowId++;
      for (String col : columns) {
        currRow.put(col, rsCurrTable.getObject(col));
      }
    } catch (SQLException se) {
      throw new IOException(se);
    }
  }
  return !currRow.isEmpty();
}
 
Example #12
Source File: BaleenScheduler.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Fills the JCas with a {@code JobSettings} annotation holding every entry of {@code config}.
 *
 * <p>The document text is set to the simple class name of {@code JobSettings} and the language
 * to {@code "en"}. The call is bracketed by monitor start/finish calls, and the pipeline metrics
 * are told that document processing has started.
 */
@Override
public final void getNext(final JCas jCas) throws IOException, CollectionException {
  getMonitor().startFunction("getNext");
  MetricsFactory.getInstance()
      .getPipelineMetrics(monitor.getPipelineName())
      .startDocumentProcess();

  final String placeholderText = JobSettings.class.getSimpleName();
  jCas.setDocumentText(placeholderText);
  jCas.setDocumentLanguage("en");

  // copy the scheduler configuration into the job settings
  final JobSettings jobSettings = new JobSettings(jCas);
  for (final Map.Entry<String, String> option : config.entrySet()) {
    final String name = option.getKey();
    final String value = option.getValue();
    jobSettings.set(name, value);
  }

  getMonitor().finishFunction("getNext");
}
 
Example #13
Source File: SqlCellReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next cell (one column of one row) from the table and extracts it into the JCas.
 *
 * <p>When no columns are pending for the current row, the next id is taken from
 * {@code idsToProcess} and all columns are queued again. The first pending column is then
 * queried for the current id, and its value (via {@code toString()}) is passed to
 * {@code extractContent}.
 *
 * @param jCas the JCas to populate
 * @throws IOException if the query fails or returns no row
 */
@Override
@SuppressWarnings(
    "squid:S2077" /* The value of col is read from the database column names and so should be safe to use in this context */)
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
  if (colsToProcess.isEmpty()) {
    // Get next row
    currId = idsToProcess.remove(0);

    colsToProcess.addAll(allCols);
  }

  String col = colsToProcess.remove(0);

  final String query =
      "SELECT `" + col + "` FROM `" + table + "` WHERE `" + idColumn + "` = " + currId;

  String content;

  // Both the statement and the result set are managed here; previously only the result set was
  // closed, which leaked one PreparedStatement per cell.
  try (java.sql.PreparedStatement statement = conn.prepareStatement(query);
      ResultSet rs = statement.executeQuery()) {
    if (rs.next()) {
      content = rs.getObject(col).toString();
    } else {
      throw new IOException("Unable to get cell content - query returned no results");
    }

  } catch (SQLException e) {
    throw new IOException("Unable to get cell content", e);
  }

  String sourceUrl = sqlConn.substring(5) + "." + table + "#" + currId + "." + col;

  extractContent(
      new ByteArrayInputStream(content.getBytes(Charset.defaultCharset())), sourceUrl, jCas);
}
 
Example #14
Source File: Conll2003AidaReader.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Reads one sentence from {@code reader} as a list of tab-separated field arrays.
 *
 * <p>Reading stops at a blank line, at the end of input, or, when {@code sentenceEnd} is
 * {@code DOT}, after a line whose first field is {@code "."} and whose second field is not
 * {@code "dummy"}.
 *
 * @return the lines of the sentence; {@code null} if no line is available, or if a DOCSTART line
 *     is met in one-file mode (in which case {@code nextDocId} is updated from that line)
 * @throws RuntimeException if a DOCSTART line is met when not in one-file mode
 */
private List<String[]> readSentence()
        throws IOException, CollectionException {
    if (!reader.hasNextLine()) {
        return null;
    }

    final List<String[]> sentence = new ArrayList<>();
    while (reader.hasNextLine()) {
        final String current = reader.nextLine();

        if (current.contains("DOCSTART")) {
            if (!isOneFile) {
                throw new RuntimeException("There are more than DOCSTART in one document!");
            }
            nextDocId = parseDocId(current);
            return null;
        }

        if (StringUtils.isBlank(current)) {
            // a blank line terminates the sentence
            break;
        }

        final String[] columns = current.split("\t");
        sentence.add(columns);

        final boolean dotEndsSentence = sentenceEnd == SentenceEndType.DOT;
        if (dotEndsSentence && ".".equals(columns[0]) && !"dummy".equals(columns[1])) {
            break;
        }
    }
    return sentence;
}
 
Example #15
Source File: SqlCellReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether another cell is available.
 *
 * <p>True if columns or ids are still pending. Otherwise {@code getIds(currId)} is asked for
 * more ids and the result is true only if it supplied any.
 */
@Override
public boolean doHasNext() throws IOException, CollectionException {
  final boolean workPending = !colsToProcess.isEmpty() || !idsToProcess.isEmpty();
  if (workPending) {
    return true;
  }

  // nothing buffered: try to load another batch of ids
  idsToProcess.addAll(getIds(currId));
  return !idsToProcess.isEmpty();
}
 
Example #16
Source File: BaleenCollectionReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Override of the UIMA hasNext() method that delegates to {@code doHasNext()}, bracketing the
 * call with start/finish function-trace calls on the monitor.
 *
 * @return the value returned by {@code doHasNext()}
 */
@Override
public final boolean hasNext() throws IOException, CollectionException {
  monitor.startFunctionTrace("hasNext");
  final boolean documentAvailable = doHasNext();
  monitor.finishFunctionTrace("hasNext");

  return documentAvailable;
}
 
Example #17
Source File: NewsleakElasticsearchReader.java    From newsleak with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Loads the document at position {@code currentRecord} of {@code totalIdList} into the CAS.
 *
 * <p>The "Content" field becomes the document text and the "Created" field is stored both in a
 * {@code Metadata} annotation (with the document id) and in a {@code Dct} annotation. The
 * record position is advanced afterwards.
 *
 * @throws CollectionException if the JCas cannot be obtained from the CAS
 */
public void getNext(CAS cas) throws IOException, CollectionException {
	final JCas view;
	try {
		view = cas.getJCas();
	} catch (CASException casFailure) {
		throw new CollectionException(casFailure);
	}

	final String id = totalIdList.get(currentRecord);
	final GetResponse hit = client
			.prepareGet(esIndex, ElasticsearchDocumentWriter.ES_TYPE_DOCUMENT, id)
			.setFields("Content", "Created")
			.get();

	final String content = (String) hit.getField("Content").getValue();
	view.setDocumentText(content);
	view.setDocumentLanguage(language);

	// document metadata: id and creation timestamp
	final Metadata meta = new Metadata(view);
	meta.setDocId(id);
	final String created = (String) hit.getField("Created").getValue();
	meta.setTimestamp(created);
	meta.addToIndexes();

	// heideltime
	final Dct creationTime = new Dct(view);
	creationTime.setValue(created);
	creationTime.addToIndexes();

	currentRecord++;

	logger.log(Level.FINEST, "Document ID: " + id);
	logger.log(Level.FINEST, "Document Length: " + view.getDocumentText().length());
}
 
Example #18
Source File: HooverElasticsearchReader.java    From newsleak with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Reports whether records remain and, if so, advances {@code currentRecord} by one.
 *
 * <p>Note that this method has a side effect: every call that returns true consumes one record
 * position.
 */
public boolean hasNext() throws IOException, CollectionException {
	final boolean remaining = currentRecord < totalRecords;
	if (remaining) {
		currentRecord++;
	}
	return remaining;
}
 
Example #19
Source File: Conll2003ReaderTcBmeow.java    From ambiverse-nlu with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the classification outcome label for a unit from the named entity covering it.
 *
 * <p>If no named entity covers the unit, the outcome is {@code "OTH"}. If exactly one does, the
 * outcome is the entity value with a position prefix: {@code "W-"} when the entity covers a
 * single token, otherwise {@code "B-"}, {@code "M-"} or {@code "E-"} depending on whether the
 * unit matches the first, a middle or the last covered token. No prefix is added if no covered
 * token matches the unit.
 *
 * @param jcas the JCas holding the annotations
 * @param unit the classification target
 * @return the outcome label
 * @throws CollectionException if more than one named entity covers the unit
 */
@Override
public String getTextClassificationOutcome(JCas jcas, TextClassificationTarget unit) throws CollectionException {
	List<NamedEntity> neList = JCasUtil.selectCovering(jcas, NamedEntity.class, unit);

	if (neList.isEmpty()) {
		return "OTH";
	}
	if (neList.size() > 1) {
		// Wrap a specific exception type rather than a raw Throwable.
		throw new CollectionException(
				new IllegalStateException("Could not get unique NER annotation to be used as TC outcome. List size: " + neList.size() + " " + unit.getCoveredText()));
	}

	NamedEntity ne = neList.get(0);
	List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class, ne);

	// StringBuilder suffices for a method-local buffer.
	StringBuilder outcome = new StringBuilder();
	if (tokens.size() == 1) {
		outcome.append("W-");
	} else {
		// The unit's text and offset do not change inside the loop, so read them once.
		String unitText = unit.getCoveredText();
		int unitBegin = unit.getBegin();
		int lastIndex = tokens.size() - 1;
		for (int i = 0; i < tokens.size(); i++) {
			Token token = tokens.get(i);
			if (token.getCoveredText().equals(unitText) && token.getBegin() == unitBegin) {
				if (i == 0) {
					outcome.append("B-");
				} else if (i < lastIndex) {
					outcome.append("M-");
				} else {
					outcome.append("E-");
				}
			}
		}
	}
	outcome.append(ne.getValue());

	return outcome.toString();
}
 
Example #20
Source File: OnceTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a {@code Once} scheduler reports true on the first {@code hasNext()} call and
 * false on the following two.
 */
@Test
public void test() throws CollectionException, IOException, ResourceInitializationException {
  final Once scheduler = create();

  // first call
  assertTrue(scheduler.hasNext());

  // subsequent calls
  assertFalse(scheduler.hasNext());
  assertFalse(scheduler.hasNext());
}
 
Example #21
Source File: BaleenCollectionReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to {@code doGetNext(jCas)} with monitoring around it.
 *
 * <p>Before delegating, the monitor is told the function has started and the pipeline metrics
 * are told a document process has started. Afterwards the monitor is told the function has
 * finished and is asked to persist its counts.
 */
@Override
public final void getNext(JCas jCas) throws IOException, CollectionException {
  monitor.startFunction("getNext");
  MetricsFactory.getInstance().getPipelineMetrics(monitor.getPipelineName()).startDocumentProcess();

  doGetNext(jCas);

  monitor.finishFunction("getNext");
  monitor.persistCounts();
}
 
Example #22
Source File: CsvFolderReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Drains all pending keys from the WatchService, passing each of their events to
 * {@code processEvent} and marking the "events" meter once per event, then resets each key.
 *
 * @return true if {@code currLines} or {@code queue} is non-empty afterwards
 */
@Override
public boolean doHasNext() throws IOException, CollectionException {
  for (WatchKey pending = watcher.poll(); pending != null; pending = watcher.poll()) {
    for (WatchEvent<?> change : pending.pollEvents()) {
      processEvent(pending, change);
      getMonitor().meter("events").mark();
    }

    pending.reset();
  }

  final boolean nothingLeft = currLines.isEmpty() && queue.isEmpty();
  return !nothingLeft;
}
 
Example #23
Source File: MongoReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Takes the next id from the queue, loads that document from Mongo and extracts it into the
 * JCas.
 *
 * <p>The value of {@code contentField} is used as the content. Every other field except the id
 * field is passed to {@code processMongoMetadataField}. If {@code deleteSource} is set, the
 * document is deleted from the collection afterwards.
 *
 * @throws CollectionException if Mongo returns no document for the id
 */
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
  final ObjectId nextId = queue.remove(0);

  final Document byId = new Document(idField, nextId);
  final Document found = coll.find(byId).first();
  if (found == null) {
    getMonitor().error("No document returned from Mongo");
    throw new CollectionException();
  }

  final String text = (String) found.get(contentField);
  final InputStream textStream = IOUtils.toInputStream(text, Charset.defaultCharset());
  final String sourceUrl = mongo.getMongoURI() + "." + collection + "#" + nextId;
  extractContent(textStream, sourceUrl, jCas);

  // everything that is neither the content nor the id is treated as metadata
  for (Entry<String, Object> field : found.entrySet()) {
    final String name = field.getKey();
    final boolean reserved = contentField.equals(name) || idField.equals(name);
    if (!reserved) {
      processMongoMetadataField(jCas, name, field.getValue());
    }
  }

  if (deleteSource) {
    coll.deleteOne(byId);
  }
}
 
Example #24
Source File: SqlDbCellReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Removes one cell from {@code currRow} and extracts its string form into the JCas.
 *
 * <p>The cell is whichever key the key set's iterator yields first. The source URL is built
 * from the connection string (minus its first five characters), the table, the row id and the
 * column key.
 */
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
  final String column = currRow.keySet().iterator().next();
  final Object cell = currRow.remove(column);

  final String sourceUrl = sqlConn.substring(5) + "." + currTable + "#" + rowId + "." + column;

  final byte[] cellBytes = cell.toString().getBytes(Charset.defaultCharset());
  extractContent(new ByteArrayInputStream(cellBytes), sourceUrl, jCas);
}
 
Example #25
Source File: FolderReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Takes the first path from the queue, opens it as a file and extracts its content into the
 * JCas. The file stream is closed when extraction finishes.
 *
 * @throws CollectionException if the queue is empty
 */
@Override
public void doGetNext(JCas jCas) throws IOException, CollectionException {
  if (queue.isEmpty()) {
    getMonitor().error("No documents on the queue - this method should not have been called");
    throw new CollectionException();
  }

  final Path next = queue.remove(0);
  final String location = next.toString();
  getMonitor().info("Processing file {}", location);

  try (InputStream fileStream = new FileInputStream(next.toFile())) {
    extractContent(fileStream, location, jCas);
  }
}
 
Example #26
Source File: MboxReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Processes either the next pending attachment or, if none is pending, the next mbox message.
 *
 * <p>For a message, the body is handled by {@code processBody} when it is a {@code SingleBody}
 * and by {@code processMultipart} when it is a {@code Multipart}.
 *
 * @throws IOException if the message body is of neither kind, or processing reports that no body
 *     was handled
 */
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
  if (!attachments.isEmpty()) {
    // pending attachments take priority over new messages
    final Map.Entry<String, Body> pending = attachments.firstEntry();
    final String attachmentName = pending.getKey();
    getMonitor().info("Processing attachment {}", attachmentName);

    processBody(jCas, pending.getValue(), attachmentName);

    attachments.remove(attachmentName);
    return;
  }

  // no attachments left: move on to the next message
  final String rawMessage = mboxIterator.next().toString();
  count++;

  final String uri = "mbox://" + mbox + "#" + count;
  getMonitor().info("Processing message {}", uri);

  final Message parsed =
      messageBuilder.parseMessage(new ByteArrayInputStream(rawMessage.getBytes(charset)));
  final Body body = parsed.getBody();

  final boolean handled;
  if (body instanceof SingleBody) {
    handled = processBody(jCas, body, uri);
  } else if (body instanceof Multipart) {
    handled = processMultipart(jCas, (Multipart) body, uri);
  } else {
    handled = false;
  }

  // No body found (just attachments? Or invalid message?)
  if (!handled) {
    throw new IOException("No processable body found");
  }
}
 
Example #27
Source File: StreamingCollectionReader.java    From termsuite-core with Apache License 2.0 5 votes vote down vote up
/**
 * Blocks until a document is available on {@code documentQueue} and stores it in
 * {@code currentDoc}.
 *
 * @return false if the document taken is {@code CollectionDocument.LAST_DOCUMENT} or if the
 *     thread is interrupted while waiting; true otherwise
 */
@Override
public boolean hasNext() throws IOException, CollectionException {
	try {
		if (documentQueue.isEmpty()) {
			logger.info("Waiting for a new document.");
		}
		currentDoc = documentQueue.take();
		return currentDoc != CollectionDocument.LAST_DOCUMENT;
	} catch (InterruptedException e) {
		// take() clears the interrupt status when it throws; restore it so that callers further
		// up the stack can still observe the interruption.
		Thread.currentThread().interrupt();
		logger.info("Stream {} interrupted", this.streamName);
		return false;
	}
}
 
Example #28
Source File: LineReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the current {@code line} into the JCas and records its line number as metadata.
 *
 * <p>The source passed to {@code extractContent} is the file path followed by {@code #} and the
 * line number. A {@code Metadata} annotation with key {@code "lineNumber"} is added through
 * {@code getSupport()}.
 */
@Override
protected void doGetNext(JCas jCas) throws IOException, CollectionException {
  final String source = file.getPath() + "#" + lineNumber;
  final InputStream lineStream = IOUtils.toInputStream(line, Charset.defaultCharset());
  extractContent(lineStream, source, jCas);

  final Metadata lineInfo = new Metadata(jCas);
  lineInfo.setKey("lineNumber");
  lineInfo.setValue(lineNumber.toString());
  getSupport().add(lineInfo);
}
 
Example #29
Source File: SqlRowReader.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Reports whether another row id is available, asking {@code getIds(currId)} for more ids only
 * when none are currently pending.
 */
@Override
public boolean doHasNext() throws IOException, CollectionException {
  if (idsToProcess.isEmpty()) {
    // nothing buffered: try to load another batch of ids
    idsToProcess.addAll(getIds(currId));
  }
  return !idsToProcess.isEmpty();
}
 
Example #30
Source File: AbstractStreamCollectionReaderTest.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a {@code FakeStreamCollectionReader} limited to two documents reports a next
 * document exactly twice and then reports none.
 */
@Test
public void testMax() throws ResourceInitializationException, CollectionException, IOException {
  final FakeStreamCollectionReader reader = new FakeStreamCollectionReader();
  reader.setMaxDocuments(2);
  reader.doInitialize(null);

  // first document
  assertTrue(reader.doHasNext());
  reader.doGetNext(null);

  // second document
  assertTrue(reader.doHasNext());
  reader.doGetNext(null);

  // limit reached
  assertFalse(reader.doHasNext());
}