org.apache.tika.exception.TikaException Java Examples

Source File: UnpackBuilder.java From kite with Apache License 2.0

6 votes

/**
 * Passes one archive entry to the embedded extractor.
 *
 * @return {@code false} when the archive cannot read the entry's data, otherwise
 *         whatever {@code extractor.parseEmbedded} returns
 */
private boolean parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedExtractor extractor, Record record) {
  String entryName = entry.getName();
  if (!archive.canReadEntryData(entry)) {
    return false;
  }

  Record entryRecord = new Record(); // TODO: or pass myself?

  // Detectors need a mark/reset capable stream, which ArchiveInputStream
  // is not, so it is wrapped in a TikaInputStream backed by temp resources.
  TemporaryResources tempResources = new TemporaryResources();
  try {
    TikaInputStream wrapped = TikaInputStream.get(archive, tempResources);
    return extractor.parseEmbedded(wrapped, entryRecord, entryName, getChild());
  } finally {
    try {
      tempResources.dispose();
    } catch (TikaException e) {
      // Cleanup failure must not mask the parse result; just report it.
      LOG.warn("Cannot dispose of tmp Tika resources", e);
    }
  }
}

Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * Initialises the processor after dependency injection: sets the output field
 * name, loads the Tika configuration from {@code tika-config.xml} on the
 * classpath, prepares the Tesseract and ImageMagick configs (applying the
 * timeouts only when they are positive) and creates the auto-detect parser.
 *
 * @throws IOException   if the configuration resource cannot be read or closed
 * @throws SAXException  if the configuration XML cannot be parsed
 * @throws TikaException if Tika rejects the configuration
 */
@PostConstruct
public void init() throws IOException, SAXException, TikaException{
    setFieldName(tikaFieldName);

    // load tika configuration; the classpath stream is closed as soon as the
    // configuration has been read instead of being left open
    try (java.io.InputStream configStream = this.getClass().getClassLoader()
                            .getResourceAsStream("tika-config.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }

    // load tesseract ocr configuration
    tesseractConfig = new TesseractOCRConfig();
    if (tesseractTimeout > 0) {
        tesseractConfig.setTimeout(tesseractTimeout);
    }

    // load image magick configuration -- used for tiff conversion
    imgConfig = new ImageMagickConfig();
    if (convertTimeout > 0) {
        imgConfig.setTimeout(convertTimeout);
    }

    parser = new AutoDetectParser(tikaConfig);
}

Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * Runs OCR on an in-memory image by rendering it to a temporary PNG file and
 * delegating to the stream-based {@code parse} overload.
 *
 * @param image    the image to recognise text in
 * @param handler  receives the parse output
 * @param metadata document metadata passed through to the stream-based parse
 * @param context  parse context passed through to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the delegate parse reports a SAX failure
 * @throws TikaException if the delegate parse fails or temp resources cannot be disposed
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the source pixels into the buffer; without this step the PNG
        // written below would be an empty (all black) image of the right size.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the writer before the file is read back.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        // Close the reader before the temporary file is disposed of.
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        tmp.dispose();
    }

}

Source File: PrintStreamSpewerTest.java From extract with MIT License

6 votes

/**
 * Feeds a UTF-16LE encoded dollar sign (with byte order mark) through the
 * spewer and checks that the captured output starts with "$" and a newline.
 */
@Test
public void testWriteFromUTF16LE() throws IOException, TikaException {
	final ByteArrayOutputStream captured = new ByteArrayOutputStream();
	final Spewer spewer = new PrintStreamSpewer(new PrintStream(captured), new FieldNames());

	// Byte order mark FF FE followed by '$' (0x24) in little-endian order.
	final byte[] utf16le = new byte[] {(byte) 0xFF, (byte) 0xFE, 0x24, 0x00};
	final String name = "imaginary-file.txt";
	final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(utf16le), name);

	spewer.outputMetadata(false);
	TikaDocument document = factory.create(name);
	document.setReader(reader);
	spewer.write(document);

	final byte[] written = captured.toByteArray();
	Assert.assertEquals("$\n\n", captured.toString(StandardCharsets.UTF_8.name()));
	Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, Arrays.copyOfRange(written, 0, 2));
}

Source File: PrintStreamSpewerTest.java From extract with MIT License

6 votes

/**
 * Feeds a UTF-16BE encoded dollar sign (with byte order mark) through the
 * spewer and checks that the captured output starts with "$" and a newline.
 */
@Test
public void testWriteFromUTF16BE() throws IOException, TikaException {
	final ByteArrayOutputStream captured = new ByteArrayOutputStream();
	final Spewer spewer = new PrintStreamSpewer(new PrintStream(captured), new FieldNames());

	// Byte order mark FE FF followed by '$' (0x24) in big-endian order.
	final byte[] utf16be = new byte[] {(byte) 0xFE, (byte) 0xFF, 0x00, 0x24};
	final String name = "imaginary-file.txt";
	final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(utf16be), name);

	spewer.outputMetadata(false);
	TikaDocument document = factory.create(name);
	document.setReader(reader);
	spewer.write(document);

	final byte[] written = captured.toByteArray();
	Assert.assertEquals("$\n\n", captured.toString(StandardCharsets.UTF_8.name()));
	Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, Arrays.copyOfRange(written, 0, 2));
}

Source File: PrintStreamSpewerTest.java From extract with MIT License

6 votes

/**
 * Writes a single dollar sign with the spewer's output encoding set to
 * UTF-16LE and checks the first four bytes of the captured output.
 */
@Test
public void testWriteToUTF16LE() throws IOException, TikaException {
	final ByteArrayOutputStream captured = new ByteArrayOutputStream();
	final Spewer spewer = new PrintStreamSpewer(new PrintStream(captured), new FieldNames());

	// The file contents are a single dollar sign ($).
	final String contents = "\u0024";
	final String name = "imaginary-file.txt";

	// Tika parsers always output UTF-8.
	final byte[] utf8 = contents.getBytes(StandardCharsets.UTF_8);
	final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(utf8), name);

	spewer.outputMetadata(false);
	spewer.setOutputEncoding(StandardCharsets.UTF_16LE);
	TikaDocument document = factory.create("test-file");
	document.setReader(reader);
	spewer.write(document);

	// '$' then '\n', each as two little-endian bytes.
	final byte[] firstFour = Arrays.copyOfRange(captured.toByteArray(), 0, 4);
	Assert.assertArrayEquals(new byte[] {0x24, 0x00, 0x0A, 0x00}, firstFour);
}

Source File: PrintStreamSpewerTest.java From extract with MIT License

6 votes

/**
 * Writes a single dollar sign with the spewer's output encoding set to
 * UTF-16BE and checks the first four bytes of the captured output.
 */
@Test
public void testWriteToUTF16BE() throws IOException, TikaException {
	final ByteArrayOutputStream captured = new ByteArrayOutputStream();
	final Spewer spewer = new PrintStreamSpewer(new PrintStream(captured), new FieldNames());

	// The file contents are a single dollar sign ($).
	final String contents = "\u0024";
	final String name = "imaginary-file.txt";

	// Tika parsers always output UTF-8.
	final byte[] utf8 = contents.getBytes(StandardCharsets.UTF_8);
	final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(utf8), name);

	spewer.outputMetadata(false);
	spewer.setOutputEncoding(StandardCharsets.UTF_16BE);
	TikaDocument document = factory.create("test-file");
	document.setReader(reader);
	spewer.write(document);

	// '$' then '\n', each as two big-endian bytes.
	final byte[] firstFour = Arrays.copyOfRange(captured.toByteArray(), 0, 4);
	Assert.assertArrayEquals(new byte[] {0x00, 0x24, 0x00, 0x0A}, firstFour);
}

Source File: ExtractorTest.java From extract with MIT License

6 votes

/**
 * Extracting a garbage binary must yield a document whose reader fails with
 * an IOException caused by a TikaException ("Parse error"), while the
 * detected content type is still recorded as application/octet-stream.
 */
@Test
public void testGarbage() throws Throwable {
	final Extractor extractor = new Extractor();

	// Resolve the fixture through its URI: URL.getPath() leaves characters such
	// as spaces percent-encoded, which would produce a non-existent file path.
	TikaDocument tikaDocument = extractor.extract(
			Paths.get(getClass().getResource("/documents/garbage.bin").toURI()));

	thrown.expect(IOException.class);
	thrown.expectMessage("");
	thrown.expectCause(new CauseMatcher(TikaException.class, "Parse error"));

	final int read;

	try (final Reader reader = tikaDocument.getReader()) {
		read = reader.read();
	} catch (IOException e) {
		Assert.assertEquals("application/octet-stream", tikaDocument.getMetadata().get(Metadata.CONTENT_TYPE));
		throw e;
	}

	// Only reached when the read unexpectedly succeeded.
	Assert.fail(String.format("Read \"%d\" while expecting exception.", read));
}

Source File: TikaContentExtractor.java From baleen with Apache License 2.0

6 votes

/**
 * Parses the stream with Tika's auto-detecting parser, stores the extracted
 * body text as the JCas document text and copies every Tika metadata entry
 * via {@code addMetadata}. SAX and Tika failures are logged as warnings; if
 * no document text has been set by then, a corrupt-file placeholder is used.
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata tikaMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();

    new AutoDetectParser().parse(stream, bodyHandler, tikaMetadata, parseContext);

    jCas.setDocumentText(bodyHandler.toString());

    for (String key : tikaMetadata.names()) {
      addMetadata(jCas, key, tikaMetadata.get(key));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    boolean noTextYet = Strings.isNullOrEmpty(jCas.getDocumentText());
    if (noTextYet) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}

Source File: TikaProcessorTest.java From jesterj with Apache License 2.0

6 votes

/**
 * Processing a document whose raw data is null must complete without error.
 */
@Test
public void testEmptyDoc() throws ParserConfigurationException, IOException, SAXException, TikaException {
  DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  org.w3c.dom.Document config = xmlBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

  TikaProcessor proc = new TikaProcessor.Builder()
      .named("foo")
      .appendingSuffix("_tk")
      .truncatingTextTo(20)
      .configuredWith(config)
      .build();
  expect(mockDocument.getRawData()).andReturn(null).anyTimes();

  replay();
  proc.processDocument(mockDocument);
}

Source File: TikaExtractor.java From ache with Apache License 2.0

6 votes

/**
 * Parses the stream and returns the body text together with all metadata
 * entries reported by Tika.
 *
 * @return the parsed data, or {@code null} when parsing fails with an
 *         IOException, SAXException or TikaException (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    Metadata tikaMetadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();
    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        // Flatten the Tika metadata into a plain name -> value map.
        Map<String, String> properties = new HashMap<>();
        for (String key : tikaMetadata.names()) {
            properties.put(key, tikaMetadata.get(key));
        }

        String text = bodyHandler.toString();
        return new ParsedData(text, properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}

Source File: DirectoryManifest.java From genie with Apache License 2.0

6 votes

/**
 * Creates a visitor and its default Tika configuration.
 *
 * @throws IOException wrapping the TikaException if the Tika configuration
 *                     cannot be created
 */
ManifestVisitor(
    final Path root,
    final ImmutableMap.Builder<String, ManifestEntry> builder,
    final boolean checksumFiles,
    final Filter filter
) throws IOException {
    this.metadata = new Metadata();
    this.filter = filter;
    this.checksumFiles = checksumFiles;
    this.builder = builder;
    this.root = root;
    try {
        this.tikaConfig = new TikaConfig();
    } catch (final TikaException tikaFailure) {
        // Surface configuration problems through the constructor's declared exception type.
        log.error("Unable to create Tika Configuration due to error", tikaFailure);
        throw new IOException(tikaFailure);
    }
}

Source File: NodeTika.java From node-tika with MIT License

6 votes

/**
 * Detects the content type of the resource at the given URI.
 *
 * @param uri location of the resource; its file name is passed to the
 *            detector as a type hint
 * @return the detected content type, or the octet-stream default when the
 *         detector yields nothing usable
 * @throws IOException   if the resource cannot be opened, read or closed
 * @throws TikaException declared for callers; propagated from the helpers used here
 */
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final TikaInputStream inputStream = createInputStream(uri);
	final String contentType;

	// Close the stream even when detection throws.
	try {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type. Guard against a null result before calling
		// toString() so that the default below can actually be reached.
		final Object detected = detector.detect(inputStream, metadata);
		contentType = (detected == null) ? null : detected.toString();
	} finally {
		inputStream.close();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	return contentType;
}

Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0

6 votes

/**
 * Parses the stream with the plain-text parser and returns the collected
 * text. Output is capped by {@code maxStringLength}; hitting that cap is not
 * an error. The stream is always closed before returning.
 *
 * @throws TikaException if a SAX failure occurs that is not the write limit
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler writeOut = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		BodyContentHandler bodyHandler = new BodyContentHandler(writeOut);
		txtParser.parse(stream, bodyHandler, metadata, parseContext);
	} catch (SAXException saxFailure) {
		boolean limitReached = writeOut.isWriteLimitReached(saxFailure);
		if (!limitReached) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", saxFailure);
		}
	} finally {
		stream.close();
	}
	return writeOut.toString();
}

Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0

6 votes

/**
 * Extracts the textual content of a file. Files probed as plain text go
 * through the dedicated text parser; everything else goes through Tika.
 *
 * @param file the file to read; must be non-null and exist
 * @return the extracted content
 * @throws JATEException if the file is missing, or if type detection or
 *                       content extraction fails
 */
public String extractContent(File file) throws JATEException {
	String content = "";
	if (file == null || !file.exists()) {
		throw new JATEException("File is not found!");
	}

	try {
		String contentType = Files.probeContentType(file.toPath());

		if (MediaType.TEXT_PLAIN.getBaseType().toString().equals(contentType)) {
			content = parseTXTToString(file);
		} else {
			content = tika.parseToString(file);
		}
	} catch (IOException e1) {
		// Include the underlying failure, as the Tika branch below does, so the
		// cause is not lost (the IOException may also come from parsing).
		throw new JATEException("I/O exception when detecting file type. " + e1.toString());
	} catch (TikaException tikaEx) {
		throw new JATEException("Tika Content extraction exception: " + tikaEx.toString());
	}

	return content;
}

Source File: TikaProcessorTest.java From jesterj with Apache License 2.0

6 votes

/**
 * An AccessControlException thrown while fetching the raw data must not
 * propagate out of {@code processDocument}.
 */
@Test
public void testExceptionToIgnoreFromTika() throws ParserConfigurationException, IOException, SAXException, TikaException {
  DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  org.w3c.dom.Document config = xmlBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

  TikaProcessor proc = new TikaProcessor.Builder()
      .named("foo")
      .appendingSuffix("_tk")
      .truncatingTextTo(20)
      .configuredWith(config)
      .build();
  expect(mockDocument.getRawData()).andThrow(new AccessControlException("Oh no you don't!"));

  replay();
  proc.processDocument(mockDocument);
}

Source File: TikaProcessorTest.java From jesterj with Apache License 2.0

6 votes

/**
 * A plain RuntimeException thrown while fetching the raw data must propagate
 * out of {@code processDocument}.
 */
@Test(expected = RuntimeException.class)
public void testRandomException() throws ParserConfigurationException, IOException, SAXException, TikaException {
  DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  org.w3c.dom.Document config = xmlBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

  TikaProcessor proc = new TikaProcessor.Builder()
      .named("foo")
      .appendingSuffix("_tk")
      .truncatingTextTo(20)
      .configuredWith(config)
      .build();
  expect(mockDocument.getRawData()).andThrow(new RuntimeException());

  replay();
  proc.processDocument(mockDocument);
}

Source File: TikaProcessorTest.java From jesterj with Apache License 2.0

6 votes

/**
 * Processing an XML document must replace the raw data with the extracted
 * text truncated to 20 characters and record the parser and content type
 * on the document.
 */
@Test
public void testXml() throws ParserConfigurationException, IOException, SAXException, TikaException {
  DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  org.w3c.dom.Document config = xmlBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

  TikaProcessor proc = new TikaProcessor.Builder()
      .named("foo")
      .truncatingTextTo(20)
      .configuredWith(config)
      .build();

  expect(mockDocument.getRawData()).andReturn(XML.getBytes()).anyTimes();
  // The expected text is exactly 20 characters long, matching truncatingTextTo(20).
  mockDocument.setRawData(aryEq("   The title This is".getBytes()));
  expect(mockDocument.put("X_Parsed_By", "org.apache.tika.parser.CompositeParser")).andReturn(true);
  expect(mockDocument.put("Content_Type", "application/xml")).andReturn(true);

  replay();
  proc.processDocument(mockDocument);
}

Source File: TikaAnalysis.java From tutorials with MIT License

5 votes

/**
 * Extracts the body text of a stream using an auto-detecting parser.
 *
 * @param stream the content to parse
 * @return the text collected by the body content handler
 */
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    ContentHandler bodyHandler = new BodyContentHandler();
    Parser autoParser = new AutoDetectParser();

    autoParser.parse(stream, bodyHandler, new Metadata(), new ParseContext());

    String text = bodyHandler.toString();
    return text;
}

Source File: TikaProcessorTest.java From jesterj with Apache License 2.0

5 votes

/**
 * Building a processor from an invalid configuration document must fail
 * with a TikaException.
 */
@Test(expected = TikaException.class)
public void testBadConfig() throws ParserConfigurationException, IOException, SAXException, TikaException {
  DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  org.w3c.dom.Document badConfig =
      xmlBuilder.parse(new ByteArrayInputStream(XML_CONFIG_BAD.getBytes("UTF-8")));
  replay();

  new TikaProcessor.Builder()
      .named("foo")
      .appendingSuffix("_tk")
      .truncatingTextTo(20)
      .configuredWith(badConfig)
      .build();
}

Source File: TikaProcessorTest.java From jesterj with Apache License 2.0

5 votes

/**
 * Processing a document with broken XML must mark the document with
 * {@code Status.ERROR}.
 */
@Test
public void testBadDoc() throws ParserConfigurationException, IOException, SAXException, TikaException {
  DocumentBuilder xmlBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
  org.w3c.dom.Document config = xmlBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

  TikaProcessor proc = new TikaProcessor.Builder()
      .named("foo")
      .appendingSuffix("_tk")
      .truncatingTextTo(20)
      .configuredWith(config)
      .build();

  expect(mockDocument.getRawData()).andReturn(XML_BROKEN.getBytes()).anyTimes();
  mockDocument.setStatus(Status.ERROR);
  replay();

  proc.processDocument(mockDocument);
}

Source File: JATEUtil.java From jate with GNU Lesser General Public License v3.0

5 votes

/**
 * Extracts plain text from a stream using an auto-detecting parser.
 *
 * @param fileStream the content to parse
 * @return the extracted text, or an empty string when parsing fails (the
 *         failure is logged at debug level)
 */
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler bodyHandler = new BodyContentHandler();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata tikaMetadata = new Metadata();

    try {
        autoParser.parse(fileStream, bodyHandler, tikaMetadata);
        return bodyHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. "
                + e.toString());
        return "";
    }
}

Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0

5 votes

/**
 * Opens the file as a Tika input stream and delegates to the stream-based
 * overload; the stream is closed here once the delegate returns or throws.
 */
private String parseTXTToString(File file) throws IOException, TikaException {
	Metadata fileMetadata = new Metadata();
	InputStream fileStream = TikaInputStream.get(file.toPath(), fileMetadata);
	String text;
	try {
		text = parseTXTToString(fileStream, fileMetadata);
	} finally {
		fileStream.close();
	}
	return text;
}

Source File: TikaOfficeDetectParser.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Convenience overload that parses with a newly created, empty
 * {@code ParseContext}.
 *
 * @deprecated This method will be removed in Apache Tika 1.0.
 */
public void parse(InputStream stream,
      ContentHandler handler, Metadata metadata)
      throws IOException, SAXException, TikaException 
{
   ParseContext emptyContext = new ParseContext();
   parse(stream, handler, metadata, emptyContext);
}

Source File: TikaOfficeDetectParser.java From alfresco-repository with GNU Lesser General Public License v3.0

5 votes

/**
 * Peeks at the first four bytes of the stream to pick a parser: when they
 * equal the OOXML file header the OOXML parser is used, otherwise the OLE2
 * parser. The peeked bytes are restored before the chosen parser runs.
 */
public void parse(InputStream stream,
      ContentHandler handler, Metadata metadata,
      ParseContext parseContext) throws IOException, SAXException,
      TikaException 
{
   byte[] header = new byte[4];
   InputStream replayable;
   if (!stream.markSupported())
   {
      // No mark/reset: push the peeked bytes back through a wrapper instead
      PushbackInputStream pushback = new PushbackInputStream(stream, 4);
      IOUtils.readFully(pushback, header);
      pushback.unread(header);
      replayable = pushback;
   }
   else
   {
      // Preserve TikaInputStreams as TikaInputStreams as they require less memory to process
      stream.mark(header.length);
      IOUtils.readFully(stream, header);
      stream.reset();
      replayable = stream;
   }
   
   // Which is it? Compare byte by byte, stopping at the first mismatch
   boolean ooxmlHeader = true;
   for (int i = 0; i < header.length; i++)
   {
      if (header[i] != POIFSConstants.OOXML_FILE_HEADER[i])
      {
         ooxmlHeader = false;
         break;
      }
   }

   if (ooxmlHeader)
   {
      ooxmlParser.parse(replayable, handler, metadata, parseContext);
   }
   else
   {
      ole2Parser.parse(replayable, handler, metadata, parseContext);
   }
}

Source File: NodeTika.java From node-tika with MIT License

5 votes

/**
 * Detects the content type and character set of the resource at the given URI.
 *
 * @param uri location of the resource; its file name is passed to the
 *            detector as a type hint
 * @return the octet-stream default when no content type was determined;
 *         otherwise the content type, with {@code "; charset=<name>"}
 *         appended when a charset was determined
 * @throws IOException   if the resource cannot be opened, read or closed
 * @throws TikaException if charset detection fails
 */
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final TikaInputStream inputStream = createInputStream(uri);
	final String contentType;
	final String charset;

	// Close the stream even when type or charset detection throws.
	try {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();

		// Use metadata to provide type-hinting to the AutoDetectReader.
		fillMetadata(metadata, contentType, uri);

		// Detect the character set.
		final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata);
		charset = reader.getCharset().toString();
	} finally {
		inputStream.close();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	// Append the charset if the content-type was determined.
	if (charset != null && !charset.isEmpty()) {
		return contentType + "; charset=" + charset;
	}

	return contentType;
}

Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0

5 votes

@Override
public JSONObject toJson(String filePath) throws OperationException {

    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        parser.parse(stream, handler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = handler.toString();
    if(fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    JSONObject jsonObject = new JSONObject();
    jsonObject.put("_txt", fileText);

    String[] metadataNames = metadata.names();
    for(String name : metadataNames) {
        jsonObject.put(name, metadata.get(name));
    }

    return jsonObject;
}

Source File: TikaUnitTest.java From tutorials with MIT License

5 votes

@Test
public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
    InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.xlsx");
    Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);

    assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
    assertEquals("Microsoft Office User", metadata.get("Author"));

    stream.close();
}

Source File: TikaUnitTest.java From tutorials with MIT License

5 votes

@Test
public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
    InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.xlsx");
    Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);

    assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
    assertEquals("Microsoft Office User", metadata.get("Author"));

    stream.close();
}

Source File: TikaUnitTest.java From tutorials with MIT License

5 votes

@Test
public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
    InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.docx");
    String content = TikaAnalysis.extractContentUsingFacade(stream);

    assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
    assertThat(content, containsString("detects and extracts metadata and text"));

    stream.close();
}

org.apache.tika.exception.TikaException Java Examples