org.apache.tika.exception.TikaException Java Examples
The following examples show how to use
org.apache.tika.exception.TikaException.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: UnpackBuilder.java From kite with Apache License 2.0 | 6 votes |
private boolean parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedExtractor extractor, Record record) { String name = entry.getName(); if (archive.canReadEntryData(entry)) { Record entrydata = new Record(); // TODO: or pass myself? //Record entrydata = record.copy(); // For detectors to work, we need a mark/reset supporting // InputStream, which ArchiveInputStream isn't, so wrap TemporaryResources tmp = new TemporaryResources(); try { TikaInputStream tis = TikaInputStream.get(archive, tmp); return extractor.parseEmbedded(tis, entrydata, name, getChild()); } finally { try { tmp.dispose(); } catch (TikaException e) { LOG.warn("Cannot dispose of tmp Tika resources", e); } } } else { return false; } }
Example #2
Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
@PostConstruct public void init() throws IOException, SAXException, TikaException{ setFieldName(tikaFieldName); // load tika configuration tikaConfig = new TikaConfig(this.getClass().getClassLoader() .getResourceAsStream("tika-config.xml")); // load tesseract ocr configuration tesseractConfig = new TesseractOCRConfig(); if (tesseractTimeout > 0) { tesseractConfig.setTimeout(tesseractTimeout); } // load image magick configuration -- used for tiff conversion imgConfig = new ImageMagickConfig(); if (convertTimeout > 0) { imgConfig.setTimeout(convertTimeout); } parser = new AutoDetectParser(tikaConfig); }
Example #3
Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
/**
 * Runs OCR on an in-memory image by rendering it to a temporary PNG file and delegating
 * to the stream-based {@code parse} overload.
 *
 * @param image    the image to recognise
 * @param handler  receives the extracted content
 * @param metadata receives the extracted metadata
 * @param context  the parse context passed through to the stream-based overload
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the handler fails
 * @throws TikaException if parsing fails or the temporary resources cannot be disposed
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the source pixels into the buffer; without this the PNG written
        // below is an empty (all-black) canvas and OCR sees nothing.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the writer before the file is read back so all bytes are on disk.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Dispose last, after every stream on the temporary file has been closed.
        tmp.dispose();
    }
}
Example #4
Source File: PrintStreamSpewerTest.java From extract with MIT License | 6 votes |
/**
 * Feeds a UTF-16LE document (byte-order mark followed by a dollar sign) through the
 * spewer and checks that the printed output starts with the dollar sign and a newline.
 */
@Test
public void testWriteFromUTF16LE() throws IOException, TikaException {
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final Spewer spewer = new PrintStreamSpewer(new PrintStream(sink), new FieldNames());

    // FF FE is the little-endian byte-order mark; 24 00 is '$'.
    final byte[] utf16le = {(byte) 0xFF, (byte) 0xFE, 0x24, 0x00};
    final String fileName = "imaginary-file.txt";
    final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(utf16le), fileName);

    spewer.outputMetadata(false);

    final TikaDocument document = factory.create(fileName);
    document.setReader(reader);
    spewer.write(document);

    final String printed = sink.toString(StandardCharsets.UTF_8.name());
    Assert.assertEquals("$\n\n", printed);

    final byte[] firstTwoBytes = Arrays.copyOfRange(sink.toByteArray(), 0, 2);
    Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, firstTwoBytes);
}
Example #5
Source File: PrintStreamSpewerTest.java From extract with MIT License | 6 votes |
/**
 * Feeds a UTF-16BE document (byte-order mark followed by a dollar sign) through the
 * spewer and checks that the printed output starts with the dollar sign and a newline.
 */
@Test
public void testWriteFromUTF16BE() throws IOException, TikaException {
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final Spewer spewer = new PrintStreamSpewer(new PrintStream(sink), new FieldNames());

    // FE FF is the big-endian byte-order mark; 00 24 is '$'.
    final byte[] utf16be = {(byte) 0xFE, (byte) 0xFF, 0x00, 0x24};
    final String fileName = "imaginary-file.txt";
    final ParsingReader reader = new ParsingReader(new ByteArrayInputStream(utf16be), fileName);

    spewer.outputMetadata(false);

    final TikaDocument document = factory.create(fileName);
    document.setReader(reader);
    spewer.write(document);

    final String printed = sink.toString(StandardCharsets.UTF_8.name());
    Assert.assertEquals("$\n\n", printed);

    final byte[] firstTwoBytes = Arrays.copyOfRange(sink.toByteArray(), 0, 2);
    Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, firstTwoBytes);
}
Example #6
Source File: PrintStreamSpewerTest.java From extract with MIT License | 6 votes |
@Test public void testWriteToUTF16LE() throws IOException, TikaException { final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final PrintStream printStream = new PrintStream(outputStream); final Spewer spewer = new PrintStreamSpewer(printStream, new FieldNames()); // Declare file contents of a single dollar sign ($). final String buffer = "\u0024"; final String name = "imaginary-file.txt"; // Tika parsers always output UTF-8. final InputStream inputStream = new ByteArrayInputStream(buffer.getBytes(StandardCharsets.UTF_8)); final ParsingReader reader = new ParsingReader(inputStream, name); spewer.outputMetadata(false); spewer.setOutputEncoding(StandardCharsets.UTF_16LE); TikaDocument document = factory.create("test-file"); document.setReader(reader); spewer.write(document); Assert.assertArrayEquals(new byte[] {0x24, 0x00, 0x0A, 0x00}, Arrays.copyOfRange(outputStream.toByteArray(), 0, 4)); }
Example #7
Source File: PrintStreamSpewerTest.java From extract with MIT License | 6 votes |
@Test public void testWriteToUTF16BE() throws IOException, TikaException { final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final PrintStream printStream = new PrintStream(outputStream); final Spewer spewer = new PrintStreamSpewer(printStream, new FieldNames()); // Declare file contents of a single dollar sign ($). final String buffer = "\u0024"; final String name = "imaginary-file.txt"; // Tika parsers always output UTF-8. final InputStream inputStream = new ByteArrayInputStream(buffer.getBytes(StandardCharsets.UTF_8)); final ParsingReader reader = new ParsingReader(inputStream, name); spewer.outputMetadata(false); spewer.setOutputEncoding(StandardCharsets.UTF_16BE); TikaDocument document = factory.create("test-file"); document.setReader(reader); spewer.write(document); Assert.assertArrayEquals(new byte[] {0x00, 0x24, 0x00, 0x0A}, Arrays.copyOfRange(outputStream.toByteArray(), 0, 4)); }
Example #8
Source File: ExtractorTest.java From extract with MIT License | 6 votes |
/**
 * Extracts a garbage binary and expects the first read from the document's reader to
 * fail with an {@code IOException} caused by a {@code TikaException} ("Parse error").
 * The detected content type is checked before the exception is rethrown.
 */
@Test
public void testGarbage() throws Throwable {
    final Extractor extractor = new Extractor();
    final String garbagePath = getClass().getResource("/documents/garbage.bin").getPath();
    final TikaDocument tikaDocument = extractor.extract(Paths.get(garbagePath));

    thrown.expect(IOException.class);
    thrown.expectMessage("");
    thrown.expectCause(new CauseMatcher(TikaException.class, "Parse error"));

    final int firstChar;
    try (final Reader reader = tikaDocument.getReader()) {
        firstChar = reader.read();
    } catch (IOException e) {
        final String detectedType = tikaDocument.getMetadata().get(Metadata.CONTENT_TYPE);
        Assert.assertEquals("application/octet-stream", detectedType);
        throw e;
    }

    // Only reached when the read unexpectedly succeeds.
    Assert.fail(String.format("Read \"%d\" while expecting exception.", firstChar));
}
Example #9
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Parses the stream with an auto-detecting Tika parser, stores the extracted body text
 * as the document text and copies every Tika metadata entry onto the JCas.
 *
 * <p>SAX and Tika failures are logged as warnings; if the document text is still null
 * or empty at that point it is set to {@code CORRUPT_FILE_TEXT}.
 *
 * @param stream the content to parse
 * @param source a description of where the content came from, used in the warning
 * @param jCas   the CAS to populate
 * @throws IOException if the superclass processing or reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        final BodyContentHandler bodyHandler = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata tikaMetadata = new Metadata();
        final ParseContext parseContext = new ParseContext();

        new AutoDetectParser().parse(stream, bodyHandler, tikaMetadata, parseContext);

        jCas.setDocumentText(bodyHandler.toString());

        for (String key : tikaMetadata.names()) {
            addMetadata(jCas, key, tikaMetadata.get(key));
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);

        final boolean noText = Strings.isNullOrEmpty(jCas.getDocumentText());
        if (noText) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
Example #10
Source File: TikaProcessorTest.java From jesterj with Apache License 2.0 | 6 votes |
/**
 * Processes a mock document whose raw data is {@code null}; the test passes when
 * {@code processDocument} completes without throwing.
 */
@Test
public void testEmptyDoc() throws ParserConfigurationException, IOException, SAXException, TikaException {
    DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    org.w3c.dom.Document tikaConfigDoc =
            domBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

    TikaProcessor.Builder procBuilder = new TikaProcessor.Builder();
    TikaProcessor proc = procBuilder
            .named("foo")
            .appendingSuffix("_tk")
            .truncatingTextTo(20)
            .configuredWith(tikaConfigDoc)
            .build();

    expect(mockDocument.getRawData()).andReturn(null).anyTimes();
    replay();

    proc.processDocument(mockDocument);
}
Example #11
Source File: TikaExtractor.java From ache with Apache License 2.0 | 6 votes |
/**
 * Extracts text and metadata from a stream using Tika.
 *
 * @param stream      the content to parse
 * @param fileName    passed to {@code createMetadata}
 * @param contentType passed to {@code createMetadata}
 * @return the extracted text together with a name-to-value metadata map, or
 *         {@code null} when parsing fails (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    final BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    final BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    final Metadata tikaMetadata = createMetadata(fileName, contentType);
    final ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }

    // Flatten the Tika metadata into a plain map (first value per name).
    final Map<String, String> properties = new HashMap<>();
    for (String key : tikaMetadata.names()) {
        properties.put(key, tikaMetadata.get(key));
    }
    return new ParsedData(bodyHandler.toString(), properties);
}
Example #12
Source File: DirectoryManifest.java From genie with Apache License 2.0 | 6 votes |
/**
 * Creates a visitor that records manifest entries into the given builder.
 *
 * @param root          stored as the visitor's root path
 * @param builder       the map builder that receives the manifest entries
 * @param checksumFiles stored flag controlling file checksumming
 * @param filter        stored filter
 * @throws IOException if the Tika configuration cannot be created; the original
 *                     {@code TikaException} is attached as the cause
 */
ManifestVisitor(
    final Path root,
    final ImmutableMap.Builder<String, ManifestEntry> builder,
    final boolean checksumFiles,
    final Filter filter
) throws IOException {
    this.root = root;
    this.builder = builder;
    this.checksumFiles = checksumFiles;
    this.filter = filter;
    this.metadata = new Metadata();

    try {
        this.tikaConfig = new TikaConfig();
    } catch (final TikaException configFailure) {
        // Surface configuration problems through the visitor's IOException contract.
        log.error("Unable to create Tika Configuration due to error", configFailure);
        throw new IOException(configFailure);
    }
}
Example #13
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } return contentType; }
Example #14
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, txtParser); txtParser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
Example #15
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Extracts the textual content of a file. Files probed as plain text go through
 * {@code parseTXTToString}; everything else goes through {@code tika.parseToString}.
 *
 * @param file the file to read
 * @return the extracted content
 * @throws JATEException if {@code file} is null or does not exist, or if an I/O or Tika
 *                       failure occurs; the message includes the underlying exception's
 *                       description so the root cause is not lost
 */
public String extractContent(File file) throws JATEException {
    if (file == null || !file.exists()) {
        throw new JATEException("File is not found!");
    }

    try {
        String contentType = Files.probeContentType(file.toPath());
        if (MediaType.TEXT_PLAIN.getBaseType().toString().equals(contentType)) {
            return parseTXTToString(file);
        }
        return tika.parseToString(file);
    } catch (IOException e1) {
        // Keep the underlying failure in the message, matching the Tika branch below;
        // previously the I/O cause was dropped entirely.
        throw new JATEException("I/O exception when detecting file type. " + e1.toString());
    } catch (TikaException tikaEx) {
        throw new JATEException("Tika Content extraction exception: " + tikaEx.toString());
    }
}
Example #16
Source File: TikaProcessorTest.java From jesterj with Apache License 2.0 | 6 votes |
/**
 * Makes the mock document throw an {@code AccessControlException} from
 * {@code getRawData()}; the test passes when {@code processDocument} does not
 * propagate it.
 */
@Test
public void testExceptionToIgnoreFromTika() throws ParserConfigurationException, IOException, SAXException, TikaException {
    DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    org.w3c.dom.Document tikaConfigDoc =
            domBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

    TikaProcessor.Builder procBuilder = new TikaProcessor.Builder();
    TikaProcessor proc = procBuilder
            .named("foo")
            .appendingSuffix("_tk")
            .truncatingTextTo(20)
            .configuredWith(tikaConfigDoc)
            .build();

    expect(mockDocument.getRawData()).andThrow(new AccessControlException("Oh no you don't!"));
    replay();

    proc.processDocument(mockDocument);
}
Example #17
Source File: TikaProcessorTest.java From jesterj with Apache License 2.0 | 6 votes |
/**
 * Makes the mock document throw a plain {@code RuntimeException} from
 * {@code getRawData()} and expects it to propagate out of {@code processDocument}.
 */
@Test(expected = RuntimeException.class)
public void testRandomException() throws ParserConfigurationException, IOException, SAXException, TikaException {
    DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    org.w3c.dom.Document tikaConfigDoc =
            domBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

    TikaProcessor.Builder procBuilder = new TikaProcessor.Builder();
    TikaProcessor proc = procBuilder
            .named("foo")
            .appendingSuffix("_tk")
            .truncatingTextTo(20)
            .configuredWith(tikaConfigDoc)
            .build();

    expect(mockDocument.getRawData()).andThrow(new RuntimeException());
    replay();

    proc.processDocument(mockDocument);
}
Example #18
Source File: TikaProcessorTest.java From jesterj with Apache License 2.0 | 6 votes |
@Test public void testXml() throws ParserConfigurationException, IOException, SAXException, TikaException { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); ByteArrayInputStream input = new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")); org.w3c.dom.Document doc = builder.parse(input); TikaProcessor proc = new TikaProcessor.Builder().named("foo").truncatingTextTo(20) .configuredWith(doc) .build(); //System.out.println(new String(new byte[] {32, 32, 32, 84, 104, 101, 32, 116, 105, 116, 108, 101, 32, 84, 104, 105, 115, 32, 105, 115})); expect(mockDocument.getRawData()).andReturn(XML.getBytes()).anyTimes(); mockDocument.setRawData(aryEq(" The title This is".getBytes())); expect(mockDocument.put("X_Parsed_By", "org.apache.tika.parser.CompositeParser")).andReturn(true); expect(mockDocument.put("Content_Type", "application/xml")).andReturn(true); replay(); proc.processDocument(mockDocument); }
Example #19
Source File: TikaAnalysis.java From tutorials with MIT License | 5 votes |
/**
 * Extracts the body text of a document using an auto-detecting Tika parser.
 *
 * @param stream the document content
 * @return the text collected by the body content handler
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if the document cannot be parsed
 * @throws SAXException  if the content handler fails
 */
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    final ContentHandler bodyHandler = new BodyContentHandler();
    final Parser autoParser = new AutoDetectParser();

    autoParser.parse(stream, bodyHandler, new Metadata(), new ParseContext());

    return bodyHandler.toString();
}
Example #20
Source File: TikaProcessorTest.java From jesterj with Apache License 2.0 | 5 votes |
/**
 * Builds a processor from the deliberately broken configuration and expects the
 * build to fail with a {@code TikaException}.
 */
@Test(expected = TikaException.class)
public void testBadConfig() throws ParserConfigurationException, IOException, SAXException, TikaException {
    DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    org.w3c.dom.Document badConfigDoc =
            domBuilder.parse(new ByteArrayInputStream(XML_CONFIG_BAD.getBytes("UTF-8")));

    replay();

    TikaProcessor.Builder procBuilder = new TikaProcessor.Builder();
    procBuilder
            .named("foo")
            .appendingSuffix("_tk")
            .truncatingTextTo(20)
            .configuredWith(badConfigDoc)
            .build();
}
Example #21
Source File: TikaProcessorTest.java From jesterj with Apache License 2.0 | 5 votes |
/**
 * Processes a mock document containing broken XML and expects the processor to
 * mark the document with {@code Status.ERROR}.
 */
@Test
public void testBadDoc() throws ParserConfigurationException, IOException, SAXException, TikaException {
    DocumentBuilder domBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    org.w3c.dom.Document tikaConfigDoc =
            domBuilder.parse(new ByteArrayInputStream(XML_CONFIG.getBytes("UTF-8")));

    TikaProcessor.Builder procBuilder = new TikaProcessor.Builder();
    TikaProcessor proc = procBuilder
            .named("foo")
            .appendingSuffix("_tk")
            .truncatingTextTo(20)
            .configuredWith(tikaConfigDoc)
            .build();

    expect(mockDocument.getRawData()).andReturn(XML_BROKEN.getBytes()).anyTimes();
    mockDocument.setStatus(Status.ERROR);
    replay();

    proc.processDocument(mockDocument);
}
Example #22
Source File: JATEUtil.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Extracts plain text from a stream using an auto-detecting Tika parser.
 *
 * @param fileStream the content to parse
 * @return the extracted text, or an empty string when parsing fails (the failure is
 *         logged at debug level)
 */
public static String parseToPlainText(InputStream fileStream) {
    final BodyContentHandler bodyHandler = new BodyContentHandler();
    final AutoDetectParser autoParser = new AutoDetectParser();
    final Metadata tikaMetadata = new Metadata();

    try {
        autoParser.parse(fileStream, bodyHandler, tikaMetadata);
        return bodyHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. " + e.toString());
        return "";
    }
}
Example #23
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Opens the file as a Tika input stream and parses it as plain text.
 *
 * @param file the text file to read
 * @return the text returned by the stream-based {@code parseTXTToString} overload
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws TikaException if parsing fails
 */
private String parseTXTToString(File file) throws IOException, TikaException {
    Metadata metadata = new Metadata();
    // try-with-resources keeps a parse failure as the primary exception; with a manual
    // finally block a failing close() would replace it and hide the real cause.
    try (InputStream stream = TikaInputStream.get(file.toPath(), metadata)) {
        return parseTXTToString(stream, metadata);
    }
}
Example #24
Source File: TikaOfficeDetectParser.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Parses the stream using a freshly created, empty {@link ParseContext}.
 *
 * @param stream   the document content
 * @param handler  receives the parsed content
 * @param metadata receives the document metadata
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler fails
 * @throws TikaException if the document cannot be parsed
 * @deprecated This method will be removed in Apache Tika 1.0. Use
 *             {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} instead.
 */
@Deprecated
public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    parse(stream, handler, metadata, new ParseContext());
}
Example #25
Source File: TikaOfficeDetectParser.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { byte[] initial4 = new byte[4]; InputStream wrapped; // Preserve TikaInputStreams as TikaInputStreams as they require less memory to process if (stream.markSupported()) { stream.mark(initial4.length); IOUtils.readFully(stream, initial4); stream.reset(); wrapped = stream; } else { PushbackInputStream inp = new PushbackInputStream(stream, 4); IOUtils.readFully(inp, initial4); inp.unread(initial4); wrapped = inp; } // Which is it? if(initial4[0] == POIFSConstants.OOXML_FILE_HEADER[0] && initial4[1] == POIFSConstants.OOXML_FILE_HEADER[1] && initial4[2] == POIFSConstants.OOXML_FILE_HEADER[2] && initial4[3] == POIFSConstants.OOXML_FILE_HEADER[3]) { ooxmlParser.parse(wrapped, handler, metadata, parseContext); } else { ole2Parser.parse(wrapped, handler, metadata, parseContext); } }
Example #26
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); // Use metadata to provide type-hinting to the AutoDetectReader. fillMetadata(metadata, contentType, uri); // Detect the character set. final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata); String charset = reader.getCharset().toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } // Append the charset if the content-type was determined. if (charset != null && !charset.isEmpty()) { return contentType + "; charset=" + charset; } return contentType; }
Example #27
Source File: TikaAutoInterpreter.java From db with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Reads a document with an auto-detecting Tika parser and converts it to JSON: the
 * extracted text is stored under {@code "_txt"} and each metadata entry under its name.
 *
 * @param filePath path of the file to read
 * @return the JSON representation of the document
 * @throws OperationException with {@code UNRECOGNISED_DOCUMENT_FORMAT} when the file
 *                            cannot be read or parsed, or when no text was extracted
 */
@Override
public JSONObject toJson(String filePath) throws OperationException {
    final AutoDetectParser autoParser = new AutoDetectParser();
    final BodyContentHandler bodyHandler = new BodyContentHandler();
    final Metadata tikaMetadata = new Metadata();

    try (InputStream in = new FileInputStream(new File(filePath))) {
        autoParser.parse(in, bodyHandler, tikaMetadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String extractedText = bodyHandler.toString();
    final boolean hasText = extractedText != null && !extractedText.isEmpty();
    if (!hasText) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    final JSONObject result = new JSONObject();
    result.put("_txt", extractedText);
    for (String key : tikaMetadata.names()) {
        result.put(key, tikaMetadata.get(key));
    }
    return result;
}
Example #28
Source File: TikaUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Extracts metadata from {@code tika.xlsx} via the parser API and checks the
 * {@code X-Parsed-By} and {@code Author} values.
 */
@Test
public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
    // try-with-resources closes the stream even when extraction or an assertion fails.
    try (InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.xlsx")) {
        Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);

        assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
        assertEquals("Microsoft Office User", metadata.get("Author"));
    }
}
Example #29
Source File: TikaUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Extracts metadata from {@code tika.xlsx} via the facade API and checks the
 * {@code X-Parsed-By} and {@code Author} values.
 */
@Test
public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
    // try-with-resources closes the stream even when extraction or an assertion fails.
    try (InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.xlsx")) {
        Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);

        assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
        assertEquals("Microsoft Office User", metadata.get("Author"));
    }
}
Example #30
Source File: TikaUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Extracts the text of {@code tika.docx} via the facade API and checks that it
 * contains two expected phrases.
 */
@Test
public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
    // try-with-resources closes the stream even when extraction or an assertion fails.
    try (InputStream stream = this.getClass().getClassLoader().getResourceAsStream("tika.docx")) {
        String content = TikaAnalysis.extractContentUsingFacade(stream);

        assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
        assertThat(content, containsString("detects and extracts metadata and text"));
    }
}