org.apache.tika.parser.ParseContext Java Examples

Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0

6 votes

/**
 * Feeds {@code stream} through {@code txtParser} and returns whatever text the
 * write-out handler collected. The handler is created with {@code maxStringLength};
 * a SAX failure that the handler reports as "write limit reached" is ignored so the
 * text gathered up to that point is still returned.
 *
 * @param stream   document content; always closed by this method
 * @param metadata metadata object passed to the parser
 * @return the text held by the handler after parsing
 * @throws IOException   if the parser or {@code stream.close()} raises it
 * @throws TikaException if parsing fails, or on any SAX failure other than the write limit
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	final WriteOutContentHandler collected = new WriteOutContentHandler(maxStringLength);
	try {
		final ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		final BodyContentHandler body = new BodyContentHandler(collected);
		txtParser.parse(stream, body, metadata, parseContext);
	} catch (SAXException saxFailure) {
		final boolean limitHit = collected.isWriteLimitReached(saxFailure);
		if (!limitHit) {
			// Anything other than the write limit is unexpected here.
			throw new TikaException("Unexpected SAX processing failure", saxFailure);
		}
	} finally {
		stream.close();
	}
	return collected.toString();
}

Source File: AlterPDFParser.java From tika-server with Apache License 2.0

6 votes

/**
 * Calls the static {@code process} method of {@code org.apache.tika.parser.pdf.PDF2XHTML}
 * through reflection (looked up with {@code getDeclaredMethod} and made accessible).
 * <p>
 * The OCR strategy of {@code config} is switched for the duration of the call and is
 * restored afterwards, including when the invoked method throws.
 *
 * @param document the PDF document to process
 * @param handler  receives the generated content events
 * @param context  parse context forwarded to the callee
 * @param metadata metadata forwarded to the callee
 * @param config   PDF parser configuration; its OCR strategy is temporarily overridden
 * @param noOCR    {@code true} selects {@code NO_OCR}, {@code false} selects
 *                 {@code OCR_AND_TEXT_EXTRACTION}
 * @throws ClassNotFoundException    if the PDF2XHTML class cannot be loaded
 * @throws NoSuchMethodException     if the expected {@code process} signature is missing
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callPDF2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config, boolean noOCR) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    Class<?> c = Class.forName("org.apache.tika.parser.pdf.PDF2XHTML");
    Method m = c.getDeclaredMethod("process", PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
            PDFParserConfig.class);
    m.setAccessible(true);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();

    config.setOcrStrategy(noOCR ? PDFParserConfig.OCR_STRATEGY.NO_OCR
            : PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
    try {
        m.invoke(null, document, handler, context, metadata, config);
    } finally {
        // The config object belongs to the caller: never leave the override behind.
        config.setOcrStrategy(oldOcrStrategy);
    }
}

Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * Runs OCR on an in-memory image by rendering it to a temporary PNG file and delegating
 * to the stream-based {@code parse} overload.
 * <p>
 * The image is painted into the RGB buffer before it is encoded; without that step the
 * PNG would only contain an empty canvas of the right size. The PNG output stream is
 * closed before the file is re-opened for reading, and the temporary resources are
 * disposed only after both streams are closed.
 *
 * @param image    the image to OCR
 * @param handler  receives the extracted content
 * @param metadata metadata passed on to the stream-based parse
 * @param context  parse context passed on to the stream-based parse
 * @throws IOException   if writing or reading the temporary file fails
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing or disposing temporary resources fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the source pixels into the buffer that is about to be encoded.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        tmp.dispose();
    }

}

Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * When the ImageMagick path in the context points at a made-up location, the preprocessor
 * must report no supported types and DefaultParser must still map PDF to PDFParser.
 */
@Test
public void offersNoTypesIfNotFound() throws Exception {
    PDFPreprocessorParser preprocessor = new PDFPreprocessorParser();
    DefaultParser stockParser = new DefaultParser();

    // Context carrying an ImageMagick configuration with a path that does not exist.
    ImageMagickConfig bogusConfig = new ImageMagickConfig();
    bogusConfig.setImageMagickPath("/made/up/path");
    ParseContext ctx = new ParseContext();
    ctx.set(ImageMagickConfig.class, bogusConfig);

    // The preprocessor advertises nothing ...
    assertEquals(0, preprocessor.getSupportedTypes(ctx).size());

    // ... so the PDF media type stays with the regular PDFParser.
    MediaType pdfType = MediaType.application("pdf");
    assertEquals(PDFParser.class, stockParser.getParsers(ctx).get(pdfType).getClass());
}

Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0

6 votes

/**
 * With a default (empty) context the preprocessor is expected to advertise exactly one
 * media type, {@code application/pdf}. The test is skipped unless {@code canRun()} is true.
 */
@Test
public void offersTypesIfFound() throws Exception {
    PDFPreprocessorParser preprocessor = new PDFPreprocessorParser();

    ParseContext ctx = new ParseContext();
    MediaType pdfType = MediaType.application("pdf");

    // Only meaningful when the external tooling checked by canRun() is available.
    assumeTrue(canRun());

    assertEquals(1, preprocessor.getSupportedTypes(ctx).size());
    assertTrue(preprocessor.getSupportedTypes(ctx).contains(pdfType));

    // Note: DefaultParser will not select the PDFPreprocessorParser unless it is configured
    // in the tika config, so no DefaultParser lookup is asserted here.
}

Source File: TikaCallable.java From flink-crawler with Apache License 2.0

6 votes

/**
 * Creates the ParseContext for a parse. A {@link CustomHtmlMapper} is installed only when
 * it is needed, i.e. when at least one of the link extractor's tags is one for which the
 * default mapper's {@code mapSafeElement} returns {@code null}.
 *
 * @return a new context, carrying a custom HtmlMapper if one was required
 */
private ParseContext makeParseContext() {
    final ParseContext context = new ParseContext();
    final Set<String> linkTags = _linkExtractor.getLinkTags();

    for (String linkTag : linkTags) {
        if (DefaultHtmlMapper.INSTANCE.mapSafeElement(linkTag) != null) {
            continue;
        }
        // First tag the default mapper does not know: one custom mapper covers them all.
        context.set(HtmlMapper.class,
                new CustomHtmlMapper(linkTags, _linkExtractor.getLinkAttributeTypes()));
        return context;
    }

    return context;
}

Source File: NodeTika.java From node-tika with MIT License

6 votes

/**
 * Populates {@code parseContext} from the caller-supplied options.
 * <p>
 * Without options, only an OCR configuration with OCR disabled is registered. With options,
 * the OCR configuration, a PDF parser configuration and the password handling are all
 * filled from the map.
 *
 * @param parseContext context to populate
 * @param options      option map, or {@code null} when nothing was specified
 */
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) {
	final TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
	final boolean hasOptions = options != null;

	if (hasOptions) {
		fillOcrOptions(tesseractConfig, options);
	} else {
		// No options at all: OCR is switched off.
		disableOcr(tesseractConfig);
	}
	parseContext.set(TesseractOCRConfig.class, tesseractConfig);

	if (!hasOptions) {
		return;
	}

	final PDFParserConfig pdfConfig = new PDFParserConfig();
	fillPdfOptions(pdfConfig, options);
	parseContext.set(PDFParserConfig.class, pdfConfig);

	// Encrypted files may come with a password among the options.
	fillPassword(parseContext, options);
}

Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0

6 votes

/**
 * Parses the stream with an auto-detecting parser, stores the extracted text in
 * {@code FullText} and the metadata, one {@code key=value} line per name, in
 * {@code FileMetadata}.
 *
 * @param Bytes stream with the document content
 * @return the value of {@code FullText} after the extraction attempt
 * @throws PDException declared for failures, which are handed to
 *         {@code PDException.GenPDException} (presumably it throws; confirm there)
 */
protected String Convert(InputStream Bytes) throws PDException
{
try {
ContentHandler textHandler=new BodyContentHandler(-1);
Metadata metadata=new Metadata();
Parser parser=new AutoDetectParser();
ParseContext context=new ParseContext();
parser.parse(Bytes, textHandler, metadata, context);
// Build the listing in a buffer instead of re-copying the field on every iteration.
StringBuilder metadataLines=new StringBuilder();
for (String key : metadata.names())
    {
    metadataLines.append(key).append('=').append(metadata.get(key)).append('\n');
    }
FileMetadata=metadataLines.toString();
FullText=textHandler.toString();
} catch (Exception ex)
    {
    PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

return(FullText);
}

Source File: TikaContentExtractor.java From baleen with Apache License 2.0

6 votes

/**
 * Extracts text and metadata from the stream with an auto-detecting Tika parser and stores
 * both on the JCas. If Tika fails with a SAX or Tika exception, a warning is logged and,
 * when the JCas has no document text yet, {@code CORRUPT_FILE_TEXT} is used instead.
 *
 * @param stream document content
 * @param source name of the source, used in the warning message
 * @param jCas   JCas receiving the text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    final BodyContentHandler body = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata tikaMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();

    final AutoDetectParser tika = new AutoDetectParser();
    tika.parse(stream, body, tikaMetadata, parseContext);

    final String text = body.toString();
    jCas.setDocumentText(text);

    for (String key : tikaMetadata.names()) {
      final String value = tikaMetadata.get(key);
      addMetadata(jCas, key, value);
    }
  } catch (SAXException | TikaException failure) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, failure);
    final boolean noText = Strings.isNullOrEmpty(jCas.getDocumentText());
    if (noText) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}

Source File: ImageConverter.java From openmeetings with Apache License 2.0

6 votes

/**
 * Reads the image's width and height through Tika's {@code ImageParser} and stores them
 * on the file item. Any failure is logged and reported through the returned result with
 * exit code {@code -1}.
 *
 * @param f    file item that receives the dimensions
 * @param img  image file to inspect
 * @param mime content type set on the metadata before parsing
 * @return result describing success ({@code ZERO}) or failure ({@code -1})
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult outcome = new ProcessResult();
	outcome.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream in = new FileInputStream(img)) {
		final Metadata imageMeta = new Metadata();
		imageMeta.set(CONTENT_TYPE, mime);
		final DefaultHandler ignoreContent = new DefaultHandler();
		imageParser.parse(in, ignoreContent, imageMeta, new ParseContext());
		// Dimensions come back as metadata strings.
		f.setWidth(Integer.valueOf(imageMeta.get(TIFF.IMAGE_WIDTH)));
		f.setHeight(Integer.valueOf(imageMeta.get(TIFF.IMAGE_LENGTH)));
		outcome.setExitCode(ZERO);
	} catch (Exception failure) {
		log.error("Error while getting dimensions", failure);
		outcome.setError("Error while getting dimensions");
		outcome.setException(failure.getMessage());
		outcome.setExitCode(-1);
	}
	return outcome;
}

Source File: TikaExtractor.java From ache with Apache License 2.0

6 votes

/**
 * Parses the stream with Tika and returns the body text together with all metadata
 * properties as a string map.
 *
 * @param stream      document content
 * @param fileName    file name used to create the initial metadata
 * @param contentType content type used to create the initial metadata
 * @return the parsed data, or {@code null} if Tika raised an IO, SAX or Tika exception
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler body = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler boilerpipe = new BoilerpipeContentHandler(body, KeepEverythingExtractor.INSTANCE);
    Metadata tikaMetadata = createMetadata(fileName, contentType);
    ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipe, tikaMetadata, parseContext);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }

    // Flatten the metadata into a plain name -> value map.
    Map<String, String> properties = new HashMap<>();
    for (String name : tikaMetadata.names()) {
        String value = tikaMetadata.get(name);
        properties.put(name, value);
    }

    return new ParsedData(body.toString(), properties);
}

Source File: AlterPDFParser.java From tika-server with Apache License 2.0

5 votes

/**
 * Records the encryption flag and content type on {@code metadata}, extracts the remaining
 * document metadata via {@code callExtractMetadata}, then runs the configured access
 * checker against the result.
 *
 * @param metadata    metadata object to fill and check
 * @param context     parse context forwarded to the metadata extraction
 * @param localConfig configuration supplying the access checker
 * @param pdfDocument document being inspected
 * @throws AccessPermissionException if raised by the access checker
 */
private void extractAndCheckMetadata(Metadata metadata, ParseContext context, PDFParserConfig localConfig, PDDocument pdfDocument)
        throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, AccessPermissionException {
    final String encryptedFlag = Boolean.toString(pdfDocument.isEncrypted());
    metadata.set(PDF.IS_ENCRYPTED, encryptedFlag);
    metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());

    callExtractMetadata(pdfDocument, metadata, context);

    localConfig.getAccessChecker().check(metadata);
}

Source File: HtmlSchemaParser.java From data-prep with Apache License 2.0

5 votes

/**
 * Parses the request content as HTML, turns every collected header value into a STRING
 * column (ids assigned in encounter order) and wraps the columns in a single-sheet,
 * non-draft schema. Any exception is logged at debug level and rethrown as a
 * {@code TDPException}.
 *
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {

    try {
        final SimpleHeadersContentHandler headerCollector = new SimpleHeadersContentHandler();
        final InputStream content = request.getContent();
        final HtmlParser html = new HtmlParser();
        final Metadata htmlMetadata = new Metadata();

        html.parse(content, headerCollector, htmlMetadata, new ParseContext());

        final List<ColumnMetadata> columnList = new ArrayList<>(headerCollector.getHeaderValues().size());
        for (String header : headerCollector.getHeaderValues()) {
            // ATM not doing any complicated type calculation: every column is a STRING.
            final ColumnMetadata column = ColumnMetadata.Builder
                    .column()
                    .type(Type.STRING)
                    .name(header)
                    .id(columnList.size())
                    .build();
            columnList.add(column);
        }

        final Schema.SheetContent sheet = new Schema.SheetContent();
        sheet.setColumnMetadatas(columnList);

        return Schema.Builder
                .parserResult()
                .sheetContents(Collections.singletonList(sheet))
                .draft(false)
                .build();

    } catch (Exception e) {
        LOGGER.debug("Exception during parsing html request :" + e.getMessage(), e);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e);
    }

}

Source File: TearlineContentExtractor.java From baleen with Apache License 2.0

5 votes

/**
 * Extracts text with an auto-detecting Tika parser, keeps only the part before the first
 * match of {@code tearlinePattern} (or everything when there is no match), strips
 * boilerplate, trims, and stores the result plus all metadata on the JCas. SAX and Tika
 * failures are logged as warnings.
 *
 * @param stream document content
 * @param source name of the source, used in the warning message
 * @param jCas   JCas receiving the text and metadata
 * @throws IOException if reading the stream fails
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    final BodyContentHandler body = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata tikaMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();

    final AutoDetectParser tika = new AutoDetectParser();
    tika.parse(stream, body, tikaMetadata, parseContext);

    final String extracted = body.toString();
    final Matcher tearline = tearlinePattern.matcher(extracted);
    // Content above the tearline if there is one, otherwise the whole text.
    final String kept = tearline.find() ? extracted.substring(0, tearline.start()) : extracted;
    jCas.setDocumentText(removeBoilerplate(kept).trim());

    for (String key : tikaMetadata.names()) {
      final String value = tikaMetadata.get(key);
      addMetadata(jCas, key, value);
    }
  } catch (SAXException | TikaException failure) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, failure);
  }
}

Source File: AlterPDFParser.java From tika-server with Apache License 2.0

5 votes

/**
 * Calls the static {@code process} method of {@code org.apache.tika.parser.pdf.OCR2XHTML}
 * through reflection with OCR forced on.
 * <p>
 * The reflective lookup happens first, so a lookup failure leaves both {@code context} and
 * {@code config} untouched. The three config settings changed for the call (OCR strategy,
 * inline image extraction, unique-inline-images-only) are restored afterwards, including
 * when the invoked method throws. The Tesseract configuration registered on
 * {@code context} is left in place.
 *
 * @param document the PDF document to process
 * @param handler  receives the generated content events
 * @param context  parse context; a Tesseract configuration with a 2 hour timeout is set on it
 * @param metadata metadata forwarded to the callee
 * @param config   PDF parser configuration, temporarily modified
 * @throws ClassNotFoundException    if the OCR2XHTML class cannot be loaded
 * @throws NoSuchMethodException     if the expected {@code process} signature is missing
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {

    Class<?> c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML");
    Method m = c.getDeclaredMethod("process",
            PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
            PDFParserConfig.class);
    m.setAccessible(true);

    TesseractOCRConfig cfg = new TesseractOCRConfig();
    // here I set default timeout of 2 hours
    // The calling process should check parsing process and terminate it by timeout
    cfg.setTimeout(60 * 60 * 2);
    context.set(TesseractOCRConfig.class, cfg);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
    boolean oldExtractInlineImages = config.getExtractInlineImages();
    boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly();

    // explicitly tells Tika to use OCR
    config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
    config.setExtractInlineImages(true);
    config.setExtractUniqueInlineImagesOnly(false);
    try {
        m.invoke(null, document, handler, context, metadata, config);
    } finally {
        // The config object belongs to the caller: never leave the overrides behind.
        config.setOcrStrategy(oldOcrStrategy);
        config.setExtractInlineImages(oldExtractInlineImages);
        config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly);
    }
}

Source File: AlterPDFParser.java From tika-server with Apache License 2.0

5 votes

/**
 * Invokes the superclass's {@code handleXFAOnly} method on this instance through
 * reflection, after making it accessible.
 *
 * @param pdDocument document to handle
 * @param handler    receives the generated content events
 * @param metadata   metadata forwarded to the callee
 * @param context    parse context forwarded to the callee
 * @throws NoSuchMethodException     if the superclass declares no such method
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callHandleXFAOnly(PDDocument pdDocument, ContentHandler handler,
                               Metadata metadata, ParseContext context)
        throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    final Class<?> parentClass = getClass().getSuperclass();
    final Method xfaHandler = parentClass.getDeclaredMethod("handleXFAOnly",
            PDDocument.class, ContentHandler.class, Metadata.class, ParseContext.class);
    xfaHandler.setAccessible(true);
    xfaHandler.invoke(this, pdDocument, handler, metadata, context);
}

Source File: ExcelHtmlParser.java From components with Apache License 2.0

5 votes

/**
 * Parses the content as HTML and returns the values gathered by a
 * {@code SimpleValuesContentHandler}. Parse failures are only logged at debug level, so
 * whatever the handler collected up to that point is still returned.
 *
 * @param rawContent document content
 * @param encoding   value added to the metadata under {@code CONTENT_ENCODING}
 * @param limit      limit passed to the values handler
 * @return the values held by the handler after parsing
 */
public static List<List<String>> getRows(InputStream rawContent, String encoding, long limit) {
  final SimpleValuesContentHandler collector = new SimpleValuesContentHandler(-1, limit);
  final HtmlParser html = new HtmlParser();

  final Metadata htmlMetadata = new Metadata();
  htmlMetadata.add(Metadata.CONTENT_ENCODING, encoding);

  try {
    final ParseContext parseContext = new ParseContext();
    html.parse(rawContent, collector, htmlMetadata, parseContext);
  } catch (Exception failure) {
    LOGGER.debug("Failed to parse the excel html format document.", failure);
  }

  return collector.getValues();
}

Source File: CachingTesseractOCRParser.java From extract with MIT License

5 votes

/**
 * Parses with an on-disk cache keyed by the SHA-256 hex digest of the input file.
 * <p>
 * After taking the per-hash lock, the cached output is replayed if it exists. Otherwise the
 * input is parsed while the output is written to a {@code .tmp} file, which is moved onto
 * the cache path only after parsing completed. If the lock cannot be acquired, parsing
 * falls back to the uncached path.
 * <p>
 * The temporary file is opened with truncation so that content left behind by an earlier
 * aborted run can never survive past the end of the new output, and it is deleted again
 * when parsing or the final move fails.
 *
 * @param tis      input; its file path is read to compute the hash
 * @param handler  receives the parsed or cached content
 * @param metadata metadata for the parse
 * @param context  parse context
 * @param config   OCR configuration, also used when acquiring the lock
 * @param inline   whether the inline parsing path is used
 * @throws IOException          on file system or stream failures
 * @throws SAXException         if the content handler fails
 * @throws TikaException        if parsing fails
 * @throws InterruptedException declared for interruption, presumably raised while waiting
 *                              in {@code acquireLock} (confirm there)
 */
private void cachedParse(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                        final ParseContext context, final TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException, InterruptedException {
	final String hash;

	try (final InputStream buffered = Files.newInputStream(tis.getPath())) {
		hash = DigestUtils.sha256Hex(buffered);
	}

	final Path cachePath = outputPath.resolve(hash);
	final Path cacheLock = outputPath.resolve(hash + ".lock");

	// Acquire a lock both for reading and for writing.
	// If the lock can't be acquired, parse without caching.
	if (!acquireLock(config, cacheLock)) {
		fallbackParse(tis, handler, metadata, context, config, inline);
		return;
	}

	// You won't know for sure until you try....
	try (final Reader reader = Files.newBufferedReader(cachePath, UTF_8)) {
		cacheHit();
		readFromCache(reader, handler, metadata);
	} catch (final NoSuchFileException e) {
		final Path cacheTemp = outputPath.resolve(hash + ".tmp");
		boolean cached = false;

		// Write to a temporary file and only move to the final path if parsing completes successfully.
		// This way we ensure that we don't cache partial results from Tesseract if there's an error.
		try {
			// CREATE alone would overwrite a stale temp file in place without shortening it.
			try (final Writer writer = Files.newBufferedWriter(cacheTemp, UTF_8,
					StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
				cacheMiss();
				parseToCache(tis, handler, metadata, context, config, inline, writer);
			}

			Files.move(cacheTemp, cachePath, StandardCopyOption.ATOMIC_MOVE);
			cached = true;
		} finally {
			if (!cached) {
				// Discard the partial output so it cannot leak into a later attempt.
				Files.deleteIfExists(cacheTemp);
			}
		}
	} finally {
		Files.deleteIfExists(cacheLock);
	}
}

Source File: CachingTesseractOCRParser.java From extract with MIT License

5 votes

/**
 * Wraps {@code in} in a {@code TikaInputStream} (closed on return) and delegates to the
 * {@code TikaInputStream}-based {@code cachedParse}.
 * <p>
 * An interruption is reported as a {@code TikaException}; the thread's interrupt status is
 * restored first so callers further up can still observe it.
 *
 * @param in       document content
 * @param handler  receives the parsed or cached content
 * @param metadata metadata for the parse
 * @param context  parse context
 * @param config   OCR configuration
 * @param inline   whether the inline parsing path is used
 * @throws IOException   on stream or file system failures
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing fails or the thread was interrupted
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Catching InterruptedException clears the flag; set it again before translating.
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}

Source File: CachingTesseractOCRParser.java From extract with MIT License

5 votes

/**
 * Parses through the superclass while duplicating the output: everything sent to
 * {@code handler} is also written to {@code writer} via a tee handler.
 *
 * @param tis      input to parse
 * @param handler  the caller's content handler
 * @param metadata metadata for the parse
 * @param context  parse context
 * @param config   OCR configuration, used on the inline path
 * @param inline   {@code true} to use {@code super.parseInline}, {@code false} for {@code super.parse}
 * @param writer   destination of the cached copy
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                          final ParseContext context, final TesseractOCRConfig config, final boolean inline,
                          final Writer writer) throws SAXException, IOException, TikaException {
	final WriteOutContentHandler cacheSink = new WriteOutContentHandler(writer);
	final ContentHandler tee = new TeeContentHandler(handler, cacheSink);

	if (!inline) {
		super.parse(tis, tee, metadata, context);
		return;
	}

	final XHTMLContentHandler inlineXhtml = new XHTMLContentHandler(tee, metadata);
	super.parseInline(tis, inlineXhtml, context, config);
}

Source File: CachingTesseractOCRParser.java From extract with MIT License

5 votes

/**
 * Inline parse entry point. Without an output path the call goes straight to the
 * superclass; otherwise the cached path is used with fresh metadata and with the given
 * config, or the context's (default) config when {@code config} is {@code null}.
 */
@Override
public void parseInline(final InputStream in, final XHTMLContentHandler xhtml, final ParseContext context,
                        final TesseractOCRConfig config)
		throws IOException, SAXException, TikaException {
	if (null == outputPath) {
		super.parseInline(in, xhtml, context, config);
		return;
	}

	final Metadata freshMetadata = new Metadata();
	final TesseractOCRConfig effectiveConfig;
	if (null == config) {
		effectiveConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	} else {
		effectiveConfig = config;
	}
	cachedParse(in, xhtml, freshMetadata, context, effectiveConfig, true);
}

Source File: CachingTesseractOCRParser.java From extract with MIT License

5 votes

/**
 * Regular parse entry point. Without an output path the call goes straight to the
 * superclass; otherwise the cached, non-inline path is used with the OCR config taken
 * from the context (falling back to {@code DEFAULT_CONFIG}).
 */
@Override
public void parse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context)
		throws IOException, SAXException, TikaException {
	if (null == outputPath) {
		super.parse(in, handler, metadata, context);
		return;
	}

	final TesseractOCRConfig contextConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	cachedParse(in, handler, metadata, context, contextConfig, false);
}

Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0

5 votes

/**
 * Extracts text from the object data with an auto-detecting Tika parser, serialising the
 * SAX events as text through a transformer, and assembles the result document.
 * <p>
 * Keys ending in {@code tika.exception.testing.pdf} (compared case-insensitively,
 * independent of the default locale) trigger a synthetic {@code TikaException}. A
 * {@code TikaException} yields the exception result instead of the extraction result.
 *
 * @param bucket     bucket name, passed through to the result assembly
 * @param key        object key, passed through to the result assembly
 * @param objectData object content
 * @return the assembled extraction result, or the assembled exception result
 * @throws IOException                       if reading the data fails
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException                      if SAX processing fails
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    if( key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}

Source File: TikaIO.java From beam with Apache License 2.0

5 votes

/**
 * Parses one readable file with Tika and outputs a {@code ParseResult}: a success result
 * with the extracted text and metadata, or a failure result carrying the text gathered so
 * far, the metadata and the exception.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile readable = c.element();
  InputStream raw = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(raw)) {
    // Use the configured Tika setup when one was supplied.
    Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    Metadata tikaMetadata;
    if (spec.getInputMetadata() != null) {
      tikaMetadata = spec.getInputMetadata();
    } else {
      tikaMetadata = new Metadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = readable.getMetadata().resourceId().toString();
    ContentHandler textHandler = new ToTextContentHandler();
    ParseResult outcome;
    try {
      parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
    } catch (Exception failure) {
      outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, failure);
    }

    c.output(outcome);
  }
}

Source File: ParseContextConfig.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds a new ParseContext and registers every configured (class, instance) pair on it.
 *
 * @return a freshly populated ParseContext
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public ParseContext create() {
  final ParseContext context = new ParseContext();

  // Raw cast: the map cannot express that each value matches its key's type.
  entries.forEach((type, instance) -> context.set((Class) type, instance));

  return context;
}

Source File: TikaCallable.java From flink-crawler with Apache License 2.0

5 votes

/**
 * Captures everything needed for one parse run; the arguments are stored as given.
 *
 * @param parser           parser to run
 * @param contentExtractor content extractor to use
 * @param linkExtractor    link extractor to use
 * @param input            document content
 * @param metadata         metadata for the parse
 * @param extractLanguage  language-extraction flag
 * @param parseContext     parse context for the run
 */
public TikaCallable(Parser parser, BaseContentExtractor contentExtractor,
        BaseLinkExtractor linkExtractor, InputStream input, Metadata metadata,
        boolean extractLanguage, ParseContext parseContext) {
    this._parser = parser;
    this._parseContext = parseContext;
    this._input = input;
    this._metadata = metadata;

    this._contentExtractor = contentExtractor;
    this._linkExtractor = linkExtractor;
    this._extractLanguage = extractLanguage;
}

Source File: WidgetMacroLibraryTests.java From scipio-erp with Apache License 2.0

5 votes

/**
 * Requests the "Fop" screen, checks that the response is reported as a UTF-8 PDF, extracts
 * its text with Tika's PDFParser and asserts that no FreeMarker template error is in it.
 */
public void testFopMacroLibrary() throws Exception {
    final String fopScreenUrl = screenUrl.concat("Fop");
    final HttpClient client = initHttpClient();
    client.setUrl(fopScreenUrl.concat(authentificationQuery));
    //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace
    final InputStream pdfStream = (InputStream) client.postStream();
    assertNotNull("Response failed from ofbiz", pdfStream);
    assertEquals("Response contentType isn't good : " + client.getResponseContentType(), "application/pdf;charset=UTF-8", client.getResponseContentType());

    final String pdfText;
    try {
        final BodyContentHandler body = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata pdfMetadata = new Metadata();
        final PDFParser pdfParser = new PDFParser();
        pdfParser.parse(pdfStream, body, pdfMetadata, new ParseContext());
        pdfText = body.toString();
    } finally {
        pdfStream.close();
    }
    // A rendered FreeMarker error would show up in the extracted text.
    assertFalse("Fop Screen contains Macro on error : see " + fopScreenUrl + " for more detail", pdfText.contains("FreeMarker template error:"));
}

Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0

5 votes

/**
 * Advertises {@code SUPPORTED_TYPES} only when {@code hasTesseract} accepts the OCR
 * configuration from the context (or the default one); otherwise advertises nothing so
 * that other image parsers can be selected instead.
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    final TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    return hasTesseract(ocrConfig) ? SUPPORTED_TYPES : Collections.<MediaType>emptySet();
}

Source File: PDFPreprocessorParser.java From CogStack-Pipeline with Apache License 2.0

5 votes

/**
 * Advertises {@code SUPPORTED_TYPES} only when {@code hasImageMagick} accepts the
 * ImageMagick configuration from the context (or the default one); otherwise advertises
 * nothing so that other parsers can be selected instead.
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    final ImageMagickConfig magickConfig = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);
    if (!hasImageMagick(magickConfig)) {
        // Not usable here: step aside.
        return Collections.emptySet();
    }
    return SUPPORTED_TYPES;
}

Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License

5 votes

/**
 * Re-parses the root document with a digest-matching embedded-document extractor installed
 * on the context and returns the document that extractor captured.
 * <p>
 * The file stream opened for the root document is closed when parsing ends, whether it
 * succeeds or fails.
 *
 * @param rootDocument           the document whose file is parsed
 * @param embeddedDocumentDigest digest identifying the embedded document to capture
 * @return the document held by the extractor after parsing
 * @throws SAXException  if the content handler fails
 * @throws TikaException if parsing fails
 * @throws IOException   if the file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // The stream is opened here, so it is closed here as well.
    try (FileInputStream rootStream = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(rootStream, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}

org.apache.tika.parser.ParseContext Java Examples