org.apache.tika.parser.ParseContext Java Examples

The following examples show how to use org.apache.tika.parser.ParseContext. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Runs {@code txtParser} over the stream and returns the text collected by the handler.
 *
 * <p>Output goes into a {@link WriteOutContentHandler} constructed with
 * {@code maxStringLength}. A {@link SAXException} that the handler identifies as its write
 * limit being reached is swallowed and the text gathered so far is returned; any other
 * {@link SAXException} is rethrown wrapped in a {@link TikaException}. The stream is closed
 * in all cases.
 *
 * @param stream   input to parse; closed before this method returns
 * @param metadata metadata object handed to the parser
 * @return the string form of the write-out handler after parsing
 * @throws IOException   propagated from the parser or from closing the stream
 * @throws TikaException propagated from the parser, or wrapping an unexpected SAX failure
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	final WriteOutContentHandler textSink = new WriteOutContentHandler(maxStringLength);
	try {
		final ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		final BodyContentHandler bodyOnly = new BodyContentHandler(textSink);
		txtParser.parse(stream, bodyOnly, metadata, parseContext);
	} catch (SAXException saxFailure) {
		final boolean limitReached = textSink.isWriteLimitReached(saxFailure);
		if (!limitReached) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", saxFailure);
		}
	} finally {
		stream.close();
	}
	return textSink.toString();
}
 
Example #2
Source File: AlterPDFParser.java    From tika-server with Apache License 2.0 6 votes vote down vote up
/**
 * Reflectively invokes the static {@code process} method declared on
 * {@code org.apache.tika.parser.pdf.PDF2XHTML}, temporarily overriding the OCR strategy
 * on {@code config}.
 *
 * <p>The strategy is set to {@code NO_OCR} when {@code noOCR} is true and to
 * {@code OCR_AND_TEXT_EXTRACTION} otherwise. The previous strategy is restored in a
 * {@code finally} block, so the caller's configuration is not left modified when the
 * invoked method throws.
 *
 * @param document PDF document passed to the invoked method
 * @param handler  content handler passed to the invoked method
 * @param context  parse context passed to the invoked method
 * @param metadata metadata passed to the invoked method
 * @param config   parser configuration; its OCR strategy is changed only for the call
 * @param noOCR    whether OCR is disabled for this call
 * @throws ClassNotFoundException    if the {@code PDF2XHTML} class cannot be loaded
 * @throws NoSuchMethodException     if the expected {@code process} signature is absent
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callPDF2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config, boolean noOCR) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    Class<?> c = Class.forName("org.apache.tika.parser.pdf.PDF2XHTML");
    Method m = c.getDeclaredMethod("process", PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
            PDFParserConfig.class);
    m.setAccessible(true);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();

    config.setOcrStrategy(noOCR ? PDFParserConfig.OCR_STRATEGY.NO_OCR
            : PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
    try {
        // Static method, hence the null receiver.
        m.invoke(null, document, handler, context, metadata, config);
    } finally {
        // Always put the caller's strategy back, even when processing fails.
        config.setOcrStrategy(oldOcrStrategy);
    }
}
 
Example #3
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Renders the given image to a temporary PNG file and parses that file through the
 * stream-based {@code parse} overload.
 *
 * <p>The image is painted onto an RGB {@link BufferedImage} before it is written; without
 * that step the PNG would only contain the blank, freshly allocated buffer. The output
 * stream is closed before parsing starts, and the input stream is closed before the
 * temporary resources are disposed.
 *
 * @param image    image to run through the parser
 * @param handler  content handler receiving the parse output
 * @param metadata metadata passed through to the stream-based overload
 * @param context  parse context passed through to the stream-based overload
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  propagated from the stream-based overload
 * @throws TikaException propagated from the stream-based overload or from disposing
 *                       the temporary resources
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the source pixels into the buffer; a new BufferedImage starts out blank.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the writer before the file is read back.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Streams are already closed here, so the temporary file can be removed.
        tmp.dispose();
    }

}
 
Example #4
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a parse context carrying an ImageMagick path of {@code /made/up/path} makes
 * {@code PDFPreprocessorParser} report no supported types, and that {@code DefaultParser}
 * then maps {@code application/pdf} to {@code PDFParser}.
 */
@Test
public void offersNoTypesIfNotFound() throws Exception {
    PDFPreprocessorParser preprocessor = new PDFPreprocessorParser();
    DefaultParser fallback = new DefaultParser();
    MediaType pdfType = MediaType.application("pdf");

    // Build a context whose ImageMagick configuration points at a made-up location.
    ImageMagickConfig brokenConfig = new ImageMagickConfig();
    brokenConfig.setImageMagickPath("/made/up/path");
    ParseContext ctx = new ParseContext();
    ctx.set(ImageMagickConfig.class, brokenConfig);

    // Nothing is advertised for that configuration ...
    int offeredTypes = preprocessor.getSupportedTypes(ctx).size();
    assertEquals(0, offeredTypes);

    // ... and DefaultParser resolves PDF to the plain PDFParser.
    Object chosenParser = fallback.getParsers(ctx).get(pdfType);
    assertEquals(PDFParser.class, chosenParser.getClass());
}
 
Example #5
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that, with an empty parse context, {@code PDFPreprocessorParser} reports exactly
 * one supported type and that this type is {@code application/pdf}.
 *
 * <p>The test is skipped unless {@code canRun()} returns true.
 */
@Test
public void offersTypesIfFound() throws Exception {
    PDFPreprocessorParser parser = new PDFPreprocessorParser();
    //DefaultParser defaultParser = new DefaultParser();

    ParseContext parseContext = new ParseContext();
    MediaType pdf = MediaType.application("pdf");

    // Skip unless canRun() reports the environment is usable.
    // NOTE(review): presumably this checks for the external tools (ImageMagick/Tesseract) - confirm against canRun().
    assumeTrue(canRun());

    // Exactly one supported type is expected, and it must be application/pdf.
    assertEquals(1, parser.getSupportedTypes(parseContext).size());
    assertTrue(parser.getSupportedTypes(parseContext).contains(pdf));

    // DefaultParser will not select the PDFPreprocessorParser, unless configured in tika config
    //assertEquals(PDFPreprocessorParser.class, defaultParser.getParsers(parseContext).get(pdf).getClass());
}
 
Example #6
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the parse context, installing a {@code CustomHtmlMapper} only when needed.
 *
 * <p>A custom mapper is registered under {@code HtmlMapper.class} if at least one tag
 * returned by the link extractor is not mapped by {@code DefaultHtmlMapper} (that is,
 * {@code mapSafeElement} returns {@code null} for it).
 *
 * @return a new parse context, with or without a custom {@code HtmlMapper} entry
 */
private ParseContext makeParseContext() {
    final ParseContext context = new ParseContext();
    final Set<String> linkTags = _linkExtractor.getLinkTags();
    final HtmlMapper stockMapper = DefaultHtmlMapper.INSTANCE;

    // Stop at the first tag the stock mapper does not know about.
    boolean needsCustomMapper = false;
    for (String linkTag : linkTags) {
        if (stockMapper.mapSafeElement(linkTag) == null) {
            needsCustomMapper = true;
            break;
        }
    }

    if (needsCustomMapper) {
        context.set(HtmlMapper.class,
                new CustomHtmlMapper(linkTags, _linkExtractor.getLinkAttributeTypes()));
    }
    return context;
}
 
Example #7
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Registers OCR, PDF and password settings on {@code parseContext} based on {@code options}.
 *
 * <p>When {@code options} is {@code null}, the only entry registered is a
 * {@link TesseractOCRConfig} that has been passed through {@code disableOcr}; no PDF or
 * password settings are applied.
 *
 * @param parseContext context that receives the configuration objects
 * @param options      caller-supplied options, or {@code null} for none
 */
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) {
	final TesseractOCRConfig tesseract = new TesseractOCRConfig();
	final boolean optionsGiven = options != null;

	if (optionsGiven) {
		fillOcrOptions(tesseract, options);
	} else {
		// Disable OCR if no options are specified.
		disableOcr(tesseract);
	}
	parseContext.set(TesseractOCRConfig.class, tesseract);

	if (!optionsGiven) {
		return;
	}

	final PDFParserConfig pdf = new PDFParserConfig();
	fillPdfOptions(pdf, options);
	parseContext.set(PDFParserConfig.class, pdf);

	// Allow a password to be specified for encrypted files.
	fillPassword(parseContext, options);
}
 
Example #8
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Parses the given stream with an {@code AutoDetectParser} and stores the results in the
 * {@code FileMetadata} and {@code FullText} fields.
 *
 * <p>{@code FileMetadata} receives one {@code key=value} line per metadata name;
 * {@code FullText} receives the string form of the body handler, which is created with
 * {@code -1} as its constructor argument. Any exception is reported through
 * {@code PDException.GenPDException}.
 * NOTE(review): if {@code GenPDException} does not throw, the previous value of
 * {@code FullText} is returned - confirm.
 *
 * @param Bytes stream containing the document to extract
 * @return the value of {@code FullText} after extraction
 * @throws PDException declared for failures reported via {@code PDException.GenPDException}
 */
protected String Convert(InputStream Bytes) throws PDException
{
try {
ContentHandler textHandler=new BodyContentHandler(-1);
Metadata metadata=new Metadata();
Parser parser=new AutoDetectParser();
ParseContext context=new ParseContext();
parser.parse(Bytes, textHandler, metadata, context);
// Accumulate in a builder instead of re-copying the whole string for every key.
StringBuilder MetadataText=new StringBuilder();
for (String key : metadata.names())
    MetadataText.append(key).append('=').append(metadata.get(key)).append('\n');
FileMetadata=MetadataText.toString();
FullText=textHandler.toString();
} catch (Exception ex)
    {
    PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

return(FullText);
}
 
Example #9
Source File: TikaContentExtractor.java    From baleen with Apache License 2.0 6 votes vote down vote up
/**
 * Extracts text and metadata from the stream with an {@code AutoDetectParser} and stores
 * them on the JCas.
 *
 * <p>SAX and Tika failures are logged as warnings rather than propagated; in that case the
 * document text is set to {@code CORRUPT_FILE_TEXT} if it is still null or empty.
 *
 * @param stream stream to parse
 * @param source source description, used in the warning message
 * @param jCas   JCas that receives the document text and metadata
 * @throws IOException propagated from the superclass call or the parser
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    final BodyContentHandler bodyText = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata extracted = new Metadata();
    final ParseContext parseContext = new ParseContext();

    new AutoDetectParser().parse(stream, bodyText, extracted, parseContext);

    jCas.setDocumentText(bodyText.toString());

    for (String key : extracted.names()) {
      final String value = extracted.get(key);
      addMetadata(jCas, key, value);
    }
  } catch (SAXException | TikaException parseFailure) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, parseFailure);
    final boolean noTextYet = Strings.isNullOrEmpty(jCas.getDocumentText());
    if (noTextYet) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}
 
Example #10
Source File: ImageConverter.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the image dimensions with an {@code ImageParser} and stores them on the file item.
 *
 * <p>On success the exit code is set to {@code ZERO}. Any exception is logged and recorded
 * on the result with exit code {@code -1}; it is not rethrown.
 *
 * @param f    file item that receives the width and height
 * @param img  image file to read
 * @param mime content type set on the metadata before parsing
 * @return the process result describing success or failure
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult result = new ProcessResult();
	result.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream imageStream = new FileInputStream(img)) {
		final Metadata imageMeta = new Metadata();
		imageMeta.set(CONTENT_TYPE, mime);
		imageParser.parse(imageStream, new DefaultHandler(), imageMeta, new ParseContext());

		final String width = imageMeta.get(TIFF.IMAGE_WIDTH);
		f.setWidth(Integer.valueOf(width));
		final String height = imageMeta.get(TIFF.IMAGE_LENGTH);
		f.setHeight(Integer.valueOf(height));

		result.setExitCode(ZERO);
	} catch (Exception failure) {
		log.error("Error while getting dimensions", failure);
		result.setError("Error while getting dimensions");
		result.setException(failure.getMessage());
		result.setExitCode(-1);
	}
	return result;
}
 
Example #11
Source File: TikaExtractor.java    From ache with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the stream and returns the extracted text together with a copy of the metadata.
 *
 * @param stream      stream to parse
 * @param fileName    file name passed to {@code createMetadata}
 * @param contentType content type passed to {@code createMetadata}
 * @return the parsed data, or {@code null} if parsing fails with an I/O, SAX or Tika error
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    final BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    final BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    final Metadata tikaMetadata = createMetadata(fileName, contentType);
    final ParseContext parseContext = new ParseContext();
    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        // Flatten the metadata into a plain name -> value map.
        final Map<String, String> properties = new HashMap<>();
        for (String name : tikaMetadata.names()) {
            final String value = tikaMetadata.get(name);
            properties.put(name, value);
        }

        final String text = bodyHandler.toString();
        return new ParsedData(text, properties);

    } catch (IOException | SAXException | TikaException failure) {
        logger.error("Failed to extract metadata using Tika.", failure);
        return null;
    }
}
 
Example #12
Source File: AlterPDFParser.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Records the encryption flag and content type on {@code metadata}, delegates to
 * {@code callExtractMetadata}, then runs the configured access checker against the result.
 *
 * @param metadata    metadata to populate and check
 * @param context     parse context forwarded to {@code callExtractMetadata}
 * @param localConfig configuration supplying the access checker
 * @param pdfDocument document whose metadata is extracted
 * @throws AccessPermissionException declared for the access check
 */
private void extractAndCheckMetadata(Metadata metadata, ParseContext context, PDFParserConfig localConfig, PDDocument pdfDocument)
        throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, AccessPermissionException {
    final String encryptedFlag = Boolean.toString(pdfDocument.isEncrypted());
    metadata.set(PDF.IS_ENCRYPTED, encryptedFlag);
    metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
    callExtractMetadata(pdfDocument, metadata, context);

    localConfig.getAccessChecker().check(metadata);
}
 
Example #13
Source File: HtmlSchemaParser.java    From data-prep with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the request content as HTML and builds a schema with one STRING column per
 * header value collected by the content handler.
 *
 * <p>Any exception is logged at debug level and rethrown as a {@code TDPException} with
 * {@code CommonErrorCodes.UNEXPECTED_EXCEPTION}.
 *
 * @see SchemaParser#parse(Request)
 */
@Override
public Schema parse(Request request) {

    try {
        final SimpleHeadersContentHandler headerCollector = new SimpleHeadersContentHandler();
        final InputStream content = request.getContent();
        final HtmlParser tikaHtmlParser = new HtmlParser();
        final Metadata tikaMetadata = new Metadata();

        tikaHtmlParser.parse(content, headerCollector, tikaMetadata, new ParseContext());

        final List<ColumnMetadata> columnList = new ArrayList<>(headerCollector.getHeaderValues().size());
        for (String header : headerCollector.getHeaderValues()) {
            // The column id is the current list size, i.e. the zero-based position.
            final ColumnMetadata column = ColumnMetadata.Builder
                    .column() //
                    .type(Type.STRING) // ATM not doing any complicated type calculation
                    .name(header) //
                    .id(columnList.size()) //
                    .build();
            columnList.add(column);
        }

        final Schema.SheetContent sheet = new Schema.SheetContent();
        sheet.setColumnMetadatas(columnList);

        return Schema.Builder
                .parserResult() //
                .sheetContents(Collections.singletonList(sheet)) //
                .draft(false) //
                .build();

    } catch (Exception failure) {
        LOGGER.debug("Exception during parsing html request :" + failure.getMessage(), failure);
        throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, failure);
    }

}
 
Example #14
Source File: TearlineContentExtractor.java    From baleen with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts text with an {@code AutoDetectParser}, keeps only the part before the first
 * match of {@code tearlinePattern} (or all of it when there is no match), runs it through
 * {@code removeBoilerplate}, trims it and stores it on the JCas together with the metadata.
 *
 * <p>SAX and Tika failures are logged as warnings rather than propagated.
 *
 * @param stream stream to parse
 * @param source source description, used in the warning message
 * @param jCas   JCas that receives the document text and metadata
 * @throws IOException propagated from the superclass call or the parser
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    final BodyContentHandler bodyText = new BodyContentHandler(Integer.MAX_VALUE);
    final Metadata extracted = new Metadata();
    final ParseContext parseContext = new ParseContext();

    new AutoDetectParser().parse(stream, bodyText, extracted, parseContext);

    final String everything = bodyText.toString();
    final Matcher tearline = tearlinePattern.matcher(everything);
    // Keep only what precedes the tearline, when one is present.
    final String kept = tearline.find() ? everything.substring(0, tearline.start()) : everything;
    jCas.setDocumentText(removeBoilerplate(kept).trim());

    for (String key : extracted.names()) {
      final String value = extracted.get(key);
      addMetadata(jCas, key, value);
    }
  } catch (SAXException | TikaException parseFailure) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, parseFailure);
  }
}
 
Example #15
Source File: AlterPDFParser.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Reflectively invokes the static {@code process} method declared on
 * {@code org.apache.tika.parser.pdf.OCR2XHTML} with OCR-only settings applied to
 * {@code config} for the duration of the call.
 *
 * <p>A fresh {@link TesseractOCRConfig} with a timeout value of {@code 60 * 60 * 2} is
 * registered on {@code context}. The reflective target is resolved before {@code config}
 * is touched, and the previous OCR strategy and inline-image settings are restored in a
 * {@code finally} block, so the caller's configuration is not left modified on failure.
 *
 * @param document PDF document passed to the invoked method
 * @param handler  content handler passed to the invoked method
 * @param context  parse context; receives the Tesseract configuration
 * @param metadata metadata passed to the invoked method
 * @param config   parser configuration; changed only for the duration of the call
 * @throws ClassNotFoundException    if the {@code OCR2XHTML} class cannot be loaded
 * @throws NoSuchMethodException     if the expected {@code process} signature is absent
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {

    TesseractOCRConfig cfg = new TesseractOCRConfig();
    // here I set default timeout of 2 hours
    // The calling process should check parsing process and terminate it by timeout
    cfg.setTimeout(60 * 60 * 2);
    context.set(TesseractOCRConfig.class, cfg);

    // Resolve the target first so a lookup failure cannot leave config half-modified.
    Class<?> c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML");
    Method m = c.getDeclaredMethod("process",
            PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
            PDFParserConfig.class);
    m.setAccessible(true);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
    boolean oldExtractInlineImages = config.getExtractInlineImages();
    boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly();

    // explicitly tells Tika to use OCR
    config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
    config.setExtractInlineImages(true);
    config.setExtractUniqueInlineImagesOnly(false);
    try {
        // Static method, hence the null receiver.
        m.invoke(null, document, handler, context, metadata, config);
    } finally {
        // Always restore the caller's settings, even when processing fails.
        config.setOcrStrategy(oldOcrStrategy);
        config.setExtractInlineImages(oldExtractInlineImages);
        config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly);
    }
}
 
Example #16
Source File: AlterPDFParser.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Reflectively invokes the {@code handleXFAOnly} method declared on this object's direct
 * superclass, passing the arguments through unchanged.
 *
 * @throws NoSuchMethodException     if the superclass declares no matching method
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callHandleXFAOnly(PDDocument pdDocument, ContentHandler handler,
                               Metadata metadata, ParseContext context)
        throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    final Class<?> parentClass = getClass().getSuperclass();
    final Method xfaOnly = parentClass.getDeclaredMethod("handleXFAOnly",
            PDDocument.class, ContentHandler.class, Metadata.class, ParseContext.class);
    xfaOnly.setAccessible(true);
    xfaOnly.invoke(this, pdDocument, handler, metadata, context);
}
 
Example #17
Source File: ExcelHtmlParser.java    From components with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the content as HTML and returns the values gathered by a
 * {@code SimpleValuesContentHandler} constructed with {@code -1} and {@code limit}.
 *
 * <p>Parsing failures are logged at debug level and not rethrown; whatever the handler
 * collected up to that point is returned.
 *
 * @param rawContent stream holding the HTML document
 * @param encoding   value added to the metadata under {@code Metadata.CONTENT_ENCODING}
 * @param limit      second constructor argument of the values handler
 * @return the values reported by the handler
 */
public static List<List<String>> getRows(InputStream rawContent, String encoding, long limit) {
  final SimpleValuesContentHandler rowCollector = new SimpleValuesContentHandler(-1, limit);
  final HtmlParser tikaHtmlParser = new HtmlParser();

  final Metadata encodingHint = new Metadata();
  encodingHint.add(Metadata.CONTENT_ENCODING, encoding);

  try {
    final ParseContext parseContext = new ParseContext();
    tikaHtmlParser.parse(rawContent, rowCollector, encodingHint, parseContext);
  } catch (Exception failure) {
    LOGGER.debug("Failed to parse the excel html format document.", failure);
  }

  return rowCollector.getValues();
}
 
Example #18
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the stream through a cache keyed by the SHA-256 hex digest of its content.
 *
 * <p>If {@code acquireLock} fails for the {@code <hash>.lock} path, the input is parsed
 * without caching via {@code fallbackParse}. Otherwise the cached file is replayed when it
 * exists; when it does not, the parse output is written to {@code <hash>.tmp} and moved
 * atomically onto the cache path once parsing has completed. The lock file is deleted in
 * all cases after the lock was acquired.
 *
 * <p>The temporary file is opened with {@code TRUNCATE_EXISTING} so that a leftover file
 * from an earlier failed attempt cannot leave stale trailing content in the cache.
 *
 * @param tis      input whose backing path is hashed and then parsed
 * @param handler  content handler receiving the output
 * @param metadata metadata forwarded to the parse and cache helpers
 * @param context  parse context forwarded to the parse helpers
 * @param config   OCR configuration forwarded to the lock and parse helpers
 * @param inline   forwarded to the parse helpers to select inline parsing
 * @throws IOException          on hashing, cache file or parse I/O failures
 * @throws SAXException         propagated from the helpers
 * @throws TikaException        propagated from the helpers
 * @throws InterruptedException declared for the lock acquisition and helpers
 */
private void cachedParse(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                        final ParseContext context, final TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException, InterruptedException {
	final String hash;

	try (final InputStream buffered = Files.newInputStream(tis.getPath())) {
		hash = DigestUtils.sha256Hex(buffered);
	}

	final Path cachePath = outputPath.resolve(hash);
	final Path cacheLock = outputPath.resolve(hash + ".lock");

	// Acquire a lock both for reading and for writing.
	// If the lock can't be acquired, parse without caching.
	if (!acquireLock(config, cacheLock)) {
		fallbackParse(tis, handler, metadata, context, config, inline);
		return;
	}

	// You won't know for sure until you try....
	try (final Reader reader = Files.newBufferedReader(cachePath, UTF_8)) {
		cacheHit();
		readFromCache(reader, handler, metadata);
	} catch (final NoSuchFileException e) {
		final Path cacheTemp = outputPath.resolve(hash + ".tmp");

		// Write to a temporary file and only move to the final path if parsing completes successfully.
		// This way we ensure that we don't cache partial results from Tesseract if there's an error.
		// TRUNCATE_EXISTING discards a stale temp file left by an earlier failed attempt.
		try (final Writer writer = Files.newBufferedWriter(cacheTemp, UTF_8,
				StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
			cacheMiss();
			parseToCache(tis, handler, metadata, context, config, inline, writer);
		}

		Files.move(cacheTemp, cachePath, StandardCopyOption.ATOMIC_MOVE);
	} finally {
		Files.deleteIfExists(cacheLock);
	}
}
 
Example #19
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Wraps the stream in a {@link TikaInputStream} and delegates to the
 * {@code TikaInputStream}-based {@code cachedParse} overload. The wrapper is closed when
 * the call returns.
 *
 * <p>An {@link InterruptedException} is rethrown as a {@link TikaException}; the thread's
 * interrupt flag is set again first so the interruption is not lost to callers.
 *
 * @throws IOException   propagated from the delegate or from closing the wrapper
 * @throws SAXException  propagated from the delegate
 * @throws TikaException propagated from the delegate, or wrapping an interruption
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Catching InterruptedException clears the flag; restore it before translating.
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}
 
Example #20
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the stream with the superclass implementation while duplicating the output to
 * both the caller's handler and the given writer.
 *
 * @param inline when true, {@code super.parseInline} is used with an
 *               {@code XHTMLContentHandler} around the tee; otherwise {@code super.parse}
 * @param writer destination for the duplicated output
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                          final ParseContext context, final TesseractOCRConfig config, final boolean inline,
                          final Writer writer) throws SAXException, IOException, TikaException {
	final ContentHandler cacheSink = new WriteOutContentHandler(writer);
	final ContentHandler both = new TeeContentHandler(handler, cacheSink);

	if (!inline) {
		super.parse(tis, both, metadata, context);
		return;
	}

	final XHTMLContentHandler xhtml = new XHTMLContentHandler(both, metadata);
	super.parseInline(tis, xhtml, context, config);
}
 
Example #21
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses inline content, going through the cache when {@code outputPath} is set and
 * deferring to the superclass otherwise.
 *
 * <p>On the cached path a new {@link Metadata} is used, and a {@code null} {@code config}
 * is replaced by the context's {@link TesseractOCRConfig}, falling back to
 * {@code DEFAULT_CONFIG}.
 */
@Override
public void parseInline(final InputStream in, final XHTMLContentHandler xhtml, final ParseContext context,
                        final TesseractOCRConfig config)
		throws IOException, SAXException, TikaException {
	if (null == outputPath) {
		super.parseInline(in, xhtml, context, config);
		return;
	}

	final TesseractOCRConfig effectiveConfig;
	if (null == config) {
		effectiveConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	} else {
		effectiveConfig = config;
	}
	cachedParse(in, xhtml, new Metadata(), context, effectiveConfig, true);
}
 
Example #22
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the stream, going through the cache when {@code outputPath} is set and deferring
 * to the superclass otherwise.
 *
 * <p>On the cached path the OCR configuration is read from the context, falling back to
 * {@code DEFAULT_CONFIG}.
 */
@Override
public void parse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context)
		throws IOException, SAXException, TikaException {
	if (null == outputPath) {
		super.parse(in, handler, metadata, context);
		return;
	}

	final TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	cachedParse(in, handler, metadata, context, ocrConfig, false);
}
 
Example #23
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts text from the object data with an {@code AutoDetectParser}, serialising the
 * SAX events through a transformer handler whose output method is {@code text}.
 *
 * <p>A key ending (case-insensitively) in {@code tika.exception.testing.pdf} triggers a
 * deliberate {@link TikaException} for synthetic transactions. A {@link TikaException}
 * is logged and turned into the result of {@code assembleExceptionResult}; it is not
 * rethrown.
 *
 * @param bucket     bucket name forwarded to the result assemblers
 * @param key        object key; also checked for the synthetic-failure suffix
 * @param objectData stream holding the object content
 * @return the result of {@code assembleExtractionResult} on success, or of
 *         {@code assembleExceptionResult} when Tika fails
 * @throws IOException                       propagated from the parser
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 * @throws SAXException                      propagated from the parser
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    // Locale.ROOT keeps the comparison independent of the default locale's casing rules.
    if( key.toLowerCase(java.util.Locale.ROOT).endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
Example #24
Source File: TikaIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Parses one readable file with Tika and outputs a {@code ParseResult}.
 *
 * <p>The parser is an {@code AutoDetectParser}, built from {@code tikaConfig} when one is
 * set. The metadata object comes from the spec when it provides one, otherwise a new one
 * is created; the spec's content-type hint is applied when present. Any exception thrown
 * while parsing is captured in a failure result rather than propagated.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  final ReadableFile readable = c.element();
  final InputStream rawStream = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
    final Parser tikaParser;
    if (tikaConfig == null) {
      tikaParser = new AutoDetectParser();
    } else {
      tikaParser = new AutoDetectParser(tikaConfig);
    }

    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, tikaParser);

    Metadata tikaMetadata = spec.getInputMetadata();
    if (tikaMetadata == null) {
      tikaMetadata = new Metadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    final String location = readable.getMetadata().resourceId().toString();
    final ContentHandler textHandler = new ToTextContentHandler();
    ParseResult outcome;
    try {
      tikaParser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
    } catch (Exception failure) {
      // Keep whatever text was produced before the failure.
      outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, failure);
    }

    c.output(outcome);
  }
}
 
Example #25
Source File: ParseContextConfig.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new {@code ParseContext} and copies every entry of {@code entries} into it,
 * using the entry key as the class and the entry value as the instance.
 *
 * @return the populated parse context
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public ParseContext create() {
  final ParseContext context = new ParseContext();

  for (Map.Entry<Class<?>, Object> binding : entries.entrySet()){
    // The raw key type is needed because the value's static type is only Object.
    final Class rawKey = binding.getKey();
    final Object instance = binding.getValue();
    context.set(rawKey, instance);
  }

  return context;
}
 
Example #26
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the collaborators and inputs for later use; no other work is done here.
 *
 * @param parser           parser to store
 * @param contentExtractor content extractor to store
 * @param linkExtractor    link extractor to store
 * @param input            input stream to store
 * @param metadata         metadata to store
 * @param extractLanguage  language-extraction flag to store
 * @param parseContext     parse context to store
 */
public TikaCallable(Parser parser, BaseContentExtractor contentExtractor,
        BaseLinkExtractor linkExtractor, InputStream input, Metadata metadata,
        boolean extractLanguage, ParseContext parseContext) {
    this._parser = parser;
    this._contentExtractor = contentExtractor;
    this._linkExtractor = linkExtractor;
    this._input = input;
    this._metadata = metadata;
    this._extractLanguage = extractLanguage;
    this._parseContext = parseContext;
}
 
Example #27
Source File: WidgetMacroLibraryTests.java    From scipio-erp with Apache License 2.0 5 votes vote down vote up
/**
 * Posts to the "Fop" screen URL, checks that the response is a PDF, extracts its text
 * with {@code PDFParser} and asserts that it contains no FreeMarker template error.
 */
public void testFopMacroLibrary() throws Exception {
    final String fopScreenUrl = screenUrl.concat("Fop");
    final HttpClient client = initHttpClient();
    client.setUrl(fopScreenUrl.concat(authentificationQuery));
    //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace
    final InputStream pdfStream = (InputStream) client.postStream();
    assertNotNull("Response failed from ofbiz", pdfStream);
    assertEquals("Response contentType isn't good : " + client.getResponseContentType(), "application/pdf;charset=UTF-8", client.getResponseContentType());

    String pdfText = "";
    try {
        final BodyContentHandler textCollector = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata pdfMetadata = new Metadata();
        final PDFParser pdfParser = new PDFParser();
        pdfParser.parse(pdfStream, textCollector, pdfMetadata, new ParseContext());
        pdfText = textCollector.toString();
    } finally {
        pdfStream.close();
    }
    //Test if a ftl macro error is present
    final boolean hasMacroError = pdfText.contains("FreeMarker template error:");
    assertFalse("Fop Screen contains Macro on error : see " + fopScreenUrl + " for more detail", hasMacroError);
}
 
Example #28
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Reports {@code SUPPORTED_TYPES} when {@code hasTesseract} accepts the effective OCR
 * configuration, and an empty set otherwise so that other image parsers can be selected.
 *
 * @param context parse context consulted for a {@code TesseractOCRConfig}; falls back to
 *                {@code DEFAULT_CONFIG}
 * @return the supported media types, possibly empty
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    final TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    final boolean tesseractAvailable = hasTesseract(ocrConfig);

    // Advertise nothing when Tesseract is unavailable, so the other image parsers
    //  can be selected instead
    return tesseractAvailable ? SUPPORTED_TYPES : Collections.<MediaType>emptySet();
}
 
Example #29
Source File: PDFPreprocessorParser.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Reports {@code SUPPORTED_TYPES} when {@code hasImageMagick} accepts the effective
 * ImageMagick configuration, and an empty set otherwise so that other parsers can be
 * selected.
 *
 * @param context parse context consulted for an {@code ImageMagickConfig}; falls back to
 *                {@code DEFAULT_IMAGEMAGICK_CONFIG}
 * @return the supported media types, possibly empty
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    final ImageMagickConfig effectiveConfig =
            context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);

    if (!hasImageMagick(effectiveConfig)) {
        // Don't advertise anything, so the other parsers
        //  can be selected instead
        return Collections.emptySet();
    }

    // ImageMagick is installed: offer our supported types
    return SUPPORTED_TYPES;
}
 
Example #30
Source File: EmbeddedDocumentMemoryExtractor.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the root document with a {@code DigestEmbeddedDocumentExtractor} registered on
 * the parse context and returns the document that extractor reports afterwards.
 *
 * <p>The file stream opened on the root document is closed when parsing finishes,
 * whether it succeeds or fails.
 *
 * @param rootDocument           document whose file is parsed
 * @param embeddedDocumentDigest digest passed to the embedded-document extractor
 * @return the result of {@code extractor.getDocument()} after parsing
 * @throws SAXException  propagated from the parser
 * @throws TikaException propagated from the parser
 * @throws IOException   if the file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    // The stream is opened here, so it is closed here as well.
    try (FileInputStream rootStream = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(rootStream, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}