org.apache.tika.sax.XHTMLContentHandler Java Examples

The following examples show how to use org.apache.tika.sax.XHTMLContentHandler. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the contents of the given stream as UTF-8 text and writes it to the
 * given XHTML content handler, wrapped in a single {@code div} element
 * between {@code startDocument()} and {@code endDocument()}.
 * <p>
 * The stream is always closed by this method, including when the handler
 * rejects one of the SAX events or when reading fails.
 *
 * @param stream
 *          stream containing the OCR result
 * @param xhtml
 *          XHTML content handler that receives the text
 * @throws SAXException
 *           if the XHTML SAX events could not be handled
 * @throws IOException
 *           if an input error occurred
 */
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {

    // Take ownership of the stream before emitting any SAX event, so that a
    // failure in startDocument()/startElement() cannot leave it open.
    try (Reader reader = new InputStreamReader(stream, UTF_8)) {
        xhtml.startDocument();
        xhtml.startElement("div");

        char[] buffer = new char[1024];
        int n;
        while ((n = reader.read(buffer)) != -1) {
            // Skip empty reads; only forward actual characters.
            if (n > 0) {
                xhtml.characters(buffer, 0, n);
            }
        }
    }
    // As before, the closing events are emitted after the reader is closed.
    xhtml.endElement("div");
    xhtml.endDocument();
}
 
Example #2
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the given stream inline. When no {@code outputPath} is set the call
 * is passed straight to the superclass; otherwise it goes through
 * {@code cachedParse} with a fresh {@link Metadata} instance and the inline
 * flag set to {@code true}.
 *
 * @param in the stream to parse
 * @param xhtml the handler receiving the output
 * @param context the parse context; consulted for a {@link TesseractOCRConfig}
 *                when {@code config} is {@code null}
 * @param config the OCR configuration, or {@code null} to look it up in the
 *               context (falling back to {@code DEFAULT_CONFIG})
 */
@Override
public void parseInline(final InputStream in, final XHTMLContentHandler xhtml, final ParseContext context,
                        final TesseractOCRConfig config)
		throws IOException, SAXException, TikaException {
	// No output path: nothing to cache, defer entirely to the parent.
	if (null == outputPath) {
		super.parseInline(in, xhtml, context, config);
		return;
	}

	final Metadata metadata = new Metadata();

	// An explicitly supplied config wins over whatever the context holds.
	final TesseractOCRConfig effectiveConfig;
	if (null != config) {
		effectiveConfig = config;
	} else {
		effectiveConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
	}

	cachedParse(in, xhtml, metadata, context, effectiveConfig, true);
}
 
Example #3
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Runs the superclass parser with a {@link TeeContentHandler} built from the
 * caller's handler and a {@link WriteOutContentHandler} over the given writer.
 *
 * @param tis the input to parse
 * @param handler the caller's content handler
 * @param metadata the metadata passed on to the parser
 * @param context the parse context
 * @param config the OCR configuration (used by the inline path only)
 * @param inline {@code true} to call {@code super.parseInline}, {@code false}
 *               to call {@code super.parse}
 * @param writer the writer wrapped by the {@link WriteOutContentHandler}
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                          final ParseContext context, final TesseractOCRConfig config, final boolean inline,
                          final Writer writer) throws SAXException, IOException, TikaException {
	final ContentHandler cacheSink = new WriteOutContentHandler(writer);
	final ContentHandler tee = new TeeContentHandler(handler, cacheSink);

	if (!inline) {
		super.parse(tis, tee, metadata, context);
		return;
	}

	// The inline entry point expects an XHTML handler, so wrap the tee.
	final XHTMLContentHandler xhtml = new XHTMLContentHandler(tee, metadata);
	super.parseInline(tis, xhtml, context, config);
}
 
Example #4
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Reads from the given reader until end of input, passing each non-empty
 * chunk to {@code xhtml.characters}. The reader is not closed here.
 *
 * @param reader the source of characters
 * @param xhtml the handler receiving the characters
 * @throws IOException if reading fails
 * @throws SAXException if the handler rejects the characters
 */
private void readFully(final Reader reader, final XHTMLContentHandler xhtml) throws IOException, SAXException {
	final char[] chunk = new char[1024];
	int count;

	while ((count = reader.read(chunk)) != -1) {
		// Nothing was read this round; try again.
		if (count <= 0) {
			continue;
		}
		xhtml.characters(chunk, 0, count);
	}
}
 
Example #5
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 4 votes vote down vote up
/**
 * Runs OCR on the given stream and writes the recognised text to the handler,
 * then passes the same input to {@code _TMP_IMAGE_METADATA_PARSER}.
 * <p>
 * Returns without doing anything when {@code hasTesseract(config)} is false.
 * OCR is only attempted when the input length lies within the configured
 * minimum/maximum file size; the metadata parser is invoked regardless of
 * that size check.
 *
 * @param stream the input to OCR
 * @param handler the handler receiving the output
 * @param metadata the metadata passed to the XHTML handler and onward parser
 * @param context the parse context; consulted for a {@link TesseractOCRConfig},
 *                falling back to {@code DEFAULT_CONFIG}
 * @throws IOException if reading the input or the OCR output fails
 * @throws SAXException if the handler rejects an event
 * @throws TikaException if parsing fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // If Tesseract is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    //  occur if someone directly calls this parser, not via DefaultParser or similar
    if (!hasTesseract(config)) {
        return;
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

    TemporaryResources tmp = new TemporaryResources();
    File output = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        long size = tikaStream.getLength();

        if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {

            File outputBase = tmp.createTemporaryFile();

            // Tesseract appends .txt to output file name. Record that path
            // before running OCR so the finally block removes the file even
            // when doOCR fails part-way through.
            output = new File(outputBase.getAbsolutePath() + ".txt");

            doOCR(input, outputBase, config);

            if (output.exists()) {
                // Own the file handle here so it is released even if
                // extractOutput throws before it starts reading.
                try (InputStream ocrText = new FileInputStream(output)) {
                    extractOutput(ocrText, xhtml);
                }
            }

        }

        // Temporary workaround for TIKA-1445 - until we can specify
        //  composite parsers with strategies (eg Composite, Try In Turn),
        //  always send the image onwards to the regular parser to have
        //  the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        // NOTE(review): the base temporary file is expected to be removed by
        // tmp.dispose(); only the .txt sibling is deleted explicitly here.
        tmp.dispose();
        if (output != null) {
            output.delete();
        }
    }
}