/* * Copyright (c) 2013 The Center for Public Integrity®. See LICENSE. */ package org.icij.nodetika; import java.util.Map; import java.util.HashMap; import java.util.Set; import java.io.IOException; import java.io.FileNotFoundException; import java.io.File; import java.io.InputStream; import java.io.FileInputStream; import java.io.ByteArrayOutputStream; import java.io.OutputStreamWriter; import java.net.URL; import java.net.URLConnection; import java.net.MalformedURLException; import java.lang.Exception; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.HttpHeaders; import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.mime.MediaType; import org.apache.tika.detect.Detector; import org.apache.tika.detect.AutoDetectReader; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.ContentHandler; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.OutputKeys; import javax.xml.transform.stream.StreamResult; import com.google.gson.Gson; public class NodeTika { private static final TikaConfig config = TikaConfig.getDefaultConfig(); private static TikaInputStream createInputStream(String uri) throws FileNotFoundException, MalformedURLException, IOException { return createInputStream(uri, null); } private static TikaInputStream createInputStream(String uri, Metadata metadata) throws FileNotFoundException, MalformedURLException, IOException { InputStream inputStream; if (uri.startsWith("http://") || uri.startsWith("https://") || uri.startsWith("ftp://")) { final URLConnection urlConnection = new URL(uri).openConnection(); // If a metadata object was passed, fill it with the content-type returned from the server. if (metadata != null) { fillMetadata(metadata, urlConnection.getContentType()); } inputStream = urlConnection.getInputStream(); } else { inputStream = new FileInputStream(uri); } return TikaInputStream.get(inputStream); } private static AutoDetectParser createParser() { final AutoDetectParser parser = new AutoDetectParser(config); parser.setFallback(new Parser() { public Set<MediaType> getSupportedTypes(ParseContext parseContext) { return parser.getSupportedTypes(parseContext); } public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws TikaException { throw new TikaException("Unsupported Media Type: " + metadata.get(HttpHeaders.CONTENT_TYPE)); } }); return parser; } private static void fillMetadata(AutoDetectParser parser, Metadata metadata, String contentType, String uri) { fillMetadata(metadata, contentType, uri); final Detector detector = parser.getDetector(); parser.setDetector(new Detector() { public MediaType detect(InputStream inputStream, Metadata metadata) throws IOException { String contentType = metadata.get(HttpHeaders.CONTENT_TYPE); if (contentType != null) { return MediaType.parse(contentType); } else { return detector.detect(inputStream, metadata); } } }); } private static void fillMetadata(Metadata metadata, String contentType) { fillMetadata(metadata, contentType, null); } private static void fillMetadata(Metadata metadata, String contentType, String uri) { // Set the file name. if (uri != null) { metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); } // Normalise the content-type. contentType = normalizeContentType(contentType); // Set the content-type. if (contentType != null) { metadata.add(HttpHeaders.CONTENT_TYPE, contentType); } } private static String normalizeContentType(String contentType) { if (contentType == null) { return null; } // URLConnection returns content/unknown as the default content-type. if (contentType.equals("content/unknown")) { return null; } if (contentType.equals(MediaType.OCTET_STREAM)) { return null; } if ("xml".equals(MediaType.parse(contentType).getSubtype())) { return null; } return contentType; } private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) { final TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); if (options == null) { // Disable OCR and return if no options are specified. disableOcr(ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig); return; } fillOcrOptions(ocrConfig, options); parseContext.set(TesseractOCRConfig.class, ocrConfig); final PDFParserConfig pdfParserConfig = new PDFParserConfig(); fillPdfOptions(pdfParserConfig, options); parseContext.set(PDFParserConfig.class, pdfParserConfig); // Allow a password to be specified for encrypted files. fillPassword(parseContext, options); } private static void fillPassword(ParseContext parseContext, Map<String, Object> options) { final Object password = options.get("password"); if (password == null) { return; } parseContext.set(PasswordProvider.class, new PasswordProvider() { @Override public String getPassword(Metadata metadata) { return password.toString(); } }); } private static void fillPdfOptions(PDFParserConfig pdfParserConfig, Map<String, Object> options) { final Object averageCharTolerance = options.get("pdfAverageCharTolerance"); final Object enableAutoSpace = options.get("pdfEnableAutoSpace"); final Object extractAcroFormContent = options.get("pdfExtractAcroFormContent"); final Object extractAnnotationText = options.get("pdfExtractAnnotationText"); final Object extractInlineImages = options.get("pdfExtractInlineImages"); final Object extractUniqueInlineImagesOnly = options.get("pdfExtractUniqueInlineImagesOnly"); final Object sortByPosition = options.get("pdfSortByPosition"); final Object spacingTolerance = options.get("pdfSpacingTolerance"); final Object suppressDuplicateOverlappingText = options.get("pdfSuppressDuplicateOverlappingText"); if (averageCharTolerance != null) { pdfParserConfig.setAverageCharTolerance(Float.parseFloat(averageCharTolerance.toString())); } if (enableAutoSpace != null) { pdfParserConfig.setEnableAutoSpace((Boolean) enableAutoSpace); } if (extractAcroFormContent != null) { pdfParserConfig.setExtractAcroFormContent((Boolean) extractAcroFormContent); } if (extractAnnotationText != null) { pdfParserConfig.setExtractAnnotationText((Boolean) extractAnnotationText); } if (extractInlineImages != null) { pdfParserConfig.setExtractInlineImages((Boolean) extractInlineImages); } else { pdfParserConfig.setExtractInlineImages(true); } if (extractUniqueInlineImagesOnly != null) { pdfParserConfig.setExtractUniqueInlineImagesOnly((Boolean) extractUniqueInlineImagesOnly); } if (sortByPosition != null) { pdfParserConfig.setSortByPosition((Boolean) sortByPosition); } if (spacingTolerance != null) { pdfParserConfig.setSpacingTolerance(Float.parseFloat(spacingTolerance.toString())); } if (suppressDuplicateOverlappingText != null) { pdfParserConfig.setSuppressDuplicateOverlappingText((Boolean) suppressDuplicateOverlappingText); } } private static void fillOcrOptions(TesseractOCRConfig ocrConfig, Map<String, Object> options) { // Only set the OCR config object on the context if the language is specified. // OCR is disabled by default as it can give unexpected results. final Object ocrLanguage = options.get("ocrLanguage"); if (ocrLanguage == null) { disableOcr(ocrConfig); return; } ocrConfig.setLanguage(ocrLanguage.toString()); final Object ocrPath = options.get("ocrPath"); final Object ocrMaxFileSize = options.get("ocrMaxFileSize"); final Object ocrMinFileSize = options.get("ocrMinFileSize"); final Object ocrPageSegmentationMode = options.get("ocrPageSegmentationMode"); final Object ocrTimeout = options.get("ocrTimeout"); if (ocrPath != null) { ocrConfig.setTesseractPath(ocrPath.toString()); } if (ocrMaxFileSize != null) { ocrConfig.setMaxFileSizeToOcr(Integer.parseInt(ocrMaxFileSize.toString())); } if (ocrMinFileSize != null) { ocrConfig.setMinFileSizeToOcr(Integer.parseInt(ocrMinFileSize.toString())); } if (ocrPageSegmentationMode != null) { ocrConfig.setPageSegMode(ocrPageSegmentationMode.toString()); } if (ocrTimeout != null) { ocrConfig.setTimeout(Integer.parseInt(ocrTimeout.toString())); } } private static void disableOcr(TesseractOCRConfig ocrConfig) { // This is necessary until Tika introduces a way to blacklist parsers. // See https://issues.apache.org/jira/browse/TIKA-1557 if (System.getProperty("os.name").startsWith("Windows")) { ocrConfig.setTesseractPath("\\Device\\Null\\"); } else { ocrConfig.setTesseractPath("/dev/null/"); } } public static String extractText(String uri, String optionsJson) throws Exception { Map<String, Object> options = null; if (optionsJson != null) { options = new Gson().fromJson(optionsJson, HashMap.class); } return extractText(uri, options); } public static String extractText(String uri, Map<String, Object> options) throws Exception { final AutoDetectParser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); String outputEncoding = null; String contentType = null; int maxLength = -1; if (options != null) { Object option; option = options.get("outputEncoding"); if (option != null) { outputEncoding = option.toString(); } option = options.get("contentType"); if (option != null) { contentType = option.toString(); } option = options.get("maxLength"); if (option != null) { maxLength = (int)Float.parseFloat(option.toString()); } } if (outputEncoding == null) { outputEncoding = "UTF-8"; } fillMetadata(parser, metadata, contentType, uri); fillParseContext(context, options); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding); final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength); final TikaInputStream inputStream = createInputStream(uri, metadata); // Set up recursive parsing of archives. // See: http://wiki.apache.org/tika/RecursiveMetadata context.set(Parser.class, parser); context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context)); try { parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context); } catch (Throwable e) { if (!contentHandler.isWriteLimitReached(e)) { throw e; } else { writer.close(); } } finally { inputStream.close(); } return outputStream.toString(outputEncoding); } public static String extractXml(String uri, String outputFormat, String optionsJson) throws Exception { Map<String, Object> options = null; if (optionsJson != null) { options = new Gson().fromJson(optionsJson, HashMap.class); } return extractXml(uri, outputFormat, options); } public static String extractXml(String uri, String outputFormat, Map<String, Object> options) throws Exception { final AutoDetectParser parser = createParser(); final Metadata metadata = new Metadata(); final ParseContext context = new ParseContext(); String outputEncoding = null; String contentType = null; if (options != null) { Object option; option = options.get("outputEncoding"); if (option != null) { outputEncoding = option.toString(); } option = options.get("contentType"); if (option != null) { contentType = option.toString(); } } if (outputEncoding == null) { outputEncoding = "UTF-8"; } fillMetadata(parser, metadata, contentType, uri); fillParseContext(context, options); final ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding); ContentHandler content; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, outputFormat); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, outputEncoding); handler.setResult(new StreamResult(writer)); content = new ExpandedTitleContentHandler(handler); final TikaInputStream inputStream = createInputStream(uri, metadata); // Set up recursive parsing of archives. // See: http://wiki.apache.org/tika/RecursiveMetadata context.set(Parser.class, parser); try { parser.parse(inputStream, content, metadata, context); } catch (Throwable e) { throw e; } finally { inputStream.close(); } return outputStream.toString(outputEncoding); } public static String extractMeta(String uri) throws Exception { return extractMeta(uri, null); } public static String extractMeta(String uri, String contentType) throws Exception { final AutoDetectParser parser = createParser(); final Metadata metadata = new Metadata(); fillMetadata(parser, metadata, contentType, uri); final TikaInputStream inputStream = createInputStream(uri, metadata); parser.parse(inputStream, new DefaultHandler(), metadata); Map meta = new HashMap(); for (String name : metadata.names()) { String[] values = metadata.getValues(name); meta.put(name, values); } inputStream.close(); return new Gson().toJson(meta); } public static String detectCharset(String uri) throws FileNotFoundException, IOException, TikaException { return detectCharset(uri, null); } public static String detectCharset(String uri, String contentType) throws FileNotFoundException, IOException, TikaException { final Metadata metadata = new Metadata(); // Use metadata to provide type-hinting to the AutoDetectReader. fillMetadata(metadata, contentType, uri); final TikaInputStream inputStream = createInputStream(uri, metadata); // Detect the character set. final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata); String charset = reader.getCharset().toString(); inputStream.close(); return charset; } public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } return contentType; } public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); // Use metadata to provide type-hinting to the AutoDetectReader. fillMetadata(metadata, contentType, uri); // Detect the character set. final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata); String charset = reader.getCharset().toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } // Append the charset if the content-type was determined. if (charset != null && !charset.isEmpty()) { return contentType + "; charset=" + charset; } return contentType; } public static String detectLanguage(String text) { LanguageIdentifier identifier = new LanguageIdentifier(text); Map language = new HashMap(); language.put("language", identifier.getLanguage()); language.put("reasonablyCertain", identifier.isReasonablyCertain()); return new Gson().toJson(language); } }