Java Code Examples for org.apache.tika.io.TikaInputStream#close()

The following examples show how to use org.apache.tika.io.TikaInputStream#close(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to reach the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the stream-based {@code parse} overload on an in-memory image.
 *
 * <p>The image is rendered into an RGB {@link BufferedImage}, written as a PNG to a
 * temporary file, and that file is then handed to
 * {@code parse(tis, handler, metadata, context)}.
 *
 * @param image    the image to process; its width and height are read with a
 *                 {@code null} observer
 * @param handler  content handler passed through to the stream-based overload
 * @param metadata metadata passed through to the stream-based overload
 * @param context  parse context passed through to the stream-based overload
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  propagated from the stream-based overload
 * @throws TikaException propagated from the stream-based overload or from
 *                       disposing the temporary resources
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the caller's pixels into the buffer; without this step the PNG
        // written below would only contain the buffer's initial (blank) content.
        java.awt.Graphics graphics = bImage.getGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();

        // Close the writer before the file is re-opened for reading so that
        // everything is flushed to disk.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }

        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Dispose last: both streams on the temporary file are already closed here.
        tmp.dispose();
    }

}
 
Example 2
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the media type of an in-memory document.
 *
 * @param data     raw document bytes
 * @param fileName name recorded under {@code TikaMetadataKeys.RESOURCE_NAME_KEY}
 *                 in the metadata passed to the detector
 * @return the string form of whatever the detector reports
 * @throws Throwable anything raised while opening the stream or detecting
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	final Metadata meta = new Metadata();
	meta.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);

	TikaInputStream in = null;
	try {
		in = TikaInputStream.get(data, meta);
		final Object detected = detector.detect(in, meta);
		return detected.toString();
	} finally {
		// Closing is best-effort: a failure here must not mask the result
		// or the original exception.
		try {
			if (in != null) {
				in.close();
			}
		} catch (IOException ignored) {
		}
	}
}
 
Example 3
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the media type of a file.
 *
 * @param file the file to inspect; its name is recorded under
 *             {@code TikaMetadataKeys.RESOURCE_NAME_KEY} in the metadata
 *             passed to the detector
 * @return the string form of whatever the detector reports
 * @throws Throwable anything raised while opening the stream or detecting
 */
public static String getMimeType(File file) throws Throwable {
	final Metadata meta = new Metadata();
	meta.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());

	TikaInputStream in = null;
	try {
		in = TikaInputStream.get(file, meta);
		final Object detected = detector.detect(in, meta);
		return detected.toString();
	} finally {
		// Closing is best-effort: a failure here must not mask the result
		// or the original exception.
		try {
			if (in != null) {
				in.close();
			}
		} catch (IOException ignored) {
		}
	}
}
 
Example 4
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Parses the resource at {@code uri} and returns the collected metadata as JSON.
 *
 * <p>Each metadata name maps to the array of all values recorded for it.
 *
 * @param uri         location of the resource, as understood by {@code createInputStream}
 * @param contentType content-type hint forwarded to {@code fillMetadata}
 * @return a JSON object mapping every metadata name to its values
 * @throws Exception if the stream cannot be opened or parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	// try-with-resources guarantees the stream is closed even when parse() throws.
	try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
		parser.parse(inputStream, new DefaultHandler(), metadata);
	}

	final Map<String, String[]> meta = new HashMap<String, String[]>();
	for (String name : metadata.names()) {
		meta.put(name, metadata.getValues(name));
	}

	return new Gson().toJson(meta);
}
 
Example 5
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Detects the content type of the resource at {@code uri}.
 *
 * @param uri location of the resource, as understood by {@code createInputStream}
 * @return the detected content type, or {@code MediaType.OCTET_STREAM} as a string
 *         when the detected value is null or empty
 * @throws FileNotFoundException declared for callers; raised by the helpers used here
 * @throws IOException           if reading the resource fails
 * @throws TikaException         declared for callers; raised by the helpers used here
 */
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final String contentType;

	// try-with-resources guarantees the stream is closed even when detection throws.
	try (TikaInputStream inputStream = createInputStream(uri)) {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	return contentType;
}
 
Example 6
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Detects the content type and character set of the resource at {@code uri}.
 *
 * @param uri location of the resource, as understood by {@code createInputStream}
 * @return {@code "<contentType>; charset=<charset>"} when both are known, the bare
 *         content type when the charset is null or empty, or
 *         {@code MediaType.OCTET_STREAM} as a string when the content type is null
 *         or empty
 * @throws FileNotFoundException declared for callers; raised by the helpers used here
 * @throws IOException           if reading the resource fails
 * @throws TikaException         declared for callers; raised by the helpers used here
 */
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final String contentType;
	final String charset;

	// try-with-resources guarantees the stream is closed even when detection,
	// metadata filling or reader construction throws.
	try (TikaInputStream inputStream = createInputStream(uri)) {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();

		// Use metadata to provide type-hinting to the AutoDetectReader.
		fillMetadata(metadata, contentType, uri);

		// Detect the character set. The reader is built on inputStream and is not
		// closed separately; the enclosing try block closes the stream.
		final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata);
		charset = reader.getCharset().toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	// Append the charset if the content-type was determined.
	if (charset != null && !charset.isEmpty()) {
		return contentType + "; charset=" + charset;
	}

	return contentType;
}
 
Example 7
Source File: ExtractMediaMetadata.java    From nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Parses {@code sourceStream} and returns the extracted metadata as a flat string map.
 *
 * <p>Keys that do not fully match the pattern held in {@code metadataKeyFilterRef}
 * (when one is set) are skipped. Multi-valued entries are joined with {@code ", "};
 * joining stops and {@code "..."} is appended once adding the next value would reach
 * {@code maxAttribLen}. Single-valued entries are copied as-is. Every value is trimmed.
 *
 * @param sourceStream stream to parse; it is wrapped in a {@code TikaInputStream}
 *                     that is closed once parsing finishes or fails
 * @param prefix       prepended to every result key, or {@code null} for no prefix
 * @param maxAttribs   maximum number of entries to return, or {@code null} for no limit
 * @param maxAttribLen length bound applied while joining multi-valued entries
 * @return map from (optionally prefixed) metadata key to its value string
 * @throws IOException   if reading the stream fails
 * @throws TikaException if parsing fails
 * @throws SAXException  if parsing fails
 */
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                                       Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream);
    try {
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    } finally {
        tikaInputStream.close();
    }

    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();
    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                // Separate from whatever is already buffered. The check is "> 0" so
                // that a one-character first value is still followed by a separator.
                if (dataBuilder.length() > 0) {
                    dataBuilder.append(", ");
                }
                if (dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }
        if (prefix == null) {
            results.put(key, dataBuilder.toString().trim());
        } else {
            results.put(prefix + key, dataBuilder.toString().trim());
        }

        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}
 
Example 8
Source File: NodeTika.java    From node-tika with MIT License 4 votes vote down vote up
/**
 * Extracts the body text of the resource at {@code uri}.
 *
 * <p>Recognised entries in {@code options} (all optional, read via {@code toString()}):
 * <ul>
 *   <li>{@code "outputEncoding"} - encoding of the returned text; {@code "UTF-8"} when absent</li>
 *   <li>{@code "contentType"} - content-type hint forwarded to {@code fillMetadata}</li>
 *   <li>{@code "maxLength"} - write limit handed to the content handler; {@code -1} when absent</li>
 * </ul>
 * The whole map is also forwarded to {@code fillParseContext}.
 *
 * @param uri     location of the resource, as understood by {@code createInputStream}
 * @param options extraction options, may be {@code null}
 * @return the text written by the content handler, decoded with the output encoding
 * @throws Exception any parse failure other than the handler's write limit being reached
 */
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		final Object encodingOption = options.get("outputEncoding");
		if (encodingOption != null) {
			outputEncoding = encodingOption.toString();
		}

		final Object typeOption = options.get("contentType");
		if (typeOption != null) {
			contentType = typeOption.toString();
		}

		final Object lengthOption = options.get("maxLength");
		if (lengthOption != null) {
			// Parsed as a float and then truncated to int.
			final float requested = Float.parseFloat(lengthOption.toString());
			maxLength = (int) requested;
		}
	}

	// Fall back to UTF-8 when no encoding was supplied.
	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
	final OutputStreamWriter writer = new OutputStreamWriter(buffer, outputEncoding);
	final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		final BodyContentHandler bodyHandler = new BodyContentHandler(contentHandler);
		parser.parse(inputStream, bodyHandler, metadata, context);
	} catch (Throwable e) {
		if (contentHandler.isWriteLimitReached(e)) {
			// Hitting the write limit is not an error: close the writer and
			// return what was collected so far.
			writer.close();
		} else {
			throw e;
		}
	} finally {
		inputStream.close();
	}

	return buffer.toString(outputEncoding);
}
 
Example 9
Source File: NodeTika.java    From node-tika with MIT License 4 votes vote down vote up
/**
 * Detects the character set of the resource at {@code uri}.
 *
 * @param uri         location of the resource, as understood by {@code createInputStream}
 * @param contentType content-type hint forwarded to {@code fillMetadata}
 * @return the string form of the charset reported by the {@code AutoDetectReader}
 * @throws FileNotFoundException declared for callers; raised by the helpers used here
 * @throws IOException           if reading the resource fails
 * @throws TikaException         if the reader cannot be created
 */
public static String detectCharset(String uri, String contentType) throws FileNotFoundException, IOException, TikaException {
	final Metadata metadata = new Metadata();

	// Use metadata to provide type-hinting to the AutoDetectReader.
	fillMetadata(metadata, contentType, uri);

	// try-with-resources guarantees the stream is closed even when reader
	// construction or charset lookup throws.
	try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
		// Detect the character set. The reader is built on inputStream and is not
		// closed separately; the enclosing try block closes the stream.
		final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata);
		return reader.getCharset().toString();
	}
}