// Java source code of DataURIEncodingInputStream

package org.icij.extract.io;

import java.util.HashMap;
import java.util.Map;

import java.io.Reader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.IOException;

import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.exception.TikaException;

import org.apache.commons.codec.binary.Base64InputStream;

/**
 * An {@link InputStream} that encodes arbitrary binary data from an input stream in data URI format.
 *
 * This object is thread-safe.
 *
 * @since 1.0.0-beta
 */
public class DataURIEncodingInputStream extends InputStream {

	private static MediaType detectType(final Path path, final Metadata metadata) throws IOException {
		MediaType type = null;

		// There seems to be some confusion in Tika about which key to use.
		String orig = metadata.get(Metadata.CONTENT_TYPE);
		if (null == orig || orig.isEmpty()) {
			orig = metadata.get(TikaCoreProperties.TYPE);
		}

		if (null != orig) {
			type = MediaType.parse(orig);
		}

		if (null == type) {
			try (
				final InputStream input = new BufferedInputStream(Files.newInputStream(path))
			) {
				type = new DefaultDetector().detect(input, metadata);
			}
		}

		// If the type is text, detect the charset if it's missing from
		// the media type and add it to it as a param.
		if (type.getType().equals("text")) {
			final Map<String, String> parameters = type.getParameters();

			if (null != parameters.get("charset")) {
				return type;
			}

			final Charset charset = detectCharset(path, metadata);

			if (null != charset) {
				parameters.put("charset", charset.name());
				type = new MediaType(type.getBaseType(), parameters);
			}
		}

		return type;
	}

	private static Charset detectCharset(final Path path, final Metadata metadata) throws IOException {
		final Charset charset;

		// Try to parse the character set from the content-encoding.
		String orig = metadata.get(Metadata.CONTENT_ENCODING);

		// Try to detect the character set.
		if (null != orig && Charset.isSupported(orig)) {
			return Charset.forName(orig);
		}

		try (
			final InputStream input = new BufferedInputStream(Files.newInputStream(path));
			final AutoDetectReader detector = new AutoDetectReader(input, metadata)
		) {
			charset = detector.getCharset();
		} catch (TikaException e) {
			throw new IOException("Unable to detect charset.", e);
		}

		return charset;
	}

	public static Reader createReader(final Path path, final Metadata metadata) throws IOException {
		return new InputStreamReader(new DataURIEncodingInputStream(path, metadata), StandardCharsets.US_ASCII);
	}

	private final InputStream encoder;
	private final byte[] prepend;
	private int position = 0;

	public DataURIEncodingInputStream(final Path path, final Metadata metadata) throws IOException {
		this(new BufferedInputStream(Files.newInputStream(path)), detectType(path, metadata));
	}

	public DataURIEncodingInputStream(final InputStream in, final MediaType type) {

		// Only text documents should be URL-encoded. It doesn't matter if the encoding is supported or not because
		// the URL-io works on raw bytes. Everything else must be base-64-encoded.
		if (type.getType().equals("text")) {
			this.prepend = ("data:" + type + ",").getBytes(StandardCharsets.US_ASCII);
			this.encoder = new URLEncodingInputStream(in);
		} else {
			this.prepend = ("data:" + type + ";base64,").getBytes(StandardCharsets.US_ASCII);
			this.encoder = new Base64InputStream(in, true, -1, null);
		}
	}

	@Override
	public int read() throws IOException {
		if (position < prepend.length) {
			return prepend[position++];
		} else {
			return encoder.read();
		}
	}

	@Override
	public int available() throws IOException {
		if (position < prepend.length) {
			return prepend.length - position;
		} else {
			return encoder.available();
		}
	}

	@Override
	public void close() throws IOException {
		encoder.close();
	}
}