org.apache.tika.io.TikaInputStream Java Examples

The following examples show how to use org.apache.tika.io.TikaInputStream. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TikaTest.java    From tika-server with Apache License 2.0 7 votes vote down vote up
/**
 * Captures the full content of a resource into {@code bytes} and then tries to
 * rewind the stream to where it started.
 *
 * @param filename  name of the resource (not used by this implementation)
 * @param mediaType media type of the resource (not used by this implementation)
 * @param stream    content to capture; wrapped in a {@link TikaInputStream} when it
 *                  does not support mark/reset itself
 */
@Override
public void handle(String filename, MediaType mediaType,
                   InputStream stream) {
    if (!stream.markSupported()) {
        stream = TikaInputStream.get(stream);
    }
    // The whole stream is consumed before reset() is called, so request the largest
    // possible read limit. A limit of 0 allows the mark to be invalidated as soon as
    // anything is read, which makes the later reset() fail.
    stream.mark(Integer.MAX_VALUE);
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    try {
        IOUtils.copy(stream, os);
        bytes.add(os.toByteArray());
        stream.reset();
    } catch (IOException e) {
        // Deliberately best-effort: a failed copy or reset is ignored.
    }
}
 
Example #2
Source File: UnpackBuilder.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Passes a single archive entry to the embedded extractor.
 *
 * @return {@code false} when the archive cannot read the entry's data, otherwise
 *         whatever {@code extractor.parseEmbedded} returns
 */
private boolean parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedExtractor extractor, Record record) {
  String entryName = entry.getName();
  if (!archive.canReadEntryData(entry)) {
    return false;
  }

  Record entryData = new Record(); // TODO: or pass myself?

  // Detectors need an InputStream that supports mark/reset; ArchiveInputStream
  // does not, so it is wrapped. Only the temporary resources are disposed here,
  // the archive stream itself is left open.
  TemporaryResources tempResources = new TemporaryResources();
  try {
    TikaInputStream wrapped = TikaInputStream.get(archive, tempResources);
    return extractor.parseEmbedded(wrapped, entryData, entryName, getChild());
  } finally {
    try {
      tempResources.dispose();
    } catch (TikaException e) {
      LOG.warn("Cannot dispose of tmp Tika resources", e);
    }
  }
}
 
Example #3
Source File: DefaultMimeSupport.java    From nexus-public with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Detects the MIME types of the given content.
 *
 * @param input    content to inspect; must not be null. The wrapping
 *                 {@link TikaInputStream} is closed before this method returns.
 * @param fileName optional file name, passed to the detector as a hint
 * @return the types collected by {@code unravel}, or a single
 *         {@code MimeTypes.OCTET_STREAM} entry when nothing was collected
 * @throws IOException if detection or closing the stream fails
 */
@Nonnull
@Override
public List<String> detectMimeTypes(final InputStream input, @Nullable final String fileName) throws IOException {
  checkNotNull(input);

  final Metadata hints = new Metadata();
  if (fileName != null) {
    hints.set(Metadata.RESOURCE_NAME_KEY, fileName);
  }

  final MediaType detectedType;
  try (final TikaInputStream tis = TikaInputStream.get(input)) {
    detectedType = detector.detect(tis, hints);
  }

  // unravel to least specific
  final List<String> result = Lists.newArrayList();
  unravel(result, detectedType);

  if (result.isEmpty()) {
    result.add(MimeTypes.OCTET_STREAM);
  }
  return result;
}
 
Example #4
Source File: EmbedSpawner.java    From extract with MIT License 6 votes vote down vote up
/**
 * Handles an embedded resource: inline embeds are parsed into the parent handler,
 * everything else is spawned as its own document.
 *
 * @param input      content of the embedded resource
 * @param handler    handler of the parent document
 * @param metadata   metadata of the embedded resource
 * @param outputHtml whether start/end markup is written around an inline embed
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {
	final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
	final boolean inline = inlineType.equals(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

	if (!inline) {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
		return;
	}

	// Inline embeds, like images in PDFs, don't need to be spawned. They are
	// concatenated to the main document as usual.
	final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, embedHandler, metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}
 
Example #5
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the MIME type of a file, using the file name as a hint.
 *
 * @param file the file to inspect
 * @return the detected media type as a string
 * @throws Throwable anything raised while opening the file or detecting its type
 */
public static String getMimeType(File file) throws Throwable {
	Metadata metadata = new Metadata();
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
	TikaInputStream tikaStream = TikaInputStream.get(file, metadata);
	try {
		return detector.detect(tikaStream, metadata).toString();
	} finally {
		try {
			tikaStream.close();
		} catch (IOException e) {
			// A failure to close is deliberately ignored.
		}
	}
}
 
Example #6
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the MIME type of in-memory data, using the file name as a hint.
 *
 * @param data     the bytes to inspect
 * @param fileName name recorded in the metadata as the resource name
 * @return the detected media type as a string
 * @throws Throwable anything raised while wrapping the data or detecting its type
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	Metadata metadata = new Metadata();
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
	TikaInputStream tikaStream = TikaInputStream.get(data, metadata);
	try {
		return detector.detect(tikaStream, metadata).toString();
	} finally {
		try {
			tikaStream.close();
		} catch (IOException e) {
			// A failure to close is deliberately ignored.
		}
	}
}
 
Example #7
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Detects the content type of the resource at the given URI.
 *
 * @param uri location of the resource; its last path segment is used as a file-name hint
 * @return the detected content type, or {@code MediaType.OCTET_STREAM} when the
 *         detector yields an empty type
 * @throws IOException if the resource cannot be opened, read or closed
 */
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final String contentType;

	// try-with-resources so the stream is released even when detection throws.
	try (TikaInputStream inputStream = createInputStream(uri)) {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	return contentType;
}
 
Example #8
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the stream-based {@code parse} on an AWT image by first writing it to a
 * temporary PNG file.
 *
 * @param image    the image to parse
 * @param handler  receives the parse output
 * @param metadata metadata passed through to the stream-based parse
 * @param context  parse context passed through to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws TikaException if parsing or disposing of the temporary resources fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the source pixels into the buffered image; without this the PNG
        // written below would be blank.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the output stream before the file is read back.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Dispose last, once every stream on the temporary file has been closed.
        tmp.dispose();
    }

}
 
Example #9
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Parses the resource at the given URI and returns its metadata as JSON.
 *
 * @param uri         location of the resource
 * @param contentType content type passed to {@code fillMetadata}
 * @return a JSON object mapping each metadata name to the array of its values
 * @throws Exception if the resource cannot be opened, parsed or closed
 */
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	// try-with-resources so the stream is released even when parsing throws.
	try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
		parser.parse(inputStream, new DefaultHandler(), metadata);
	}

	final Map<String, String[]> meta = new HashMap<>();
	for (String name : metadata.names()) {
		meta.put(name, metadata.getValues(name));
	}

	return new Gson().toJson(meta);
}
 
Example #10
Source File: StoredFile.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Sets {@code name}, {@code ext} and {@code mime} from the given file name,
 * extension and content.
 *
 * @param inName file name; when {@code inExt} is empty, the part before the last
 *               dot becomes the name
 * @param inExt  extension; lower-cased when given, otherwise taken from {@code getFileExt(inName)}
 * @param is     content for type detection, may be null
 */
private void init(String inName, String inExt, InputStream is) {
	if (!Strings.isEmpty(inExt)) {
		name = inName;
		ext = inExt.toLowerCase(Locale.ROOT);
	} else {
		final int dot = inName.lastIndexOf('.');
		if (dot < 0) {
			name = inName;
		} else {
			name = inName.substring(0, dot);
		}
		ext = getFileExt(inName);
	}

	final Metadata hints = new Metadata();
	hints.add(RESOURCE_NAME_KEY, String.format(FILE_NAME_FMT, name, ext));
	try {
		mime = tika.getDetector().detect(is == null ? null : TikaInputStream.get(is), hints);
	} catch (Throwable e) {
		// Any failure leaves the mime type unset and is only logged.
		mime = null;
		log.error("Unexpected exception while detecting mime type", e);
	}
}
 
Example #11
Source File: XlsDetector.java    From data-prep with Apache License 2.0 6 votes vote down vote up
/**
 * Checks whether the given stream has an XLS format.
 *
 * A detector must not close the stream it is given; closing it is the caller's
 * responsibility. Detectors should rely on the mark/reset support of
 * {@link TikaInputStream} so that the stream keeps returning the same bytes.
 *
 * @param metadata the TIKA {@link Metadata} passed to the parent detector
 * @param inputStream the stream to inspect
 * @return an XLS format, or null when the media type is not an XLS one
 * @throws IOException if the stream cannot be read
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {

    MediaType mediaType = super.detect(inputStream, metadata);

    // Fall back to the zip container detector when the parent detector has no answer.
    final boolean unknown =
            mediaType == null || StringUtils.equals(mediaType.toString(), FormatUtils.UNKNOWN_MEDIA_TYPE);
    if (unknown) {
        mediaType = zipContainerDetector.detect(inputStream, new Metadata());
    }

    if (mediaType == null) {
        return null;
    }

    final String typeName = mediaType.toString();
    final boolean isXls = StringUtils.startsWith(typeName, XlsFormatFamily.MEDIA_TYPE)
            || StringUtils.equals(typeName, OLD_XLS_MEDIA_TYPE);

    return isXls ? new Format(xlsFormatFamily, FormatUtils.DEFAULT_ENCODING) : null;
}
 
Example #12
Source File: CSVDetector.java    From data-prep with Apache License 2.0 6 votes vote down vote up
/**
 * Checks whether the given stream has a CSV format.
 *
 * A detector must not close the stream it is given; closing it is the caller's
 * responsibility. Detectors should rely on the mark/reset support of
 * {@link TikaInputStream} so that the stream keeps returning the same bytes.
 *
 * @param metadata the TIKA {@link Metadata} used for the first detection attempt
 * @param inputStream the stream to inspect
 * @return the format found by {@code detectText}, or null when neither attempt finds one
 * @throws IOException if the stream cannot be read or reset
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {

    final Format direct = detectText(metadata, inputStream);
    if (direct != null) {
        return direct;
    }

    // Second attempt: read up to META_TAG_BUFFER_SIZE bytes from the start of the
    // stream, rewind, and run the detection on that head only.
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    final byte[] buffer = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    int filled = 0;

    int count = inputStream.read(buffer);
    while (count != -1 && filled < buffer.length) {
        filled += count;
        count = inputStream.read(buffer, filled, buffer.length - filled);
    }

    inputStream.reset();
    final String head = FormatUtils.readFromBuffer(buffer, 0, filled);

    try (InputStream stream = TikaInputStream.get(IOUtils.toInputStream(head))) {
        return detectText(new Metadata(), stream);
    }
}
 
Example #13
Source File: DirectoryManifest.java    From genie with Apache License 2.0 6 votes vote down vote up
/**
 * Determines the MIME type to report for a file.
 *
 * @param name file name; "stdout", "stderr" and "run" are always reported as plain text
 * @param path location of the file, inspected by the detector for all other names
 * @return the MIME type, or {@code MediaType.OCTET_STREAM} when the file cannot be read
 */
private String getMimeType(final String name, final Path path) {
    // TODO: Move configuration of special handling cases to external configuration for flexibility
    //       probably a map of filename -> type or extension -> type or produced mime-type -> desired mime-type
    final boolean alwaysPlainText;
    switch (name) {
        case "stdout":
        case "stderr":
        case "run":
            alwaysPlainText = true;
            break;
        default:
            alwaysPlainText = false;
            break;
    }

    if (alwaysPlainText) {
        return MediaType.TEXT_PLAIN.toString();
    }

    try (TikaInputStream inputStream = TikaInputStream.get(path)) {
        final MediaType detected = this.tikaConfig.getDetector().detect(inputStream, this.metadata);
        return detected.toString();
    } catch (final IOException ioe) {
        log.error("Unable to detect mime type for {} due to error", path, ioe);
        return MediaType.OCTET_STREAM.toString();
    }
}
 
Example #14
Source File: CompositeFormatDetector.java    From data-prep with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the format of the given stream by asking each configured detector in turn.
 *
 * This method does not close the stream; closing it is the caller's responsibility.
 *
 * @param metadata the TIKA {@link Metadata} handed to every detector
 * @param inputStream the stream to inspect
 * @return the first non-null format a detector reports, or a format of the
 *         unsupported family with the default encoding when none reports one
 * @throws IOException if a detector fails to read the stream
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {

    for (Detector candidate : detectors) {
        final Format detected = candidate.detect(metadata, inputStream);
        if (detected != null) {
            return detected;
        }
    }
    return new Format(unsupportedFormatFamily, FormatUtils.DEFAULT_ENCODING);
}
 
Example #15
Source File: HtmlDetector.java    From data-prep with Apache License 2.0 5 votes vote down vote up
/**
 * Checks whether the given stream has an HTML format.
 *
 * A detector must not close the stream it is given; closing it is the caller's
 * responsibility. Detectors should rely on the mark/reset support of
 * {@link TikaInputStream} so that the stream keeps returning the same bytes.
 *
 * @param metadata the TIKA {@link Metadata} passed to the HTML encoding detector
 * @param inputStream the stream to inspect, may be null
 * @return an HTML format carrying the detected charset name, or null when the
 *         stream is null or no charset is detected
 * @throws IOException if the stream cannot be read or reset
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    if (inputStream == null) {
        return null;
    }

    // Read up to META_TAG_BUFFER_SIZE bytes from the start of the stream, then rewind.
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);
    final byte[] buffer = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    int filled = 0;

    int count = inputStream.read(buffer);
    while (count != -1 && filled < buffer.length) {
        filled += count;
        count = inputStream.read(buffer, filled, buffer.length - filled);
    }

    inputStream.reset();
    final String head = FormatUtils.readFromBuffer(buffer, 0, filled);

    try (InputStream stream = TikaInputStream.get(IOUtils.toInputStream(head))) {
        final Charset charset = htmlEncodingDetector.detect(stream, metadata);
        return charset == null ? null : new Format(htmlFormatFamily, charset.name());
    }
}
 
Example #16
Source File: EmbedSpawner.java    From extract with MIT License 5 votes vote down vote up
/**
 * Writes an embedded document to the output directory, using its hash as the file name.
 *
 * @param tis   stream of the embedded document
 * @param embed the embedded document, providing the hash and the metadata
 * @param name  name of the embedded document, used for logging only
 * @throws IOException if the temporary file or the destination cannot be written
 */
private void writeEmbed(final TikaInputStream tis, final EmbeddedTikaDocument embed, final String name) throws IOException {
	final Path destination = outputPath.resolve(embed.getHash());
	final Metadata metadata = embed.getMetadata();
	final Object container = tis.getOpenContainer();

	// If the input is a container, write it to a temporary file so that it can then be copied atomically.
	// This happens with, for example, an Outlook Message that is an attachment of another Outlook Message.
	if (container instanceof DirectoryEntry) {
		try (final TemporaryResources tmp = new TemporaryResources();
		     final POIFSFileSystem fs = new POIFSFileSystem()) {
			final Path source = tmp.createTempFile();
			saveEntries((DirectoryEntry) container, fs.getRoot());

			try (final OutputStream output = Files.newOutputStream(source)) {
				fs.writeFilesystem(output);
			}

			// Copy while the temporary resources are still open: closing them
			// removes the temporary file.
			copyEmbedToDestination(source, destination, metadata, name);
		}
	} else {
		copyEmbedToDestination(tis.getPath(), destination, metadata, name);
	}
}

/**
 * Records the content length if it is missing and copies the source file to the destination.
 */
private void copyEmbedToDestination(final Path source, final Path destination, final Metadata metadata,
                                    final String name) throws IOException {

	// Set the content-length as it isn't (always?) set by Tika for embeds.
	if (null == metadata.get(Metadata.CONTENT_LENGTH)) {
		metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(source)));
	}

	// To prevent massive duplication and because the disk is only a storage for underlying data, save using the
	// straight hash as a filename.
	try {
		Files.copy(source, destination);
	} catch (final FileAlreadyExistsException e) {
		// An existing file of the same size is kept; a different size is overwritten.
		if (Files.size(source) != Files.size(destination)) {
			Files.copy(source, destination, StandardCopyOption.REPLACE_EXISTING);
		} else {
			logger.info("Temporary file for document \"{}\" in \"{}\" already exists.", name, root);
		}
	}
}
 
Example #17
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Parses a text file into a string.
 *
 * @param file the file to parse
 * @return the result of the stream-based {@code parseTXTToString}
 * @throws IOException   if the file cannot be opened, read or closed
 * @throws TikaException if parsing fails
 */
private String parseTXTToString(File file) throws IOException, TikaException {
	Metadata metadata = new Metadata();
	// try-with-resources keeps a parse failure as the primary exception instead of
	// letting a failure in close() replace it.
	try (InputStream stream = TikaInputStream.get(file.toPath(), metadata)) {
		return parseTXTToString(stream, metadata);
	}
}
 
Example #18
Source File: DirectoryScanner.java    From importer-exporter with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the media type of a stream without any metadata hints.
 *
 * @param stream the content to inspect; it is wrapped but not closed here
 * @return the detected type, or {@code MediaType.EMPTY} when reading fails
 */
private MediaType getMediaType(InputStream stream) {
    final Metadata noHints = new Metadata();
    try {
        final TikaInputStream wrapped = TikaInputStream.get(stream);
        return tikaConfig.getDetector().detect(wrapped, noHints);
    } catch (IOException e) {
        return MediaType.EMPTY;
    }
}
 
Example #19
Source File: DirectoryScanner.java    From importer-exporter with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the media type of a file, passing its path as the resource-name hint.
 *
 * @param file the file to inspect
 * @return the detected type, or {@code MediaType.EMPTY} when the file cannot be read
 */
private MediaType getMediaType(Path file) {
    try (InputStream content = TikaInputStream.get(file)) {
        final Metadata hints = new Metadata();
        final String resourceName = file.toString();
        hints.set(Metadata.RESOURCE_NAME_KEY, resourceName);

        final MediaType detected = tikaConfig.getDetector().detect(content, hints);
        return detected;
    } catch (IOException e) {
        return MediaType.EMPTY;
    }
}
 
Example #20
Source File: EmbedLinker.java    From extract with MIT License 5 votes vote down vote up
/**
 * Saves an embedded document to a temporary file and registers it with the parent document.
 *
 * @param name     name of the embedded document, used for logging only
 * @param input    content of the embedded document; closed here after it has been copied
 *                 (it is left open when its open container is written instead)
 * @param metadata metadata of the embedded document
 * @return the embed registered on the parent
 * @throws IOException if the temporary file cannot be created or written
 */
private EmbeddedTikaDocument saveEmbedded(final String name, final InputStream input, final Metadata metadata) throws
		IOException {
	final Path path = tmp.createTemporaryFile().toPath();

	// The embed is registered on the parent under the temporary path, which serves as the key for
	// looking it up later.
	final EmbeddedTikaDocument embed = parent.addEmbed(path.toString(), new PathIdentifier(), path, metadata);

	final Object container = (input instanceof TikaInputStream)
			? ((TikaInputStream) input).getOpenContainer()
			: null;

	// An open POI directory container is written out as a new file system instead of copying the stream.
	if (container instanceof DirectoryEntry) {
		final POIFSFileSystem fs = new POIFSFileSystem();

		saveEntries((DirectoryEntry) container, fs.getRoot());

		try (final OutputStream output = Files.newOutputStream(path)) {
			fs.writeFilesystem(output);
		}

		return embed;
	}

	final long copied;

	try {
		copied = Files.copy(input, path, StandardCopyOption.REPLACE_EXISTING);
	} finally {
		input.close();
	}

	if (copied <= 0) {
		logger.warn("No bytes copied for embedded document \"{}\" in \"{}\". "
				+ "This could indicate a downstream error.", name, parent);
	} else {
		logger.info("Copied {} bytes from embedded document \"{}\" in \"{}\" to file.",
				copied, name, parent);
	}

	return embed;
}
 
Example #21
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the input through an on-disk cache keyed by the SHA-256 hash of the input file.
 *
 * When the lock for the hash cannot be acquired, the input is parsed without caching.
 * Otherwise a cached result is replayed if one exists; if not, the input is parsed
 * into a temporary file that is moved into place once parsing has completed.
 *
 * @param tis      the input; its backing file is read to compute the hash
 * @param handler  receives the parse output
 * @param metadata metadata of the input
 * @param context  parse context
 * @param config   OCR configuration, also passed to the lock acquisition
 * @param inline   passed through to the underlying parse calls
 */
private void cachedParse(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata,
                        final ParseContext context, final TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException, InterruptedException {
	final String hash;

	try (final InputStream content = Files.newInputStream(tis.getPath())) {
		hash = DigestUtils.sha256Hex(content);
	}

	final Path cachePath = outputPath.resolve(hash);
	final Path cacheLock = outputPath.resolve(hash + ".lock");

	// Acquire a lock both for reading and for writing.
	// If the lock can't be acquired, parse without caching.
	if (!acquireLock(config, cacheLock)) {
		fallbackParse(tis, handler, metadata, context, config, inline);
		return;
	}

	// You won't know for sure until you try....
	try (final Reader reader = Files.newBufferedReader(cachePath, UTF_8)) {
		cacheHit();
		readFromCache(reader, handler, metadata);
	} catch (final NoSuchFileException e) {
		final Path cacheTemp = outputPath.resolve(hash + ".tmp");

		// Write to a temporary file and only move to the final path if parsing completes successfully.
		// This way we ensure that we don't cache partial results from Tesseract if there's an error.
		// TRUNCATE_EXISTING matters when an earlier failed run left a temporary file behind:
		// with CREATE alone the old content past the end of the new output would survive.
		try (final Writer writer = Files.newBufferedWriter(cacheTemp, UTF_8, StandardOpenOption.CREATE,
				StandardOpenOption.TRUNCATE_EXISTING)) {
			cacheMiss();
			parseToCache(tis, handler, metadata, context, config, inline, writer);
		}

		Files.move(cacheTemp, cachePath, StandardCopyOption.ATOMIC_MOVE);
	} finally {
		// The lock is released on every path once it has been acquired.
		Files.deleteIfExists(cacheLock);
	}
}
 
Example #22
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Wraps the input in a {@link TikaInputStream} and runs the cached parse on it.
 *
 * The wrapping stream is closed before this method returns.
 *
 * @throws TikaException if parsing fails or the thread is interrupted while parsing
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Restore the interrupt status so callers further up can still see that
		// the thread was interrupted.
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}
 
Example #23
Source File: FallbackParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Delegates to the error parser for non-empty input and to the empty parser otherwise.
 *
 * The size is taken from the content-length metadata when present; otherwise it is
 * measured from the stream and recorded in the metadata.
 *
 * @param stream   the input
 * @param handler  receives the parse output
 * @param metadata metadata of the input; content length and, for empty input, content type may be set
 * @param context  parse context
 */
@Override
public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context) throws SAXException, IOException, TikaException {
	final String declaredLength = metadata.get(Metadata.CONTENT_LENGTH);
	final long size;

	if (null == declaredLength || declaredLength.isEmpty()) {
		// NOTE(review): closing the wrapper here presumably also closes the caller's stream
		// before it is handed to the delegate parser below - confirm this is intended.
		try (final TikaInputStream tis = TikaInputStream.get(stream)) {
			size = tis.getLength();
		}

		metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
	} else {
		size = Long.parseLong(declaredLength);
	}

	// An empty file yields an empty document; anything else is reported as a parse error.
	final Parser parser;
	if (size <= 0) {
		metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
		parser = EmptyParser.INSTANCE;
	} else {
		parser = ErrorParser.INSTANCE;
	}

	parser.parse(stream, handler, metadata, context);
}
 
Example #24
Source File: EmbeddedExtractor.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Passes an embedded stream to the child command as the attachment body of a copy
 * of the given record.
 *
 * @param stream the embedded content; shielded so that closing the wrapper does not close it
 * @param record the record to copy; the copy receives the new attachment fields
 * @param name   attachment name, set on the copy only when it is non-empty
 * @param child  the command that processes the copied record
 * @return the result of {@code child.process}
 */
public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) {
  // Use the delegate parser to parse this entry
  TemporaryResources tmp = new TemporaryResources();
  try {
    final TikaInputStream shielded = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);

    // Carry over an already opened container from the incoming stream.
    if (stream instanceof TikaInputStream) {
      final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
      if (openContainer != null) {
        shielded.setOpenContainer(openContainer);
      }
    }

    final Record attachment = record.copy();
    attachment.replaceValues(Fields.ATTACHMENT_BODY, shielded);
    attachment.removeAll(Fields.ATTACHMENT_MIME_TYPE);
    attachment.removeAll(Fields.ATTACHMENT_CHARSET);
    attachment.removeAll(Fields.ATTACHMENT_NAME);

    final boolean hasName = name != null && name.length() > 0;
    if (hasName) {
      attachment.put(Fields.ATTACHMENT_NAME, name);
    }

    // NOTE: runtime exceptions from the child are not caught here. A former catch block
    // that rethrew them as MorphlineRuntimeException is disabled.
    return child.process(attachment);
  } finally {
    Closeables.closeQuietly(tmp);
  }
}
 
Example #25
Source File: TikaFilePlace.java    From emissary with Apache License 2.0 5 votes vote down vote up
/**
 * Use the Tika mime type (magic) detector to identify the file type
 *
 * @param d the IBaseDataObject payload to evaluate
 * @return mediaType
 * @throws Exception if detection fails or the stream cannot be closed
 */
private MediaType detectType(IBaseDataObject d) throws Exception {
    Metadata metadata = new Metadata();
    // try-with-resources so the stream is closed on every path.
    try (InputStream input = TikaInputStream.get(d.data(), metadata)) {
        appendFilenameMimeTypeSupport(d, metadata);
        MediaType mediaType = mimeTypes.detect(input, metadata);
        logger.debug("Tika type: " + mediaType.toString());
        return mediaType;
    }
}
 
Example #26
Source File: TikaPoweredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * There seems to be some sort of issue with some downstream
 *  3rd party libraries, and input streams that come from
 *  a {@link ContentReader}. This happens most often with
 *  JPEG and Tiff files.
 * For these cases, buffer out to a local file if not
 *  already there
 *
 * @param reader the content to open
 * @return a stream over the content; file backed for file readers and for
 *         JPEG and TIFF content
 * @throws IOException if the content cannot be opened or buffered to a file
 */
protected InputStream getInputStream(ContentReader reader) throws IOException
{
   // Prefer the File if available, it's generally quicker
   if(reader instanceof FileContentReader) 
   {
      return TikaInputStream.get( ((FileContentReader)reader).getFile() );
   }
   
   // Grab the InputStream for the Content
   InputStream input = reader.getContentInputStream();
   
   // Images currently always require a file
   if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(reader.getMimetype()) ||
      MimetypeMap.MIMETYPE_IMAGE_TIFF.equals(reader.getMimetype())) 
   {
      TemporaryResources tmp = new TemporaryResources();
      TikaInputStream stream = TikaInputStream.get(input, tmp);
      try
      {
         stream.getFile(); // Have it turned into File backed
      }
      catch(IOException e)
      {
         // The caller never receives the stream in this case, so release it
         //  here rather than leaving it open
         try
         {
            stream.close();
         }
         catch(IOException ignored)
         {
            // The original failure is the one worth reporting
         }
         throw e;
      }
      return stream;
   }
   else
   {
      // The regular Content InputStream should be fine
      return input; 
   }
}
 
Example #27
Source File: MediaTypeValidator.java    From iaf with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the media type of a stream.
 *
 * Temporary files are created under {@code pdfOutputlocation}. The wrapping
 * {@link TikaInputStream} is closed before this method returns.
 *
 * @param inputStream the content to inspect
 * @param filename name passed to the detection call together with the content
 * @return the parsed media type
 * @throws IOException if the content cannot be read or the stream cannot be closed
 */
public MediaType getMediaType(InputStream inputStream, String filename) throws IOException {
	// Create every time as TemporaryResources is not thread-safe
	final TemporaryResources tempResources = new TemporaryResources();
	tempResources.setTemporaryFileDirectory(Paths.get(pdfOutputlocation));

	try (TikaInputStream tis = TikaInputStream.get(inputStream, tempResources)) {
		return MediaType.parse(tika.detect(tis, filename));
	}
}
 
Example #28
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Detects the content type and character set of the resource at the given URI.
 *
 * @param uri location of the resource; its last path segment is used as a file-name hint
 * @return {@code MediaType.OCTET_STREAM} when the detected type is empty, the type
 *         followed by {@code "; charset=..."} when a charset was found, otherwise the type alone
 * @throws IOException if the resource cannot be opened, read or closed
 */
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final String contentType;
	final String charset;

	// try-with-resources so the stream is released even when detection throws.
	try (TikaInputStream inputStream = createInputStream(uri)) {
		final Metadata metadata = new Metadata();

		// Set the file name. This provides some level of type-hinting.
		metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

		// Detect the content type.
		contentType = detector.detect(inputStream, metadata).toString();

		// Use metadata to provide type-hinting to the AutoDetectReader.
		fillMetadata(metadata, contentType, uri);

		// Detect the character set. The reader only wraps the stream above, so it is
		// released together with it.
		final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata);
		charset = reader.getCharset().toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	// Append the charset if the content-type was determined.
	if (charset != null && !charset.isEmpty()) {
		return contentType + "; charset=" + charset;
	}

	return contentType;
}
 
Example #29
Source File: MimetypeMap.java    From alfresco-data-model with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Wraps the given stream, when there is one, and delegates to the
 * {@link TikaInputStream} based detection.
 *
 * @param filename name passed on to the delegate
 * @param input    content to inspect, may be null
 * @return the result of the delegate call
 */
private MediaType detectType(String filename, InputStream input)
{
    final TikaInputStream wrapped = (input == null) ? null : TikaInputStream.get(input);
    return detectType(filename, wrapped);
}
 
Example #30
Source File: TransportAmazonLambdaS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Uploads the data to the bucket under the key resolved from the path, with MD5,
 * length and a detected content type set on the object metadata.
 *
 * @param path path of the file; its normalized file name is the content-detection hint
 * @param data content to upload
 * @throws IOException if content detection or closing the stream fails
 */
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    final ObjectMetadata objectMetadata = new ObjectMetadata();
    objectMetadata.setContentMD5(Md5Utils.md5AsBase64(data));
    objectMetadata.setContentLength(data.length);

    // Give Tika a few hints for the content detection
    final String fileName = FilenameUtils.getName(FilenameUtils.normalize(path));
    final Metadata hints = new Metadata();
    hints.set(Metadata.RESOURCE_NAME_KEY, fileName);

    // The same stream is used for detection first and for the upload afterwards.
    try (InputStream content = TikaInputStream.get(data, hints)) {
        final String contentType = TIKA_DETECTOR.detect(content, hints).toString();
        objectMetadata.setContentType(contentType);
        s3.putObject(bucket, resolveKey(path), content, objectMetadata);
    }
}