Java Code Examples for org.apache.tika.io.TikaInputStream#get()

The following examples show how to use org.apache.tika.io.TikaInputStream#get(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TikaTest.java    From tika-server with Apache License 2.0 7 votes vote down vote up
/**
 * Copies the complete content of the given stream into a byte array, appends that array to
 * {@code bytes}, and then resets the stream to the position it had on entry.
 *
 * @param filename  name of the resource (not used here)
 * @param mediaType media type of the resource (not used here)
 * @param stream    content to capture; wrapped in a {@link TikaInputStream} when it does not
 *                  support mark/reset itself
 */
@Override
public void handle(String filename, MediaType mediaType,
                   InputStream stream) {
    // Only wrap when the incoming stream cannot be marked on its own.
    final InputStream source = stream.markSupported() ? stream : TikaInputStream.get(stream);
    final ByteArrayOutputStream captured = new ByteArrayOutputStream();

    source.mark(0);
    try {
        IOUtils.copy(source, captured);
        bytes.add(captured.toByteArray());
        source.reset();
    } catch (IOException ignored) {
        // Deliberately best-effort: a failed copy or reset is not reported to the caller.
    }
}
 
Example 2
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Detects the MIME type of an in-memory document.
 *
 * @param data     raw document bytes
 * @param fileName resource name handed to the detector as a hint
 * @return the detected media type rendered as a string
 * @throws Throwable whatever stream creation or detection throws, unchanged
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	final Metadata hints = new Metadata();
	hints.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);

	final TikaInputStream content = TikaInputStream.get(data, hints);
	try {
		return detector.detect(content, hints).toString();
	} finally {
		try {
			content.close();
		} catch (IOException ignored) {
			// Closing is best-effort; a close failure must not mask the detection outcome.
		}
	}
}
 
Example 3
Source File: DirectoryManifest.java    From genie with Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the MIME type to report for a file.
 *
 * <p>The names {@code stdout}, {@code stderr} and {@code run} are always reported as plain
 * text; every other file is handed to the configured Tika detector. If detection fails with an
 * {@link IOException} the error is logged and the generic octet-stream type is returned.
 *
 * @param name the file name used for the special cases
 * @param path the location of the file whose content is inspected
 * @return the MIME type as a string
 */
private String getMimeType(final String name, final Path path) {
    // TODO: Move configuration of special handling cases to external configuration for flexibility
    //       probably a map of filename -> type or extension -> type or produced mime-type -> desired mime-type
    switch (name) {
        case "stdout":
        case "stderr":
        case "run":
            return MediaType.TEXT_PLAIN.toString();
        default:
            break;
    }

    try (TikaInputStream content = TikaInputStream.get(path)) {
        final Detector detector = this.tikaConfig.getDetector();
        return detector.detect(content, this.metadata).toString();
    } catch (final IOException ioe) {
        log.error("Unable to detect mime type for {} due to error", path, ioe);
        return MediaType.OCTET_STREAM.toString();
    }
}
 
Example 4
Source File: UnpackBuilder.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Parses a single archive entry through the embedded extractor.
 *
 * @param archive   the archive stream, positioned at the entry
 * @param entry     the entry to parse
 * @param extractor the extractor that receives the entry content
 * @param record    the enclosing record (not used here; a fresh record is passed on instead)
 * @return {@code false} when the archive cannot read the entry's data, otherwise the result of
 *         the extractor
 */
private boolean parseEntry(ArchiveInputStream archive, ArchiveEntry entry, EmbeddedExtractor extractor, Record record) {
  final String entryName = entry.getName();
  if (!archive.canReadEntryData(entry)) {
    return false;
  }

  final Record entryRecord = new Record(); // TODO: or pass myself?

  // Detectors need a stream that supports mark/reset, which ArchiveInputStream
  // does not, hence the TikaInputStream wrapper.
  final TemporaryResources tempResources = new TemporaryResources();
  try {
    final TikaInputStream markable = TikaInputStream.get(archive, tempResources);
    return extractor.parseEmbedded(markable, entryRecord, entryName, getChild());
  } finally {
    try {
      tempResources.dispose();
    } catch (TikaException e) {
      LOG.warn("Cannot dispose of tmp Tika resources", e);
    }
  }
}
 
Example 5
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the stream-based {@code parse} on an in-memory image by first writing it to a
 * temporary PNG file.
 *
 * <p>The source image is painted onto an RGB {@link BufferedImage} before encoding; without
 * that step the PNG would contain only the blank canvas. The PNG output stream is closed
 * before the file is read back, and the input stream is closed before the temporary
 * resources are disposed, so nothing is left open if disposal fails.
 *
 * @param image    the image to parse; assumed to be fully loaded, since its dimensions are
 *                 read with a {@code null} observer
 * @param handler  receives the parse output
 * @param metadata metadata passed to the stream-based parse
 * @param context  parse context passed to the stream-based parse
 * @throws IOException   if writing or reading the temporary file fails
 * @throws SAXException  if the handler fails
 * @throws TikaException if parsing or disposing the temporary resources fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
        SAXException, TikaException {

    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the actual pixels onto the canvas; a freshly created BufferedImage is empty.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        // Close the writer before reading the file back so the PNG is complete on disk.
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }

        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Streams are already closed here, so the temporary file can be removed.
        tmp.dispose();
    }

}
 
Example 6
Source File: EmbedSpawner.java    From extract with MIT License 6 votes vote down vote up
/**
 * Handles an embedded resource found while parsing a document.
 *
 * <p>Resources whose {@code EMBEDDED_RESOURCE_TYPE} metadata marks them as inline are parsed
 * straight into the given handler; everything else is spawned from a {@link TikaInputStream}
 * wrapped around the input.
 *
 * @param input      the embedded resource content
 * @param handler    the handler of the containing document
 * @param metadata   metadata of the embedded resource
 * @param outputHtml whether start/end markup is written around inline content
 * @throws SAXException if a handler fails
 * @throws IOException  if the input cannot be read
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {
	final String inlineType = TikaCoreProperties.EmbeddedResourceType.INLINE.toString();
	final boolean inline = inlineType.equals(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));

	if (!inline) {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
		return;
	}

	// There's no need to spawn inline embeds, like images in PDFs. These are concatenated to
	// the main document as usual.
	final ContentHandler inlineHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, inlineHandler, metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}
 
Example 7
Source File: MimetypeMap.java    From alfresco-data-model with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Adapts a plain {@link InputStream} to the {@link TikaInputStream}-based overload.
 *
 * @param filename the file name forwarded to the overload
 * @param input    the content, or {@code null} when no content is available
 * @return the result of the {@link TikaInputStream}-based overload
 */
private MediaType detectType(String filename, InputStream input)
{
    // Keep the local typed as TikaInputStream so a null still selects the Tika overload.
    final TikaInputStream wrapped = (input == null) ? null : TikaInputStream.get(input);
    return detectType(filename, wrapped);
}
 
Example 8
Source File: TransportAmazonLambdaS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Uploads the given bytes to the bucket under the key resolved from {@code path}.
 *
 * <p>The object metadata carries the MD5 of the data, its length, and a content type detected
 * by Tika from the bytes plus the file name.
 *
 * @param path the file path; its normalized file name is used as a detection hint
 * @param data the content to upload
 * @throws IOException if detection or closing the stream fails
 */
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    final ObjectMetadata objectMetadata = new ObjectMetadata();
    objectMetadata.setContentMD5(Md5Utils.md5AsBase64(data));
    objectMetadata.setContentLength(data.length);

    // Give Tika a few hints for the content detection
    final String fileName = FilenameUtils.getName(FilenameUtils.normalize(path));
    final Metadata detectionHints = new Metadata();
    detectionHints.set(Metadata.RESOURCE_NAME_KEY, fileName);

    // The same stream is used for detection and then for the upload.
    try (InputStream body = TikaInputStream.get(data, detectionHints)) {
        final String contentType = TIKA_DETECTOR.detect(body, detectionHints).toString();
        objectMetadata.setContentType(contentType);
        s3.putObject(bucket, resolveKey(path), body, objectMetadata);
    }
}
 
Example 9
Source File: RepositoryS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Reads a repository object, computes its MD5, and uploads it to the bucket when
 * {@code isUploadFile} says so.
 *
 * @param iter iterator over the existing bucket objects, passed on to {@code isUploadFile}
 * @param file id of the repository object to read
 * @param path the object key, also used to derive the file name hint for type detection
 * @return {@code true} when the file was uploaded, {@code false} when it was skipped
 * @throws IOException if reading the object or detecting its type fails
 */
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException {
    LOG.debug("Start processing file: {}", path);

    final byte[] payload;
    final byte[] md5;
    try (DigestInputStream digesting =
                 new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) {
        // Reading everything through the digest stream yields content and hash in one pass.
        payload = IOUtils.toByteArray(digesting);
        md5 = digesting.getMessageDigest().digest();
    }

    if (!isUploadFile(iter, path, Hex.encodeHexString(md5))) {
        LOG.info("Skipping file (same checksum): {}", path);
        return false;
    }

    LOG.info("Uploading file: {}", path);
    final ObjectMetadata objectMetadata = new ObjectMetadata();
    objectMetadata.setContentMD5(Base64.encodeAsString(md5));
    objectMetadata.setContentLength(payload.length);

    // Give Tika a few hints for the content detection
    final Metadata detectionHints = new Metadata();
    detectionHints.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));

    try (InputStream body = TikaInputStream.get(payload, detectionHints)) {
        objectMetadata.setContentType(TIKA_DETECTOR.detect(body, detectionHints).toString());
        s3.putObject(bucket.getName(), path, body, objectMetadata);
        return true;
    }
}
 
Example 10
Source File: TikaPoweredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * There seems to be some sort of issue with some downstream
 *  3rd party libraries, and input streams that come from
 *  a {@link ContentReader}. This happens most often with
 *  JPEG and Tiff files.
 * For these cases, buffer out to a local file if not
 *  already there.
 * <p>
 * If spooling the content to a local file fails, the wrapping
 *  stream is closed before the exception is rethrown, so that
 *  the content stream and any temporary resources are not left
 *  open when no stream can be returned to the caller.
 *
 * @param reader the reader whose content is to be opened
 * @return a stream over the content; the caller is responsible for closing it
 * @throws IOException if the content cannot be opened or spooled to a file
 */
protected InputStream getInputStream(ContentReader reader) throws IOException
{
   // Prefer the File if available, it's generally quicker
   if(reader instanceof FileContentReader) 
   {
      return TikaInputStream.get( ((FileContentReader)reader).getFile() );
   }
   
   // Grab the InputStream for the Content
   InputStream input = reader.getContentInputStream();
   
   // Images currently always require a file
   if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(reader.getMimetype()) ||
      MimetypeMap.MIMETYPE_IMAGE_TIFF.equals(reader.getMimetype())) 
   {
      TemporaryResources tmp = new TemporaryResources();
      TikaInputStream stream = TikaInputStream.get(input, tmp);
      try
      {
         stream.getFile(); // Have it turned into File backed
      }
      catch (IOException | RuntimeException e)
      {
         // Nobody else will ever see this stream, so release it here
         try
         {
            stream.close();
         }
         catch (IOException closeFailure)
         {
            e.addSuppressed(closeFailure);
         }
         throw e;
      }
      return stream;
   }
   else
   {
      // The regular Content InputStream should be fine
      return input; 
   }
}
 
Example 11
Source File: HtmlDetector.java    From data-prep with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the head of an input stream and checks whether it looks like HTML.
 *
 * The general contract of a detector is to not close the specified stream before returning; closing it is the
 * responsibility of the caller. This detector uses the mark/reset feature of the specified
 * {@link TikaInputStream} so that the stream is back at its original position afterwards.
 *
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified input stream
 * @return either null or an HTML format
 * @throws IOException if the stream cannot be read or reset
 */
@Override
public Format detect(Metadata metadata, TikaInputStream inputStream) throws IOException {
    if (inputStream == null) {
        return null;
    }

    final byte[] head = new byte[FormatUtils.META_TAG_BUFFER_SIZE];
    inputStream.mark(FormatUtils.META_TAG_BUFFER_SIZE);

    // Keep reading until the buffer is full or the stream ends.
    int filled = 0;
    int lastRead = inputStream.read(head);
    while (lastRead != -1 && filled < head.length) {
        filled += lastRead;
        lastRead = inputStream.read(head, filled, head.length - filled);
    }
    inputStream.reset();

    final String headText = FormatUtils.readFromBuffer(head, 0, filled);
    try (InputStream headStream = TikaInputStream.get(IOUtils.toInputStream(headText))) {
        final Charset detected = htmlEncodingDetector.detect(headStream, metadata);
        if (detected != null) {
            return new Format(htmlFormatFamily, detected.name());
        }
    }
    return null;

}
 
Example 12
Source File: TransportAmazonLambdaS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Stores the given bytes in the bucket under the key resolved from {@code path}.
 *
 * <p>MD5, length and a Tika-detected content type are set on the object metadata before the
 * upload.
 *
 * @param path the file path; its normalized file name serves as a detection hint
 * @param data the content to store
 * @throws IOException if detection or closing the stream fails
 */
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    final ObjectMetadata uploadMetadata = new ObjectMetadata();
    uploadMetadata.setContentMD5(Md5Utils.md5AsBase64(data));
    uploadMetadata.setContentLength(data.length);

    // Give Tika a few hints for the content detection
    final String resourceName = FilenameUtils.getName(FilenameUtils.normalize(path));
    final Metadata hints = new Metadata();
    hints.set(Metadata.RESOURCE_NAME_KEY, resourceName);

    // Detection and upload share a single stream over the data.
    try (InputStream content = TikaInputStream.get(data, hints)) {
        final String detectedType = TIKA_DETECTOR.detect(content, hints).toString();
        uploadMetadata.setContentType(detectedType);
        s3.putObject(bucket, resolveKey(path), content, uploadMetadata);
    }
}
 
Example 13
Source File: MediaTypeValidator.java    From iaf with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the media type of the given content.
 *
 * @param inputStream the content to inspect; it is wrapped in a {@link TikaInputStream} that is
 *                    closed before this method returns
 * @param filename    the file name passed to the detector alongside the content
 * @return the detected type parsed into a {@link MediaType}
 * @throws IOException if the content cannot be read
 */
public MediaType getMediaType(InputStream inputStream, String filename) throws IOException {
	// A fresh instance per call, as TemporaryResources is not thread-safe
	final TemporaryResources tempResources = new TemporaryResources();
	tempResources.setTemporaryFileDirectory(Paths.get(pdfOutputlocation));

	try (TikaInputStream content = TikaInputStream.get(inputStream, tempResources)) {
		return MediaType.parse(tika.detect(content, filename));
	}
}
 
Example 14
Source File: TikaIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Parses one readable file with Tika and outputs a {@code ParseResult}.
 *
 * <p>Any exception thrown by the parser is captured in a failure result, together with
 * whatever text and metadata had been collected, rather than propagated.
 *
 * @param c the process context supplying the file and receiving the result
 * @throws Exception if the file cannot be opened or the stream cannot be closed
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  final ReadableFile readable = c.element();
  final InputStream raw = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(raw)) {
    final Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    // Start from the configured input metadata when there is any.
    Metadata tikaMetadata = spec.getInputMetadata();
    if (tikaMetadata == null) {
      tikaMetadata = new Metadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    final String location = readable.getMetadata().resourceId().toString();
    final ContentHandler textHandler = new ToTextContentHandler();
    ParseResult outcome;
    try {
      parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, e);
    }

    c.output(outcome);
  }
}
 
Example 15
Source File: EmbeddedExtractor.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Hands an embedded stream to the child command as the attachment body of a copy of the
 * given record.
 *
 * @param stream the embedded content; shielded so that closing the wrapper does not close it
 * @param record the record to copy; the original instance is not modified by this method
 * @param name   attachment name to set on the copy, ignored when null or empty
 * @param child  the command that processes the prepared record
 * @return the result of {@code child.process}
 */
public boolean parseEmbedded(InputStream stream, Record record, String name, Command child) {
    final TemporaryResources tempResources = new TemporaryResources();
    try {
      final TikaInputStream body = TikaInputStream.get(new CloseShieldInputStream(stream), tempResources);

      // Carry an already opened container over to the new wrapper.
      if (stream instanceof TikaInputStream) {
        final Object openContainer = ((TikaInputStream) stream).getOpenContainer();
        if (openContainer != null) {
          body.setOpenContainer(openContainer);
        }
      }

      final Record attachment = record.copy();
      attachment.replaceValues(Fields.ATTACHMENT_BODY, body);
      attachment.removeAll(Fields.ATTACHMENT_MIME_TYPE);
      attachment.removeAll(Fields.ATTACHMENT_CHARSET);
      attachment.removeAll(Fields.ATTACHMENT_NAME);

      final boolean hasName = name != null && name.length() > 0;
      if (hasName) {
        attachment.put(Fields.ATTACHMENT_NAME, name);
      }

      // Runtime exceptions from the child are intentionally left to propagate.
      return child.process(attachment);
    } finally {
      Closeables.closeQuietly(tempResources);
    }

  }
 
Example 16
Source File: TikaFilePlace.java    From emissary with Apache License 2.0 5 votes vote down vote up
/**
 * Use the Tika mime type (magic) detector to identify the file type.
 *
 * <p>The stream created over the payload data is closed before this method returns.
 *
 * @param d the IBaseDataObject payload to evaluate
 * @return mediaType
 * @throws Exception if detection or closing the stream fails
 */
private MediaType detectType(IBaseDataObject d) throws Exception {
    Metadata metadata = new Metadata();
    // try-with-resources so the stream is released on both the success and failure paths
    try (InputStream input = TikaInputStream.get(d.data(), metadata)) {
        appendFilenameMimeTypeSupport(d, metadata);
        MediaType mediaType = mimeTypes.detect(input, metadata);
        logger.debug("Tika type: " + mediaType.toString());
        return mediaType;
    }
}
 
Example 17
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Wraps the input in a {@link TikaInputStream} and delegates to the
 * {@link TikaInputStream}-based {@code cachedParse}.
 *
 * <p>An {@link InterruptedException} from the delegate is reported as a {@link TikaException}.
 * The thread's interrupt flag is restored first, so callers further up can still observe the
 * interruption.
 *
 * @param in       the content to parse
 * @param handler  receives the parse output
 * @param metadata metadata passed to the delegate
 * @param context  parse context passed to the delegate
 * @param config   OCR configuration passed to the delegate
 * @param inline   passed through to the delegate unchanged
 * @throws IOException   if the input cannot be read or closed
 * @throws SAXException  if the handler fails
 * @throws TikaException if parsing fails or the thread is interrupted
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Catching InterruptedException clears the flag; set it again before translating.
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}
 
Example 18
Source File: TesseractOCRParser.java    From CogStack-Pipeline with Apache License 2.0 4 votes vote down vote up
/**
 * Runs Tesseract OCR on the given stream and then forwards the stream to the image metadata
 * parser.
 *
 * <p>OCR is only attempted when Tesseract is available for the active configuration and the
 * content length lies within the configured minimum and maximum. The stream over the OCR text
 * file is closed by this method, and the text file is deleted even when disposing the
 * temporary resources fails.
 *
 * @param stream   the image content
 * @param handler  receives the extracted text and the image metadata parser output
 * @param metadata metadata of the document
 * @param context  parse context; may carry a {@code TesseractOCRConfig}
 * @throws IOException   if the content or the OCR output cannot be read
 * @throws SAXException  if the handler fails
 * @throws TikaException if OCR, parsing or temporary resource disposal fails
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);

    // If Tesseract is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    //  occur if someone directly calls this parser, not via DefaultParser or similar
    if (! hasTesseract(config))
        return;

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

    TemporaryResources tmp = new TemporaryResources();
    File output = null;
    try {
        TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
        File input = tikaStream.getFile();
        long size = tikaStream.getLength();

        if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {

            output = tmp.createTemporaryFile();
            doOCR(input, output, config);

            // Tesseract appends .txt to output file name
            output = new File(output.getAbsolutePath() + ".txt");

            if (output.exists()) {
                // Close the OCR text stream here regardless of what extractOutput does with it
                try (FileInputStream ocrText = new FileInputStream(output)) {
                    extractOutput(ocrText, xhtml);
                }
            }

        }

        // Temporary workaround for TIKA-1445 - until we can specify
        //  composite parsers with strategies (eg Composite, Try In Turn),
        //  always send the image onwards to the regular parser to have
        //  the metadata for them extracted as well
        _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
    } finally {
        try {
            tmp.dispose();
        } finally {
            // The .txt file is not tracked by tmp, so remove it even if dispose() throws
            if (output != null) {
                output.delete();
            }
        }
    }
}
 
Example 19
Source File: Extractor.java    From extract with MIT License 4 votes vote down vote up
/**
 * Creates a {@link TikaDocument} for the file at the given path and attaches a pull-parsing
 * reader over its content.
 *
 * <p>The parse context is configured from this extractor's settings (OCR, PDF configuration,
 * HTML mapping and embed handling) before the reader is constructed.
 *
 * @param path the file to extract from
 * @return the root document, with its reader set
 * @throws IOException if the file cannot be opened or the reader cannot be created
 */
public TikaDocument extract(final Path path) throws IOException {
	final TikaDocument rootDocument = documentFactory.create(path);
	final TikaInputStream source = TikaInputStream.get(path, rootDocument.getMetadata());
	final ParseContext context = new ParseContext();

	// The fallback parser outputs an empty tikaDocument for empty files and throws an
	// exception otherwise.
	final AutoDetectParser detectingParser = new AutoDetectParser(defaultParser);
	detectingParser.setFallback(FallbackParser.INSTANCE);

	// Wrap with a digesting parser only when a digester is configured.
	final Parser parser = (null != digester)
			? new DigestingParser(detectingParser, digester)
			: detectingParser;

	if (!ocrDisabled) {
		context.set(TesseractOCRConfig.class, ocrConfig);
	}
	context.set(PDFParserConfig.class, pdfConfig);

	// Only include "safe" tags in the HTML output from Tika's HTML parser.
	// This excludes script tags and objects.
	context.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);

	final Function<Writer, ContentHandler> handler;
	if (OutputFormat.HTML == outputFormat) {
		handler = (writer) -> new ExpandedTitleContentHandler(new HTML5Serializer(writer));
	} else {
		// The default BodyContentHandler is used when constructing the ParsingReader for text output, but
		// because only the body of embeds is pushed to the content handler further down the line, we can't
		// expect a body tag.
		handler = WriteOutContentHandler::new;
	}

	if (EmbedHandling.SPAWN == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedSpawner(rootDocument, context, embedOutput, handler));
	} else if (EmbedHandling.CONCATENATE == embedHandling) {
		context.set(Parser.class, parser);
		context.set(EmbeddedDocumentExtractor.class, new EmbedParser(rootDocument, context));
	} else {
		// Any other setting: embedded documents are given an empty parser and a blocking extractor.
		context.set(Parser.class, EmptyParser.INSTANCE);
		context.set(EmbeddedDocumentExtractor.class, new EmbedBlocker());
	}

	// the constructor of ParsingReader actually parses the document in background
	final Reader reader;
	if (OutputFormat.HTML == outputFormat) {
		reader = new ParsingReader(parser, source, rootDocument.getMetadata(), context, handler);
	} else {
		reader = new org.apache.tika.parser.ParsingReader(parser, source, rootDocument.getMetadata(), context);
	}
	rootDocument.setReader(reader);

	return rootDocument;
}
 
Example 20
Source File: ExtractMediaMetadata.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the source stream with Tika and returns the extracted metadata as a string map.
 *
 * <p>Keys that do not match the configured filter pattern, when one is set, are skipped.
 * Multi-valued entries are joined with {@code ", "}; once adding a value would reach
 * {@code maxAttribLen} the joined text is cut off with {@code "..."}. Single-valued entries
 * are copied as they are. All values are trimmed.
 *
 * <p>The {@link TikaInputStream} wrapped around the source is closed once parsing is done,
 * which also closes {@code sourceStream}.
 *
 * @param sourceStream the content to parse
 * @param prefix       text prepended to every key, or {@code null} for no prefix
 * @param maxAttribs   maximum number of entries to return, or {@code null} for no limit
 * @param maxAttribLen length limit applied to joined multi-valued entries, or {@code null}
 *                     for no limit
 * @return the collected metadata, keyed by (optionally prefixed) metadata name
 * @throws IOException   if the stream cannot be read or closed
 * @throws TikaException if parsing fails
 * @throws SAXException  if the handler fails
 */
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                                       Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    // Close the wrapper after parsing so resources it may have allocated are released.
    try (TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream)) {
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    }

    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();
    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                // Separate from whatever was already appended, even a single character.
                if (dataBuilder.length() > 0) {
                    dataBuilder.append(", ");
                }
                if (maxAttribLen == null || dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }
        if (prefix == null) {
            results.put(key, dataBuilder.toString().trim());
        } else {
            results.put(prefix + key, dataBuilder.toString().trim());
        }

        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}