org.apache.tika.metadata.TikaMetadataKeys Java Examples

The following examples show how to use org.apache.tika.metadata.TikaMetadataKeys. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	TikaInputStream tikaStream = null;
	Metadata metadata = new Metadata();
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
	try {
		tikaStream = TikaInputStream.get(data, metadata);
		return detector.detect(tikaStream, metadata).toString();
	} catch (Throwable t) {
		throw t;
	} finally {
		if (tikaStream != null) {
			try {
				tikaStream.close();
			} catch (IOException e) {
			}
		}
	}
}
 
Example #2
Source File: ExecUtil.java    From ctsms with GNU Lesser General Public License v2.1 6 votes vote down vote up
public static String getMimeType(File file) throws Throwable {
	TikaInputStream tikaStream = null;
	Metadata metadata = new Metadata();
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, file.getName());
	try {
		tikaStream = TikaInputStream.get(file, metadata);
		return detector.detect(tikaStream, metadata).toString();
	} catch (Throwable t) {
		throw t;
	} finally {
		if (tikaStream != null) {
			try {
				tikaStream.close();
			} catch (IOException e) {
			}
		}
	}
}
 
Example #3
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final TikaInputStream inputStream = createInputStream(uri);
	final Metadata metadata = new Metadata();

	// Set the file name. This provides some level of type-hinting.
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

	// Detect the content type.
	String contentType = detector.detect(inputStream, metadata).toString();

	inputStream.close();

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	return contentType;
}
 
Example #4
Source File: AbstractFessFileTransformer.java    From fess with Apache License 2.0 6 votes vote down vote up
protected Map<String, String> createExtractParams(final ResponseData responseData, final CrawlingConfig crawlingConfig) {
    final Map<String, String> params = new HashMap<>(crawlingConfig.getConfigParameterMap(ConfigName.CONFIG));
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    params.put(ExtractData.URL, responseData.getUrl());
    final Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
    if (configParam != null) {
        final String keepOriginalBody = configParam.get(Config.KEEP_ORIGINAL_BODY);
        if (StringUtil.isNotBlank(keepOriginalBody)) {
            params.put(TikaExtractor.NORMALIZE_TEXT, Constants.TRUE.equalsIgnoreCase(keepOriginalBody) ? Constants.FALSE
                    : Constants.TRUE);
        }
    }
    return params;
}
 
Example #5
Source File: RegexRulesPasswordProvider.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public String getPassword(Metadata meta) {
  if(getExplicitPassword() != null) {
    return getExplicitPassword();
  }
  
  if(passwordMap.size() > 0)
    return lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
  
  return null;
}
 
Example #6
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
private static void fillMetadata(Metadata metadata, String contentType, String uri) {

		// Set the file name.
		if (uri != null) {
			metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());
		}

		// Normalise the content-type.
		contentType = normalizeContentType(contentType);

		// Set the content-type.
		if (contentType != null) {
			metadata.add(HttpHeaders.CONTENT_TYPE, contentType);
		}
	}
 
Example #7
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
public static String detectContentTypeAndCharset(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final TikaInputStream inputStream = createInputStream(uri);
	final Metadata metadata = new Metadata();

	// Set the file name. This provides some level of type-hinting.
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

	// Detect the content type.
	String contentType = detector.detect(inputStream, metadata).toString();

	// Use metadata to provide type-hinting to the AutoDetectReader.
	fillMetadata(metadata, contentType, uri);

	// Detect the character set.
	final AutoDetectReader reader = new AutoDetectReader(inputStream, metadata);
	String charset = reader.getCharset().toString();

	inputStream.close();

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	// Append the charset if the content-type was determined.
	if (charset != null && !charset.isEmpty()) {
		return contentType + "; charset=" + charset;
	}

	return contentType;
}
 
Example #8
Source File: IdentifyMimeType.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());

    session.read(flowFile, new InputStreamCallback() {
        @Override
        public void process(final InputStream stream) throws IOException {
            try (final InputStream in = new BufferedInputStream(stream)) {
                TikaInputStream tikaStream = TikaInputStream.get(in);
                Metadata metadata = new Metadata();
                // Add filename if it exists
                if (filename != null) {
                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                }
                // Get mime type
                MediaType mediatype = detector.detect(tikaStream, metadata);
                mimeTypeRef.set(mediatype.toString());
            }
        }
    });

    String mimeType = mimeTypeRef.get();
    String extension = "";
    try {
        MimeType mimetype;
        mimetype = config.getMimeRepository().forName(mimeType);
        extension = mimetype.getExtension();
    } catch (MimeTypeException ex) {
        logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
    }

    // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
    if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
        extension = ".gz";
    }

    if (mimeType == null) {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
        flowFile = session.putAttribute(flowFile, "mime.extension", "");
        logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile});
    } else {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
        flowFile = session.putAttribute(flowFile, "mime.extension", extension);
        logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType});
    }

    session.getProvenanceReporter().modifyAttributes(flowFile);
    session.transfer(flowFile, REL_SUCCESS);
}
 
Example #9
Source File: IdentifyMimeType.java    From nifi with Apache License 2.0 4 votes vote down vote up
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());

    session.read(flowFile, new InputStreamCallback() {
        @Override
        public void process(final InputStream stream) throws IOException {
            try (final InputStream in = new BufferedInputStream(stream)) {
                TikaInputStream tikaStream = TikaInputStream.get(in);
                Metadata metadata = new Metadata();

                if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                }
                // Get mime type
                MediaType mediatype = detector.detect(tikaStream, metadata);
                mimeTypeRef.set(mediatype.toString());
            }
        }
    });

    String mimeType = mimeTypeRef.get();
    String extension = "";
    try {
        MimeType mimetype;
        mimetype = mimeTypes.forName(mimeType);
        extension = mimetype.getExtension();
    } catch (MimeTypeException ex) {
        logger.warn("MIME type extension lookup failed: {}", new Object[]{ex});
    }

    // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
    if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
        extension = ".gz";
    }

    if (mimeType == null) {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
        flowFile = session.putAttribute(flowFile, "mime.extension", "");
        logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[]{flowFile});
    } else {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
        flowFile = session.putAttribute(flowFile, "mime.extension", extension);
        logger.info("Identified {} as having MIME Type {}", new Object[]{flowFile, mimeType});
    }

    session.getProvenanceReporter().modifyAttributes(flowFile);
    session.transfer(flowFile, REL_SUCCESS);
}