org.apache.tika.metadata.HttpHeaders Java Examples

The following examples show how to use org.apache.tika.metadata.HttpHeaders. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AbstractFessFileTransformer.java    From fess with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the parameter map that accompanies the given response when its content is extracted.
 *
 * <p>The map starts as a copy of the crawler's {@code ConfigName.CONFIG} parameters (when there
 * are any) and is then extended with the resource name, MIME type, character set and URL of the
 * response. If the configuration carries a non-blank {@code Config.KEEP_ORIGINAL_BODY} value,
 * {@code TikaExtractor.NORMALIZE_TEXT} is set to its logical opposite.
 *
 * @param responseData   the response whose resource name, MIME type, charset and URL are recorded
 * @param crawlingConfig the crawling configuration supplying the base parameters
 * @return a new mutable map of extraction parameters
 */
protected Map<String, String> createExtractParams(final ResponseData responseData, final CrawlingConfig crawlingConfig) {
    // Look the configuration up once. The copy must tolerate a null map, because the
    // KEEP_ORIGINAL_BODY handling below explicitly allows for that case.
    final Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
    final Map<String, String> params = new HashMap<>();
    if (configParam != null) {
        params.putAll(configParam);
    }

    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    params.put(ExtractData.URL, responseData.getUrl());

    if (configParam != null) {
        // Read from the original configuration rather than from params, so the puts above
        // cannot influence the decision.
        final String keepOriginalBody = configParam.get(Config.KEEP_ORIGINAL_BODY);
        if (StringUtil.isNotBlank(keepOriginalBody)) {
            // Keeping the original body means text normalization must be switched off, and vice versa.
            params.put(TikaExtractor.NORMALIZE_TEXT, Constants.TRUE.equalsIgnoreCase(keepOriginalBody) ? Constants.FALSE
                    : Constants.TRUE);
        }
    }
    return params;
}
 
Example #2
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
private static void fillMetadata(Metadata metadata, String contentType, String uri) {

		// Record the resource name, taken from the final name element of the given URI.
		if (uri != null) {
			final String resourceName = new File(uri).getName();
			metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
		}

		// Normalise the content-type; nothing more to record when no type results.
		final String normalizedType = normalizeContentType(contentType);
		if (normalizedType == null) {
			return;
		}

		// Add the normalised content-type to the metadata.
		metadata.add(HttpHeaders.CONTENT_TYPE, normalizedType);
	}
 
Example #3
Source File: TikaParser.java    From quarkus with Apache License 2.0 4 votes vote down vote up
protected TikaContent parseStream(InputStream entityStream, String contentType, ContentHandler tikaHandler)
        throws TikaParseException {
    try {
        final ParseContext parseContext = new ParseContext();
        // A parser has to be registered in the context so that embedded content gets parsed:
        // the configured parser itself when embedded text is appended to the master text,
        // otherwise the parser wrapped by the recursive wrapper.
        final Parser embeddedParser;
        if (this.appendEmbeddedContent) {
            embeddedParser = parser;
        } else {
            embeddedParser = ((RecursiveParserWrapper) parser).getWrappedParser();
        }
        parseContext.set(Parser.class, embeddedParser);

        // Pass the content type on to Tika only when the caller supplied one
        final org.apache.tika.metadata.Metadata parseMetadata = new org.apache.tika.metadata.Metadata();
        if (contentType != null) {
            parseMetadata.set(HttpHeaders.CONTENT_TYPE, contentType);
        }

        try (InputStream wrappedStream = TikaInputStream.get(entityStream)) {
            parser.parse(wrappedStream, tikaHandler, parseMetadata, parseContext);

            if (this.appendEmbeddedContent) {
                // Any embedded content is already part of the handler's text
                String appendedText = null;
                if (tikaHandler != null) {
                    appendedText = tikaHandler.toString().trim();
                }
                return new TikaContent(appendedText, convert(parseMetadata));
            }

            // The handler's metadata list holds the master (outer) document at index 0,
            // followed by one entry per embedded (inner) document.
            final RecursiveParserWrapperHandler recursiveHandler = (RecursiveParserWrapperHandler) tikaHandler;
            final List<org.apache.tika.metadata.Metadata> metadataList = recursiveHandler.getMetadataList();
            final org.apache.tika.metadata.Metadata outerMetadata = metadataList.get(0);
            final String outerText = outerMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);

            final List<TikaContent> innerContents = new LinkedList<>();
            for (org.apache.tika.metadata.Metadata innerMetadata : metadataList.subList(1, metadataList.size())) {
                final String innerText = innerMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
                // An embedded document may have no text, e.g. an image when no
                // text recognition parser is enabled; such entries are skipped.
                if (innerText == null) {
                    continue;
                }
                innerContents.add(new TikaContent(innerText.trim(), convert(innerMetadata)));
            }
            return new TikaContent(outerText, convert(outerMetadata), innerContents);
        }
    } catch (Exception e) {
        // Wrap every failure, mentioning the content type when it is known
        String errorMessage = "Unable to parse the stream";
        if (contentType != null) {
            errorMessage = errorMessage + " for content-type: " + contentType;
        }
        throw new TikaParseException(errorMessage, e);
    }
}