org.apache.tika.metadata.TikaCoreProperties Java Examples

The following examples show how to use org.apache.tika.metadata.TikaCoreProperties. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: EmbedSpawner.java — from the extract project (MIT License), 6 votes
/**
 * Handles one embedded resource found while parsing a container document.
 *
 * <p>Resources whose {@code EMBEDDED_RESOURCE_TYPE} metadata equals {@code INLINE} (for example images
 * in PDFs) are not spawned: they are parsed through {@code delegateParsing} into the given handler so
 * their content is concatenated to the main document. Every other resource is wrapped in a
 * {@link TikaInputStream} and handed to {@code spawnEmbedded}.
 *
 * @param input      stream of the embedded resource
 * @param handler    content handler of the parent document
 * @param metadata   metadata of the embedded resource
 * @param outputHtml when {@code true}, {@code writeStart}/{@code writeEnd} are called around inline content
 * @throws SAXException declared by the overridden method
 * @throws IOException  declared by the overridden method
 */
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {
	final String resourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
	final boolean inline = TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(resourceType);

	// Anything that is not an inline embed gets spawned on its own.
	if (!inline) {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
		return;
	}

	// Inline embeds are appended to the parent document's output as usual.
	final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, embedHandler, metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}
 
Example #2
Source File: TikaFormat.java — from the gate-core project (GNU Lesser General Public License v3.0), 5 votes
/**
 * Populates the document's feature map from the parsed Tika metadata.
 *
 * <p>The title, author, comments and creator properties are each passed to {@code setTikaFeature}.
 * If no "AUTHORS" feature exists but an "AUTHOR" feature does, "AUTHORS" is filled in as well.
 * The content type is always stored under "MimeType".
 *
 * @param metadata metadata produced by the Tika parse
 * @param doc      document whose feature map is updated
 */
private void setDocumentFeatures(Metadata metadata, Document doc) {
  final FeatureMap features = doc.getFeatures();

  setTikaFeature(metadata, TikaCoreProperties.TITLE, features);
  setTikaFeature(metadata, Office.AUTHOR, features);
  setTikaFeature(metadata, TikaCoreProperties.COMMENTS, features);
  setTikaFeature(metadata, TikaCoreProperties.CREATOR, features);

  final boolean authorsMissing = features.get("AUTHORS") == null;
  if (authorsMissing && features.get("AUTHOR") != null) {
    // NOTE(review): the check above uses the key "AUTHOR" but the value is read with Office.AUTHOR;
    // confirm against setTikaFeature which key the author is actually stored under.
    features.put("AUTHORS", features.get(Office.AUTHOR));
  }

  features.put("MimeType", metadata.get(Metadata.CONTENT_TYPE));
}
 
Example #3
Source File: TikaCallable.java — from the flink-crawler project (Apache License 2.0), 5 votes
/**
 * Determines the document language.
 *
 * <p>A language declared in the metadata is preferred: the {@code TikaCoreProperties.LANGUAGE} value
 * first, then the {@code Metadata.CONTENT_LANGUAGE} value. The chosen value is passed through
 * {@code getFirstLanguage}. Only when that yields nothing is the ProfilingHandler's result consulted.
 *
 * @param metadata         metadata filled in by the parser
 * @param profilingHandler handler that profiled the document text
 * @return The first language found (two char lang code) or empty string if no language was detected.
 */
private static String detectLanguage(Metadata metadata, ProfilingHandler profilingHandler) {
    final String dublinCoreLanguage = metadata.get(TikaCoreProperties.LANGUAGE);
    final String httpEquivLanguage = metadata.get(Metadata.CONTENT_LANGUAGE);

    // The TikaCoreProperties.LANGUAGE value takes precedence over Metadata.CONTENT_LANGUAGE.
    final String declared = (dublinCoreLanguage != null) ? dublinCoreLanguage : httpEquivLanguage;

    final String firstDeclared = getFirstLanguage(declared);
    if (firstDeclared != null) {
        return firstDeclared;
    }

    // Language is still unspecified, so use ProfileHandler's result
    final LanguageIdentifier identifier = profilingHandler.getLanguage();

    // FUTURE KKr - provide config for specifying required certainty level.
    if (!identifier.isReasonablyCertain()) {
        return "";
    }

    final String profiled = identifier.getLanguage();
    LOGGER.trace("Using language specified by profiling handler: " + profiled);
    return profiled;
}
 
Example #4
Source File: DocUtils.java — from the geoportal-server-harvester project (Apache License 2.0), 4 votes
/**
 * Parses the given file content with Tika and renders the extracted metadata as an XML document.
 *
 * <p>All non-empty Tika metadata values are copied into a {@link Properties} set. Three harvester
 * properties are then guaranteed to be present, with these fallbacks when Tika reports nothing:
 * <ul>
 *   <li>description: the textual dump of all collected Tika properties</li>
 *   <li>modified: a {@code TIKA_yyyy/MM/dd} label built from today's date</li>
 *   <li>title: {@code file_name}</li>
 * </ul>
 *
 * @param file_bytes raw content of the file to parse; must not be {@code null}
 * @param file_name  name used as the title when the metadata carries none
 * @return the XML encoded as UTF-8, or {@code null} when parsing or XML generation failed
 *         (the failure is logged, not rethrown)
 * @throws IOException kept in the signature for existing callers
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

	// Input & Output Variables
	// Built outside the try block so that a null file_bytes still fails with a NullPointerException
	// instead of being logged and turned into a null result.
	final ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
	byte[] xml_bytes = null;

	// Tika Parser Objects
	final Parser parser = new AutoDetectParser();
	final BodyContentHandler handler = new BodyContentHandler();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	try (ByteArrayInputStream input = base_input) {
		// Populate Metadata Object with Tika Parser
		parser.parse(input, handler, metadata, context);

		// Container & Writer for Metadata
		final Properties meta_props = new Properties();
		final StringWriter sw = new StringWriter();

		// Put Tika Metadata in Properties (look each value up once)
		for (String name : metadata.names()) {
			final String value = metadata.get(name);
			if (!value.isEmpty()) {
				meta_props.put(name, value);
			}
		}
		meta_props.store(sw, "Tika Values");

		// Expected Harvester Properties
		final String meta_descr = metadata.get(TikaCoreProperties.DESCRIPTION);
		final String meta_modif = metadata.get(TikaCoreProperties.MODIFIED);
		final String meta_title = metadata.get(TikaCoreProperties.TITLE);

		// Default Label for Undefined Tika Properties
		final DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
		final String date_today = date_format.format(new Date());
		final String tika_label = String.format("TIKA_%s", date_today);

		// Check For Null Values & Set Defaults
		meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr != null ? meta_descr : sw.toString());
		meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif != null ? meta_modif : tika_label);
		meta_props.put(WKAConstants.WKA_TITLE, meta_title != null ? meta_title : file_name);

		// Build XML as Bytes
		final MapAttribute attr = AttributeUtils.fromProperties(meta_props);
		final Document document = new SimpleDcMetaBuilder().create(attr);
		xml_bytes = XmlUtils.toString(document).getBytes(java.nio.charset.StandardCharsets.UTF_8);

	} catch (Exception ex) {
		// Best effort: the caller receives null and the cause is kept in the log.
		LOG.error("Error reading data.", ex);
	}

	return xml_bytes;

}
 
Example #5
Source File: UpdatableInputStreamDigester.java — from the extract project (MIT License), 4 votes
/**
 * Builds the metadata key for this digester: the Tika meta prefix followed by "digest", the
 * namespace prefix delimiter and the algorithm key name.
 *
 * @return the metadata key
 */
private String getMetadataKey() {
    final String digestNamespace = TikaCoreProperties.TIKA_META_PREFIX + "digest";
    return digestNamespace + Metadata.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
}
 
Example #6
Source File: Identifier.java — from the extract project (MIT License), 4 votes
/**
 * Builds the metadata key for the given digest algorithm: the Tika meta prefix followed by
 * "digest", the namespace prefix delimiter and the algorithm name with every "-" removed.
 *
 * @param algorithm digest algorithm name; hyphens are stripped from it
 * @return the metadata key
 */
static String getKey(String algorithm) {
	// Only the algorithm name is normalised; the prefix parts are used verbatim.
	final String normalised = algorithm.replace("-", "");
	return TikaCoreProperties.TIKA_META_PREFIX + "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + normalised;
}