Java Code Examples for org.apache.tika.metadata.Metadata#get()

The following examples show how to use org.apache.tika.metadata.Metadata#get(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Builds the release date from the {@code XMPDM.RELEASE_DATE} metadata entry.
 *
 * @param metadata the metadata the release date is read from
 * @return {@code null} when the entry is missing or empty; midnight on the
 *         1st of January (in the default calendar) when the entry is a bare
 *         four-digit year; otherwise the result of {@code makeDate}
 */
private Date generateReleaseDate(Metadata metadata)
{
   final String rawDate = metadata.get(XMPDM.RELEASE_DATE);
   if(rawDate == null || rawDate.isEmpty())
   {
      return null;
   }

   final boolean yearOnly = rawDate.matches("\\d\\d\\d\\d");
   if(!yearOnly)
   {
      // More than a bare year: hand over to the regular date handling
      return makeDate(rawDate);
   }

   // Only a year was given, so pin it to the 1st of January, 00:00:00.000
   final Calendar firstOfYear = Calendar.getInstance();
   firstOfYear.set(Integer.parseInt(rawDate), Calendar.JANUARY, 1, 0, 0, 0);
   firstOfYear.set(Calendar.MILLISECOND, 0);
   return firstOfYear.getTime();
}
 
Example 2
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Parses a document with Tika and stores its text and metadata.
 * <p>
 * On success the {@code FileMetadata} field is replaced with one
 * {@code name=value} line per metadata name (first value only) and the
 * {@code FullText} field is replaced with the extracted body text.
 *
 * @param Bytes stream with the content of the document to parse
 * @return the current value of {@code FullText}
 * @throws PDException declared by the signature; failures are reported through
 *         {@code PDException.GenPDException} (presumably that call throws - confirm)
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        // -1 is passed as the write limit of the body handler
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Collect the lines in a builder instead of re-copying the string on every "+="
        StringBuilder collected = new StringBuilder();
        for (String key : metadata.names()) {
            collected.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = collected.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

    return FullText;
}
 
Example 3
Source File: TikaFormat.java    From gate-core with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Copies a single Tika metadata property into the feature map.
 * <p>
 * Missing or blank values are ignored. The trimmed value is always stored under
 * {@code "TIKA_" + NAME}; it is additionally stored under the plain upper-cased
 * name, but only when the map does not already contain that key.
 *
 * @param metadata the metadata to read the value from
 * @param property the property whose value is copied
 * @param fmap     the feature map that receives the value
 */
private void setTikaFeature(Metadata metadata, Property property, FeatureMap fmap) {
  final String raw = metadata.get(property);
  final String trimmed = (raw == null) ? null : raw.trim();
  if (trimmed == null || trimmed.length() == 0) {
    return;
  }

  final String featureName = property.getName().toUpperCase();
  // An existing feature under the plain name is left untouched
  if (!fmap.containsKey(featureName)) {
    fmap.put(featureName, trimmed);
  }
  fmap.put("TIKA_" + featureName, trimmed);
}
 
Example 4
Source File: TikaAutoMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Overrides the image size properties of JPEG files that carry EXIF data.
 * <p>
 * Some editors write JPEG_IMAGE_HEIGHT_TAG when saving JPEG images, so the
 * sizes reported by Tika are treated as more reliable than the exif/tiff
 * values read from the file. For such files the tiff:Image size that ends
 * up in the Alfresco node properties is replaced accordingly.
 *
 * @param metadata   the metadata produced by Tika
 * @param properties the property map to update; also the return value
 * @param headers    not used by this implementation
 * @return the {@code properties} map that was passed in
 */
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
    final boolean isJpeg = MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE));
    if (!isJpeg)
    {
        return properties;
    }

    final String exifWidth = metadata.get(EXIF_IMAGE_WIDTH_TAG);
    final String exifHeight = metadata.get(EXIF_IMAGE_HEIGHT_TAG);

    // Only act when the image actually carries exif information
    final boolean hasExif = exifWidth != null
            && exifHeight != null
            && metadata.get(COMPRESSION_TAG) != null;
    if (hasExif)
    {
        // Swap the exif size properties destined for the node with the Tika dimensions
        putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(exifHeight), properties);
        putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(exifWidth), properties);
        putRawValue(JPEG_IMAGE_HEIGHT_TAG, exifHeight, properties);
        putRawValue(JPEG_IMAGE_WIDTH_TAG, exifWidth, properties);
    }
    return properties;
}
 
Example 5
Source File: TikaPoweredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Reads a metadata entry as a single string.
 *
 * @param metadata the metadata to read from
 * @param key      the name of the entry
 * @return for a multi-valued entry, its distinct values in first-seen order
 *         joined with {@code ", "}; otherwise the result of {@code metadata.get(key)}
 */
private String getMetadataValue(Metadata metadata, String key)
{
    if (!metadata.isMultiValued(key))
    {
        return metadata.get(key);
    }

    final String[] rawValues = metadata.getValues(key);

    // A LinkedHashSet drops repeated values while keeping their order
    final Set<String> distinct = new LinkedHashSet<String>(rawValues.length);
    for (String rawValue : rawValues)
    {
        distinct.add(rawValue);
    }

    // The set renders as "[a, b]"; cut off the surrounding brackets
    final String rendered = distinct.toString();
    return rendered.substring(1, rendered.length() - 1);
}
 
Example 6
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Builds a description of the form {@code title - album (artist)}.
 * <p>
 * The album and artist parts are each added only when present. Without a
 * title the description is empty, even if album or artist are available.
 *
 * @param metadata     the metadata extracted from the file
 * @return          the description, never {@code null}
 */
@SuppressWarnings("deprecation")
private String generateDescription(Metadata metadata)
{
    final StringBuilder description = new StringBuilder();

    final String title = metadata.get(Metadata.TITLE);
    if (title == null)
    {
        return description.toString();
    }
    description.append(title);

    final String album = metadata.get(XMPDM.ALBUM);
    if (album != null)
    {
        description.append(" - ").append(album);
    }

    final String artist = metadata.get(XMPDM.ARTIST);
    if (artist != null)
    {
        description.append(" (").append(artist).append(")");
    }

    return description.toString();
}
 
Example 7
Source File: MediaTypeDisablingDocumentSelector.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Decides whether a document is selected, based on its content type.
 *
 * @param metadata the metadata of the document
 * @return {@code false} only when the document has a non-empty content type
 *         that is contained in {@code disabledMediaTypes}; {@code true} otherwise
 */
@Override
public boolean select(Metadata metadata)
{
    final String mediaType = metadata.get(Metadata.CONTENT_TYPE);

    // Filtering needs both a usable content type and a configured disable list
    final boolean canFilter = mediaType != null
            && !mediaType.isEmpty()
            && disabledMediaTypes != null;

    return !canFilter || !disabledMediaTypes.contains(mediaType);
}
 
Example 8
Source File: OpenDocumentMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Copies the OpenDocument-specific metadata entries into the raw properties.
 * <p>
 * The fixed entries are written first, followed by every mapped key for which
 * a value exists under {@code CUSTOM_PREFIX + key}.
 *
 * @param metadata   the metadata produced by Tika
 * @param properties the property map to fill; also the return value
 * @param headers    not used by this implementation
 * @return the {@code properties} map that was passed in
 */
@SuppressWarnings("deprecation")
    @Override
    protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String, String> headers) 
    {
       final String creationDate = metadata.get(Metadata.CREATION_DATE);
       putRawValue(KEY_CREATION_DATE, getDateOrNull(creationDate), properties);
       putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties);

       final String date = metadata.get(Metadata.DATE);
       putRawValue(KEY_DATE, getDateOrNull(date), properties);
       putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties);
       putRawValue(KEY_GENERATOR, metadata.get("generator"), properties);
       putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties);
       putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
       putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties);
       // KEY_PRINT_DATE and KEY_PRINTED_BY are not populated: the earlier code
       // for them was disabled and never named a Metadata constant to read from.

       // User-defined properties are looked up dynamically, one per mapped key
       final Map<String, Set<QName>> mapping = super.getMapping();
       for (String mappedKey : mapping.keySet())
       {
           final String customValue = metadata.get(CUSTOM_PREFIX + mappedKey);
           if (customValue != null)
           {
               putRawValue(mappedKey, customValue, properties);
           }
       }

       return properties;
    }
 
Example 9
Source File: TikaDocumentItemProcessor.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
/**
 * Records the content type of a document under {@code "X-TL-CONTENT-TYPE"}.
 *
 * @param doc      the document whose associative array receives the value
 * @param metaKeys the metadata names that are available
 * @param metadata the metadata to read {@code "Content-Type"} from
 * @return the stored value: the metadata's content type when {@code metaKeys}
 *         contains {@code "Content-Type"}, otherwise {@code "TL_CONTENT_TYPE_UNKNOWN"}
 */
private String extractContentTypeMetadata(Document doc, Set<String> metaKeys,
                                          Metadata metadata) {
    final String contentType = metaKeys.contains("Content-Type")
        ? metadata.get("Content-Type")
        : "TL_CONTENT_TYPE_UNKNOWN";

    doc.getAssociativeArray().put("X-TL-CONTENT-TYPE", contentType);
    return contentType;
}
 
Example 10
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Determines the language of a document.
 * <p>
 * A language declared in the metadata wins: first the
 * {@code TikaCoreProperties.LANGUAGE} entry, then {@code Metadata.CONTENT_LANGUAGE}.
 * Only when {@code getFirstLanguage} yields nothing for the declared value is
 * the ProfilingHandler consulted, and its answer is used only if it is
 * reasonably certain.
 *
 * @param metadata         the metadata filled in by the parser
 * @param profilingHandler the handler that profiled the document text
 * @return The first language found (two char lang code) or empty string if no language was detected.
 */
private static String detectLanguage(Metadata metadata, ProfilingHandler profilingHandler) {
    final String dublinCoreLang = metadata.get(TikaCoreProperties.LANGUAGE);
    final String httpEquivLang = metadata.get(Metadata.CONTENT_LANGUAGE);

    final String declared = (dublinCoreLang != null) ? dublinCoreLang : httpEquivLang;
    final String firstDeclared = getFirstLanguage(declared);
    if (firstDeclared != null) {
        return firstDeclared;
    }

    // Nothing usable was declared, so fall back to the profiling result.
    // Future work noted in the earlier code: make the required certainty configurable.
    final LanguageIdentifier identifier = profilingHandler.getLanguage();
    if (!identifier.isReasonablyCertain()) {
        return "";
    }

    final String profiled = identifier.getLanguage();
    LOGGER.trace("Using language specified by profiling handler: " + profiled);
    return profiled;
}
 
Example 11
Source File: TikaEntityProcessor.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Adds a {@code "lat,lon"} value to the row when both coordinates are known.
 * <p>
 * Nothing happens when {@code spatialMetadataField} is unset or when either
 * the latitude or the longitude is missing from the metadata.
 *
 * @param metadata the metadata to read the coordinates from
 * @param row      the row that receives the value under {@code spatialMetadataField}
 */
private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) {
  if (spatialMetadataField == null) {
    return;
  }

  final String latitude = metadata.get(Metadata.LATITUDE);
  final String longitude = metadata.get(Metadata.LONGITUDE);
  if (latitude == null || longitude == null) {
    return;
  }

  final String latLon = String.format(Locale.ROOT, "%s,%s", latitude, longitude);
  row.put(spatialMetadataField, latLon);
}
 
Example 12
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0 5 votes vote down vote up
/**
 * Assembles the JSON document that describes one extraction.
 * <p>
 * The result carries a null {@code Exception}, the {@code s3://} path of the
 * object, the extracted text, the content type and length (defaulting to
 * {@code "content/unknown"} and {@code "0"}), and a nested {@code Metadata}
 * object in which the values of each metadata name are joined with {@code ", "}.
 *
 * @param bucket        the bucket the object was read from
 * @param key           the key of the object inside the bucket
 * @param extractedText the text extracted from the object
 * @param tikaMetadata  the metadata produced by Tika
 * @return the JSON document as a string
 */
private String assembleExtractionResult(String bucket, String key, String extractedText, Metadata tikaMetadata) {
    final String rawContentType = tikaMetadata.get("Content-Type");
    final String contentType = (rawContentType == null) ? "content/unknown" : rawContentType;

    final String rawContentLength = tikaMetadata.get("Content-Length");
    final String contentLength = (rawContentLength == null) ? "0" : rawContentLength;

    // Flatten every metadata name into a single comma-separated string
    final JSONObject metadataJson = new JSONObject();
    for (String name : tikaMetadata.names()) {
        metadataJson.put(name, String.join(", ", tikaMetadata.getValues(name)));
    }

    final JSONObject extractJson = new JSONObject();
    extractJson.put("Exception", null);
    extractJson.put("FilePath", "s3://" + bucket + "/" + key);
    extractJson.put("Text", extractedText);
    extractJson.put("ContentType", contentType);
    extractJson.put("ContentLength", contentLength);
    extractJson.put("Metadata", metadataJson);

    return extractJson.toJSONString();
}
 
Example 13
Source File: FallbackParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Fallback parsing: empty input yields an empty document, anything else is
 * delegated to the error parser.
 * <p>
 * The size is taken from the content-length metadata when present; otherwise
 * it is measured from the stream and written back to the metadata.
 *
 * @param stream   the document stream
 * @param handler  the handler receiving the parse output
 * @param metadata the document metadata; content length and content type may be set here
 * @param context  the parse context
 */
@Override
public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context) throws SAXException, IOException, TikaException {
	final String declaredLength = metadata.get(Metadata.CONTENT_LENGTH);
	final long size;

	if (null == declaredLength || declaredLength.isEmpty()) {
		// No usable length in the metadata: measure the stream and record the result
		try (final TikaInputStream tis = TikaInputStream.get(stream)) {
			size = tis.getLength();
		}

		metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
	} else {
		size = Long.parseLong(declaredLength);
	}

	// An empty file produces an empty document; a non-empty one is handed
	// to the error parser.
	final Parser parser;
	if (size <= 0) {
		metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
		parser = EmptyParser.INSTANCE;
	} else {
		parser = ErrorParser.INSTANCE;
	}

	parser.parse(stream, handler, metadata, context);
}
 
Example 14
Source File: EmbedParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Opens the {@code div} of a package entry and, when the entry has a name,
 * writes that name as an {@code h1} heading.
 * <p>
 * The {@code div} is left open by this method.
 *
 * @param handler  the handler receiving the SAX events
 * @param metadata the metadata the resource name is read from
 * @throws SAXException if the handler rejects an event
 */
void writeStart(final ContentHandler handler, final Metadata metadata) throws SAXException {
	final String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

	final AttributesImpl divAttributes = new AttributesImpl();
	divAttributes.addAttribute("", "class", "class", "CDATA", "package-entry");
	handler.startElement(XHTML, "div", "div", divAttributes);

	if (name == null || name.isEmpty()) {
		return;
	}

	handler.startElement(XHTML, "h1", "h1", new AttributesImpl());
	handler.characters(name.toCharArray(), 0, name.length());
	handler.endElement(XHTML, "h1", "h1");
}
 
Example 15
Source File: EmbedSpawner.java    From extract with MIT License 5 votes vote down vote up
/**
 * Writes an embedded document to the output directory, using its hash as the file name.
 * <p>
 * When the input is an open container (for example an Outlook Message attached to
 * another Outlook Message) its entries are first written to a temporary file. That
 * temporary file is owned by a {@code TemporaryResources} instance and is removed
 * when the instance is closed, so it has to be measured and copied while the
 * instance is still open.
 *
 * @param tis   the stream of the embedded document
 * @param embed the embedded document; supplies the hash and the metadata
 * @param name  the document name, used for logging only
 * @throws IOException if the temporary file cannot be written or the copy fails
 */
private void writeEmbed(final TikaInputStream tis, final EmbeddedTikaDocument embed, final String name) throws IOException {
	final Path destination = outputPath.resolve(embed.getHash());
	final Metadata metadata = embed.getMetadata();
	final Object container = tis.getOpenContainer();

	// If the input is a container, write it to a temporary file so that it can then be copied atomically.
	if (container instanceof DirectoryEntry) {
		try (final TemporaryResources tmp = new TemporaryResources();
		     final POIFSFileSystem fs = new POIFSFileSystem()) {
			final Path source = tmp.createTempFile();
			saveEntries((DirectoryEntry) container, fs.getRoot());

			try (final OutputStream output = Files.newOutputStream(source)) {
				fs.writeFilesystem(output);
			}

			// Must run before tmp is closed: closing it disposes of the temporary file.
			persistEmbedSource(source, destination, metadata, name);
		}
	} else {
		persistEmbedSource(tis.getPath(), destination, metadata, name);
	}
}

/**
 * Records the content length of an embed if it is missing, then copies the embed to its destination.
 *
 * @param source      the file holding the embed's data
 * @param destination the hash-named target file
 * @param metadata    the embed's metadata; receives the content length when unset
 * @param name        the document name, used for logging only
 * @throws IOException if the size cannot be read or the copy fails
 */
private void persistEmbedSource(final Path source, final Path destination, final Metadata metadata,
                                final String name) throws IOException {
	// Set the content-length as it isn't (always?) set by Tika for embeds.
	if (null == metadata.get(Metadata.CONTENT_LENGTH)) {
		metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(source)));
	}

	// To prevent massive duplication and because the disk is only a storage for underlying data, save using the
	// straight hash as a filename.
	try {
		Files.copy(source, destination);
	} catch (final FileAlreadyExistsException e) {
		// Same name but a different size means the existing copy is not this data: overwrite it.
		if (Files.size(source) != Files.size(destination)) {
			Files.copy(source, destination, StandardCopyOption.REPLACE_EXISTING);
		} else {
			logger.info("Temporary file for document \"{}\" in \"{}\" already exists.", name, root);
		}
	}
}
 
Example 16
Source File: DigestIdentifier.java    From extract with MIT License 5 votes vote down vote up
/**
 * Generates the identifier of an embedded document.
 * <p>
 * Embedded documents in different files, or even in the same file, can share
 * a hash. To keep identifiers unique within the child document tree the
 * digest is fed, in this order, with:
 * the file digest hash, the parent ID, the embedded relationship ID (when set)
 * and the embedded document name (when set).
 *
 * @param embed the embedded document
 * @return the digest as a lower-case hexadecimal string
 * @throws NoSuchAlgorithmException if the configured digest algorithm is unavailable
 * @throws IllegalStateException    if no hash is available for the document
 */
@Override
public String generateForEmbed(final EmbeddedTikaDocument embed) throws NoSuchAlgorithmException {
	final MessageDigest digest = MessageDigest.getInstance(algorithm);

	final Metadata metadata = embed.getMetadata();
	final String relationshipId = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
	final String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

	final String hash = hash(embed);
	if (null == hash) {
		final String message = String.format("No hash is available for the document with name \"%s\" at " +
				"path \"%s\".", name, embed.getPath());
		throw new IllegalStateException(message);
	}

	digest.update(hash.getBytes(charset));

	final String parentId = embed.getParent().getId();
	digest.update(parentId.getBytes(charset));

	// The two optional parts only contribute when they are present
	if (null != relationshipId) {
		digest.update(relationshipId.getBytes(charset));
	}
	if (null != name) {
		digest.update(name.getBytes(charset));
	}

	final String upperHex = DatatypeConverter.printHexBinary(digest.digest());
	return upperHex.toLowerCase(ENGLISH);
}
 
Example 17
Source File: DataURIEncodingInputStream.java    From extract with MIT License 5 votes vote down vote up
/**
 * Determines the character set of a file.
 * <p>
 * The content-encoding from the metadata is used when it names a supported
 * charset. Otherwise the charset is detected from the file's content.
 *
 * @param path     the file to inspect
 * @param metadata the metadata of the file
 * @return the declared or detected charset
 * @throws IOException if the file cannot be read or detection fails
 */
private static Charset detectCharset(final Path path, final Metadata metadata) throws IOException {
	// First choice: the charset declared through the content-encoding.
	final String declared = metadata.get(Metadata.CONTENT_ENCODING);
	if (null != declared && Charset.isSupported(declared)) {
		return Charset.forName(declared);
	}

	// Otherwise let the reader detect it from the content.
	try (
		final InputStream input = new BufferedInputStream(Files.newInputStream(path));
		final AutoDetectReader detector = new AutoDetectReader(input, metadata)
	) {
		return detector.getCharset();
	} catch (TikaException e) {
		throw new IOException("Unable to detect charset.", e);
	}
}
 
Example 18
Source File: UniversalEncodingListener.java    From onedev with MIT License 5 votes vote down vote up
public UniversalEncodingListener(Metadata metadata) {
    MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
    if (type != null) {
        hint = type.getParameters().get("charset");
    }
    if (hint == null) {
        hint = metadata.get(Metadata.CONTENT_ENCODING);
    }
}
 
Example 19
Source File: DocUtils.java    From geoportal-server-harvester with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a file with Tika and renders its metadata as an XML document.
 * <p>
 * All non-empty Tika metadata values are copied into a property set. The
 * description, modification date and title expected by the harvester are then
 * added, with defaults where Tika supplied none: the stored property listing
 * for the description, a {@code TIKA_yyyy/MM/dd} label carrying today's date
 * for the modification date, and {@code file_name} for the title.
 * <p>
 * Any failure while parsing or building the XML is logged and results in a
 * {@code null} return value rather than an exception.
 *
 * @param file_bytes the content of the file
 * @param file_name  the name of the file; used as the default title
 * @return the XML document encoded as UTF-8, or {@code null} if it could not be produced
 * @throws IOException if closing the input stream fails
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

	// Input & output
	ByteArrayInputStream baseInput = new ByteArrayInputStream(file_bytes);
	byte[] xmlBytes = null;

	// Tika parser objects
	Parser parser = new AutoDetectParser();
	BodyContentHandler handler = new BodyContentHandler();
	Metadata metadata = new Metadata();
	ParseContext context = new ParseContext();

	try {
		// Populate the metadata object with the Tika parser
		parser.parse(baseInput, handler, metadata, context);

		// Container & writer for the metadata
		Properties metaProps = new Properties();
		StringWriter listing = new StringWriter();

		// Copy the Tika metadata; Properties rejects null values, so skip those as well as empty ones
		for (String name : metadata.names()) {
			String value = metadata.get(name);
			if (value != null && !value.isEmpty()) {
				metaProps.put(name, value);
			}
		}
		metaProps.store(listing, "Tika Values");

		// Properties expected by the harvester
		String metaDescr = metadata.get(TikaCoreProperties.DESCRIPTION);
		String metaModif = metadata.get(TikaCoreProperties.MODIFIED);
		String metaTitle = metadata.get(TikaCoreProperties.TITLE);

		// Default label for an undefined modification date
		DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd");
		String tikaLabel = String.format("TIKA_%s", dateFormat.format(new Date()));

		// Fall back to the defaults wherever Tika supplied no value
		metaProps.put(WKAConstants.WKA_DESCRIPTION, metaDescr != null ? metaDescr : listing.toString());
		metaProps.put(WKAConstants.WKA_MODIFIED, metaModif != null ? metaModif : tikaLabel);
		metaProps.put(WKAConstants.WKA_TITLE, metaTitle != null ? metaTitle : file_name);

		// Build the XML as bytes; the charset constant avoids a by-name charset lookup
		MapAttribute attr = AttributeUtils.fromProperties(metaProps);
		Document document = new SimpleDcMetaBuilder().create(attr);
		xmlBytes = XmlUtils.toString(document).getBytes(java.nio.charset.StandardCharsets.UTF_8);

	} catch (Exception ex) {
		// Best effort by design: report the problem and return null
		LOG.error("Error reading data.", ex);
	} finally {
		baseInput.close();
	}

	return xmlBytes;

}
 
Example 20
Source File: AbstractIdentifier.java    From extract with MIT License 4 votes vote down vote up
/**
 * Looks up the hash stored in the given metadata.
 *
 * @param metadata the metadata to read from
 * @return the value that {@code metadata.get} returns for this identifier's {@code key}
 */
@Override
public String retrieveHash(final Metadata metadata) {
	final String storedHash = metadata.get(key);
	return storedHash;
}