Java Code Examples for org.apache.tika.metadata.Metadata#set()

The following examples show how to use org.apache.tika.metadata.Metadata#set(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ReplayCrawl.java    From ache with Apache License 2.0 6 votes vote down vote up
/**
 * Copies the response headers of a page into a new Tika {@link Metadata} object.
 *
 * <p>A header name may map to several values. Each value is appended with
 * {@code Metadata#add}; using {@code Metadata#set} here would replace the value
 * stored by the previous iteration and leave only the last one.
 *
 * @param page the page whose response headers are read
 * @return a new metadata object holding every header value
 */
private Metadata createHeadersMetadata(Page page) {
    Map<String, List<String>> headers = page.getResponseHeaders();
    Metadata metadata = new Metadata();
    for (Entry<String, List<String>> header : headers.entrySet()) {
        for (String value : header.getValue()) {
            // add() appends, so multi-valued headers keep all of their values.
            metadata.add(header.getKey(), value);
        }
    }
    return metadata;
}
 
Example 2
Source File: UpdatableInputStreamDigester.java    From extract with MIT License 6 votes vote down vote up
/**
 * Digests the given stream and, unless a bounded stream was cut short, stores the
 * encoded digest in the metadata under {@code getMetadataKey()}.
 *
 * <p>The digest is seeded with the bytes of {@code getDigestUpdateModifier()} before
 * the stream content is fed in.
 *
 * @param is       input stream to read from
 * @param metadata metadata for reporting the digest
 * @return {@code false} if {@code is} is a {@code SimpleBoundedInputStream} that hit
 *         its bound (nothing is written to the metadata); {@code true} otherwise
 * @throws IOException declared by the method; presumably raised while reading the stream
 */
private boolean digestStream(InputStream is, Metadata metadata) throws IOException {
    final MessageDigest digest = newMessageDigest();

    final byte[] modifierBytes = getDigestUpdateModifier().getBytes();
    updateDigest(digest, new ByteArrayInputStream(modifierBytes));
    updateDigest(digest, is);
    final byte[] digestBytes = digest.digest();

    // A bounded stream that reached its limit was not read to the end, so no digest is reported.
    final boolean truncated = is instanceof SimpleBoundedInputStream
            && ((SimpleBoundedInputStream) is).hasHitBound();
    if (truncated) {
        return false;
    }

    metadata.set(getMetadataKey(), encoder.encode(digestBytes));
    return true;
}
 
Example 3
Source File: SpewerTest.java    From extract with MIT License 6 votes vote down vote up
/**
 * Checks that a creation date set on a document's metadata is written both verbatim
 * and, under the ISO-date field name, in the expected ISO 8601 form.
 */
@Test
public void testWritesISO8601Dates() throws IOException {
	final SpewerStub spewer = new SpewerStub();
	final TikaDocument tikaDocument = factory.create("test.txt");
	final Metadata metadata = tikaDocument.getMetadata();
	final FieldNames fields = spewer.getFields();

	// TODO: this should go in a separate test for the MetadataTransformer.
	final String[] rawDates = {"2011-12-03+01:00", "2015-06-03", "Tue Jan 27 17:03:21 2004", "19106-06-07T08:00:00Z"};
	final String[] expectedIsoDates = {"2011-12-03T12:00:00Z", "2015-06-03T12:00:00Z", "2004-01-27T17:03:21Z",
			"+19106-06-07T08:00:00Z"};

	for (int index = 0; index < rawDates.length; index++) {
		final String rawDate = rawDates[index];

		metadata.set(Office.CREATION_DATE, rawDate);
		spewer.writeMetadata(tikaDocument);

		final String creationDateName = Office.CREATION_DATE.getName();

		// The raw value is written unchanged under the plain metadata field...
		Assert.assertEquals(rawDate, spewer.metadata.get(fields.forMetadata(creationDateName)));
		// ...and the normalised value under the ISO-date field.
		Assert.assertEquals(expectedIsoDates[index],
				spewer.metadata.get(fields.forMetadataISODate(creationDateName)));

		// Reset the store of written metadata on each iteration.
		spewer.close();
	}
}
 
Example 4
Source File: DefaultResultsVisitor.java    From allure2 with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the media type of an attachment from its content and name.
 *
 * <p>The input is wrapped in a {@link BufferedInputStream} that is closed when this
 * method returns, whether or not detection succeeds.
 *
 * @param is   stream with the attachment content
 * @param name attachment name, passed to the detector as the resource name
 * @return the detected media type as a string, or {@code WILDCARD} if an
 *         {@link IOException} is thrown
 */
public static String probeContentType(final InputStream is, final String name) {
    try (InputStream stream = new BufferedInputStream(is)) {
        final Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
        return getDefaultMimeTypes().detect(stream, metadata).toString();
    } catch (IOException e) {
        // The exception is passed last without a placeholder of its own so that an
        // SLF4J-style logger records it as the throwable (with stack trace) instead of
        // formatting it into the message. NOTE(review): assumes LOGGER is SLF4J-compatible.
        LOGGER.warn("Couldn't detect the media type of attachment {}", name, e);
        return WILDCARD;
    }
}
 
Example 5
Source File: ImageConverter.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Parses an image file to obtain its width and height and stores them on the file item.
 *
 * <p>Any exception raised while opening, parsing or converting the dimensions is caught:
 * it is logged, and the result carries an error text, the exception message and exit
 * code {@code -1}. On success the exit code is {@code ZERO}.
 *
 * @param f    file item that receives the width and height
 * @param img  image file to read
 * @param mime content type passed to the parser through the metadata
 * @return the process result describing the outcome
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult result = new ProcessResult();
	result.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream in = new FileInputStream(img)) {
		final Metadata meta = new Metadata();
		meta.set(CONTENT_TYPE, mime);
		imageParser.parse(in, new DefaultHandler(), meta, new ParseContext());

		// Width is applied before height is read, so a failure on height leaves width set.
		final String width = meta.get(TIFF.IMAGE_WIDTH);
		f.setWidth(Integer.valueOf(width));
		final String height = meta.get(TIFF.IMAGE_LENGTH);
		f.setHeight(Integer.valueOf(height));

		result.setExitCode(ZERO);
	} catch (Exception e) {
		log.error("Error while getting dimensions", e);
		result.setError("Error while getting dimensions");
		result.setException(e.getMessage());
		result.setExitCode(-1);
	}
	return result;
}
 
Example 6
Source File: TransportAmazonLambdaS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Uploads a byte array to the bucket, with MD5, length and a detected content type.
 *
 * @param path path of the file; its normalized file name is given to the detector as a
 *             hint and the path is resolved to the object key via {@code resolveKey}
 * @param data content to upload
 * @throws IOException declared by the method; closing the content stream may raise it
 */
@Override
void writeFile(final String path, final byte[] data) throws IOException {
    final ObjectMetadata s3Metadata = new ObjectMetadata();
    s3Metadata.setContentMD5(Md5Utils.md5AsBase64(data));
    s3Metadata.setContentLength(data.length);

    // The file name alone is passed to Tika as a detection hint.
    final String fileName = FilenameUtils.getName(FilenameUtils.normalize(path));
    final Metadata detectionHints = new Metadata();
    detectionHints.set(Metadata.RESOURCE_NAME_KEY, fileName);

    // The same stream is used for detection and then for the upload.
    try (InputStream content = TikaInputStream.get(data, detectionHints)) {
        final String contentType = TIKA_DETECTOR.detect(content, detectionHints).toString();
        s3Metadata.setContentType(contentType);
        s3.putObject(bucket, resolveKey(path), content, s3Metadata);
    }
}
 
Example 7
Source File: RepositoryS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Reads a repository file, computes its MD5 digest and uploads it when
 * {@code isUploadFile} says so.
 *
 * @param iter iterator over bucket object summaries, handed to {@code isUploadFile}
 * @param file id of the repository object to read
 * @param path path of the file, used as the object key and as a content-type hint
 * @return {@code true} if the file was uploaded, {@code false} if it was skipped
 * @throws IOException declared by the method; reading or closing the streams may raise it
 */
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException {
    LOG.debug("Start processing file: {}", path);

    final byte[] content;
    final byte[] newHash;
    try (DigestInputStream is = new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) {
        // Reading the whole stream feeds the digest as a side effect.
        content = IOUtils.toByteArray(is);
        newHash = is.getMessageDigest().digest();
    }

    if (!isUploadFile(iter, path, Hex.encodeHexString(newHash))) {
        LOG.info("Skipping file (same checksum): {}", path);
        return false;
    }

    LOG.info("Uploading file: {}", path);
    final ObjectMetadata s3Metadata = new ObjectMetadata();
    s3Metadata.setContentMD5(Base64.encodeAsString(newHash));
    s3Metadata.setContentLength(content.length);

    // The file name alone is passed to Tika as a detection hint.
    final String fileName = FilenameUtils.getName(FilenameUtils.normalize(path));
    final Metadata detectionHints = new Metadata();
    detectionHints.set(Metadata.RESOURCE_NAME_KEY, fileName);

    // The same stream is used for detection and then for the upload.
    try (InputStream bis = TikaInputStream.get(content, detectionHints)) {
        final String contentType = TIKA_DETECTOR.detect(bis, detectionHints).toString();
        s3Metadata.setContentType(contentType);
        s3.putObject(bucket.getName(), path, bis, s3Metadata);
    }
    return true;
}
 
Example 8
Source File: RepositoryS3.java    From github-bucket with ISC License 5 votes vote down vote up
/**
 * Processes one repository file: loads its bytes, hashes them with MD5 and pushes the
 * file to the bucket if {@code isUploadFile} returns {@code true}.
 *
 * @param iter iterator over bucket object summaries, passed through to {@code isUploadFile}
 * @param file id of the repository object to open
 * @param path file path; serves as the object key and supplies the file-name hint
 * @return {@code true} when an upload took place, {@code false} when the file was skipped
 * @throws IOException declared by the method; stream reads and closes may raise it
 */
private boolean walk(Iterator<S3ObjectSummary> iter, ObjectId file, String path) throws IOException {
    LOG.debug("Start processing file: {}", path);

    final byte[] bytes;
    final byte[] md5;
    try (DigestInputStream digestStream =
            new DigestInputStream(repository.open(file).openStream(), DigestUtils.getMd5Digest())) {
        // Draining the stream updates the attached digest.
        bytes = IOUtils.toByteArray(digestStream);
        md5 = digestStream.getMessageDigest().digest();
    }

    final boolean upload = isUploadFile(iter, path, Hex.encodeHexString(md5));
    if (!upload) {
        LOG.info("Skipping file (same checksum): {}", path);
        return false;
    }

    LOG.info("Uploading file: {}", path);
    final ObjectMetadata objectMetadata = new ObjectMetadata();
    objectMetadata.setContentMD5(Base64.encodeAsString(md5));
    objectMetadata.setContentLength(bytes.length);

    // Give Tika the bare file name as a hint for content detection.
    final Metadata hints = new Metadata();
    hints.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(FilenameUtils.normalize(path)));

    try (InputStream body = TikaInputStream.get(bytes, hints)) {
        objectMetadata.setContentType(TIKA_DETECTOR.detect(body, hints).toString());
        s3.putObject(bucket.getName(), path, body, objectMetadata);
    }
    return true;
}
 
Example 9
Source File: TikaIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Sets the input metadata for {@link Parser#parse}.
 *
 * <p>If no input metadata is configured yet, {@code metadata} is used as is. Otherwise the
 * entries of {@code metadata} are merged over the configured ones: a name present in
 * {@code metadata} replaces the existing values for that name, and all other existing
 * entries are kept. The merge is performed on a fresh {@link Metadata} instance so that
 * the object returned by {@code getInputMetadata()} on this transform is not modified, and
 * every value of a multi-valued entry is carried over rather than only the first.
 *
 * @param metadata the metadata to apply
 * @return the transform built by {@code toBuilder().setInputMetadata(...).build()}
 */
public ParseFiles withInputMetadata(Metadata metadata) {
  Metadata existing = this.getInputMetadata();
  if (existing == null) {
    return toBuilder().setInputMetadata(metadata).build();
  }

  // Start from a copy of the configured metadata; never mutate the shared instance.
  Metadata merged = new Metadata();
  for (String name : existing.names()) {
    for (String value : existing.getValues(name)) {
      merged.add(name, value);
    }
  }

  // Overlay the new entries: drop whatever was stored under the name, then add all values.
  for (String name : metadata.names()) {
    merged.remove(name);
    for (String value : metadata.getValues(name)) {
      merged.add(name, value);
    }
  }
  return toBuilder().setInputMetadata(merged).build();
}
 
Example 10
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Fills the metadata with a resource name and a content type, where available.
 *
 * @param metadata    metadata object to populate
 * @param contentType raw content type; it is passed through {@code normalizeContentType}
 *                    and added only if the normalised value is not {@code null}
 * @param uri         location of the resource; when not {@code null}, its file name is
 *                    stored as the resource name
 */
private static void fillMetadata(Metadata metadata, String contentType, String uri) {

	// Record the file name when a URI was supplied.
	if (uri != null) {
		final String fileName = new File(uri).getName();
		metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
	}

	// Normalise first; only a non-null result is recorded as the content type.
	final String normalizedType = normalizeContentType(contentType);
	if (normalizedType != null) {
		metadata.add(HttpHeaders.CONTENT_TYPE, normalizedType);
	}
}
 
Example 11
Source File: TikaIOTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Re-emits each parse result with its metadata reduced to, at most, the "Author" entry.
 *
 * @param c process context supplying the input element and receiving the output
 */
@ProcessElement
public void processElement(ProcessContext c) {
  final ParseResult parsed = c.element();
  final String location = parsed.getFileLocation();

  // Files contain many metadata properties. Everything is dropped except the "Author"
  // property that was manually added to the "apache-beam-tika.odt" resource, which
  // keeps the tests simpler.
  final Metadata filtered = new Metadata();
  if (location.endsWith("valid/apache-beam-tika.odt")) {
    filtered.set("Author", parsed.getMetadata().get("Author"));
  }

  c.output(ParseResult.success(location, parsed.getContent(), filtered));
}
 
Example 12
Source File: DirectoryScanner.java    From importer-exporter with Apache License 2.0 5 votes vote down vote up
/**
 * Detects the media type of a file using the configured Tika detector.
 *
 * @param file file to inspect; its string form is passed to the detector as the resource name
 * @return the detected media type, or {@code MediaType.EMPTY} if an {@link IOException} occurs
 */
private MediaType getMediaType(Path file) {
    try (InputStream in = TikaInputStream.get(file)) {
        final Metadata hints = new Metadata();
        hints.set(Metadata.RESOURCE_NAME_KEY, file.toString());

        final MediaType detected = tikaConfig.getDetector().detect(in, hints);
        return detected;
    } catch (IOException ignored) {
        // Detection is best effort: an unreadable file is reported as an empty media type.
        return MediaType.EMPTY;
    }
}
 
Example 13
Source File: FallbackParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Delegates to {@code ErrorParser} for non-empty input and to {@code EmptyParser} otherwise.
 *
 * <p>The size is taken from the {@code CONTENT_LENGTH} metadata entry when it is present
 * and non-empty; otherwise it is measured from the stream and written back to the metadata.
 * For empty input the content type is set to {@code application/octet-stream}.
 *
 * @param stream   the document stream
 * @param handler  handler passed on to the delegate parser
 * @param metadata document metadata; may be updated with length and content type
 * @param context  parse context passed on to the delegate parser
 */
@Override
public void parse(final InputStream stream, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context) throws SAXException, IOException, TikaException {
	final String declaredLength = metadata.get(Metadata.CONTENT_LENGTH);
	final long size;

	if (null == declaredLength || declaredLength.isEmpty()) {
		// No usable length in the metadata: measure the stream and record the result.
		// NOTE(review): closing the wrapper may also close `stream`, which is still handed to
		// the delegate parser below — confirm the delegates do not read from it.
		try (final TikaInputStream tis = TikaInputStream.get(stream)) {
			size = tis.getLength();
		}

		metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
	} else {
		size = Long.parseLong(declaredLength);
	}

	// An empty file yields an empty document; anything else goes to the error parser.
	final Parser parser;
	if (size <= 0) {
		metadata.set(Metadata.CONTENT_TYPE, "application/octet-stream");
		parser = EmptyParser.INSTANCE;
	} else {
		parser = ErrorParser.INSTANCE;
	}

	parser.parse(stream, handler, metadata, context);
}
 
Example 14
Source File: TikaFilePlace.java    From emissary with Apache License 2.0 5 votes vote down vote up
/**
 * Adds the payload's filename to the metadata as the resource name, so that it can support
 * mime type detection. Does nothing when {@code includeFilenameMimeType} is false
 * (per the original note, this is controlled by TikaFilePlace.cfg).
 *
 * @param d the IBaseDataObject payload whose filename is used
 * @param metadata the metadata handed to Tika, updated in place
 */
private void appendFilenameMimeTypeSupport(IBaseDataObject d, Metadata metadata) {
    if (!includeFilenameMimeType) {
        return;
    }

    logger.debug("Filename support for Mime Type detection is enabled");
    final String filename = d.getFilename();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
}
 
Example 15
Source File: TikaAutoMetadataExtracterTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Test MNT-15219 Excel (.xlsx) containing xmls (shapes/drawings) with multi byte characters may
 * cause OutOfMemory in Tika Note - doesn't use extractFromMimetype
 *
 * <p>The mime type is obtained from Tika's detector, the file is run through
 * {@code extractFromFile}, and the author and Tika mimetype test properties are checked.
 */
public void testParsingOfShapesInXLSXFiles() throws Exception
{
    AutoDetectParser ap = new AutoDetectParser();

    String filename = "dmsu1332-reproduced.xlsx";
    URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
    File file = new File(url.getFile());

    // Cheat and ask Tika for the mime type!
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
    MediaType mt;
    // Close the detection stream once the type is known so the file handle is not leaked.
    try (TikaInputStream tis = TikaInputStream.get(file))
    {
        mt = ap.getDetector().detect(tis, metadata);
    }
    String mimetype = mt.toString();

    if (logger.isDebugEnabled())
    {
        logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
    }

    // Have it processed
    // see MNT-15219 and REPO-3251
    Map<QName, Serializable> properties = extractFromFile(file, mimetype);

    // check we got something
    assertFalse("extractFromMimetype should return at least some properties, none found for " + mimetype + " - " + filename, 
            properties.isEmpty());

    if (properties.containsKey(ContentModel.PROP_AUTHOR))
    {
        assertEquals("Property " + ContentModel.PROP_AUTHOR + " not found for mimetype " + mimetype, 
                "Udintsev, Anton (external - Project)",
                DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(ContentModel.PROP_AUTHOR)));
    }
    else
    {
        fail("Expected one property out of " + ContentModel.PROP_CREATOR + " and " + ContentModel.PROP_AUTHOR + " but found neither of them for "
                + mimetype);
    }

    // Ensure that we can also get things which are standard
    // Tika metadata properties, if we so choose to
    assertTrue("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " not found for mimetype " + mimetype, 
            properties.containsKey(TIKA_MIMETYPE_TEST_PROPERTY));
    assertEquals("Test Property " + TIKA_MIMETYPE_TEST_PROPERTY + " incorrect for mimetype " + mimetype, 
            mimetype,
            DefaultTypeConverter.INSTANCE.convert(String.class, properties.get(TIKA_MIMETYPE_TEST_PROPERTY)));
}
 
Example 16
Source File: TikaAutoMetadataExtracterTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Test several different files
 * Note - doesn't use extractFromMimetype
 *
 * <p>For each sample file the mime type is obtained from Tika's detector, the file is run
 * through {@code extractFromFile}, and the common and file-type specific metadata checks
 * are applied to the extracted properties.
 */
public void testSupportedMimetypes() throws Exception
{
    String[] testFiles = new String[] {
          ".doc", ".docx", ".xls", ".xlsx",
          ".ppt", ".pptx", 
          //".vsd", // Our sample file lacks suitable metadata
          "2010.dwg",
          "2003.mpp", "2007.mpp",
          ".pdf",
          ".odt",
    };
       
    AutoDetectParser ap = new AutoDetectParser();
    for (String fileBase : testFiles)
    {
       String filename = "quick" + fileBase;
       URL url = AbstractContentTransformerTest.class.getClassLoader().getResource("quick/" + filename);
       File file = new File(url.getFile());
       
       // Cheat and ask Tika for the mime type!
       Metadata metadata = new Metadata();
       metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
       MediaType mt;
       // Close the detection stream on every iteration so file handles are not leaked.
       try (TikaInputStream tis = TikaInputStream.get(file))
       {
          mt = ap.getDetector().detect(tis, metadata);
       }
       String mimetype = mt.toString();
       
       if (logger.isDebugEnabled())
       {
          logger.debug("Detected mimetype " + mimetype + " for quick test file " + filename);
       }

       // Have it processed
       Map<QName, Serializable> properties = extractFromFile(file, mimetype);
       
       // check we got something
       assertFalse("extractFromMimetype should return at least some properties, " +
       		"none found for " + mimetype + " - " + filename,
          properties.isEmpty());
       
       // check common metadata
       testCommonMetadata(mimetype, properties);
       // check file-type specific metadata
       testFileSpecificMetadata(mimetype, properties);
    }
}
 
Example 17
Source File: HTMLRenderingEngine.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Asks Tika to translate the contents into HTML and stores the result through the
 * rendering context's content writer with mimetype {@code text/html}.
 *
 * <p>When the {@code PARAM_BODY_CONTENTS_ONLY} parameter is true, the XML declaration,
 * the XHTML namespace attributes on {@code p}, {@code hN}, {@code div} and {@code table}
 * tags, and {@code &#13;} entities are stripped from the output.
 *
 * @param p       the Tika parser to run
 * @param context the rendering context supplying the source content and parameters
 * @throws RenditionServiceException if parsing (or closing the content stream) fails
 */
private void generateHTML(Parser p, RenderingContext context)
{
   ContentReader contentReader = context.makeContentReader();
   
   // Setup things to parse with
   StringWriter sw = new StringWriter();
   ContentHandler handler = buildContentHandler(sw, context);
   
   // Tell Tika what we're dealing with
   Metadata metadata = new Metadata();
   metadata.set(
         Metadata.CONTENT_TYPE, 
         contentReader.getMimetype()
   );
   metadata.set(
         Metadata.RESOURCE_NAME_KEY, 
         nodeService.getProperty( 
               context.getSourceNode(),
               ContentModel.PROP_NAME
         ).toString()
   );

   // Our parse context needs to extract images
   ParseContext parseContext = new ParseContext();
   parseContext.set(Parser.class, new TikaImageExtractingParser(context));
   
   // Parse, making sure the content stream is closed afterwards
   try (java.io.InputStream contentStream = contentReader.getContentInputStream()) {
      p.parse(
            contentStream,
            handler, metadata, parseContext
      );
   } catch(Exception e) {
      throw new RenditionServiceException("Tika HTML Conversion Failed", e);
   }
   
   // As a string
   String html = sw.toString();
   
   // If we're doing body-only, remove all the html namespaces
   //  that will otherwise clutter up the document
   boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
   if(bodyOnly) {
      html = html.replaceAll("<\\?xml.*?\\?>", "");
      html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p");
      // Java replacement strings reference groups with $1; "\\1" would emit a literal
      // '1' and turn every heading opener into <h1.
      html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h$1");
      html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div");
      html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table");
      html = html.replaceAll("&#13;","");
   }
   
   // Save it
   ContentWriter contentWriter = context.makeContentWriter();
   contentWriter.setMimetype("text/html");
   contentWriter.putContent( html );
}
 
Example 18
Source File: DocumentFactory.java    From extract with MIT License 4 votes vote down vote up
/**
 * Creates a document whose metadata is pre-populated with the given content length.
 *
 * @param id   document id
 * @param path path of the document
 * @param size size stored, as a decimal string, under {@code Metadata.CONTENT_LENGTH}
 * @return a new {@code TikaDocument} built from the id, this factory's {@code identifier},
 *         the path and the metadata
 */
public TikaDocument create(final String id, final Path path, final long size) {
	final String contentLength = Long.toString(size);
	final Metadata documentMetadata = new Metadata();
	documentMetadata.set(Metadata.CONTENT_LENGTH, contentLength);

	return new TikaDocument(id, identifier, path, documentMetadata);
}
 
Example 19
Source File: TikaIOTest.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Builds the metadata expected for the ODT test resource.
 *
 * @return a new metadata object whose only entry is "Author" = "BeamTikaUser"
 */
private static Metadata getOdtMetadata() {
  final Metadata expected = new Metadata();
  expected.set("Author", "BeamTikaUser");
  return expected;
}
 
Example 20
Source File: TikaTest.java    From tika-server with Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload: creates metadata carrying the file path as the resource name
 * and delegates to {@code getXML(filePath, parser, metadata)}.
 *
 * @param filePath path of the file to parse, also used as the resource name
 * @param parser   parser to use
 * @return the result of the three-argument {@code getXML}
 * @throws Exception whatever the delegate throws
 */
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
    final Metadata resourceHints = new Metadata();
    resourceHints.set(Metadata.RESOURCE_NAME_KEY, filePath);

    final XMLResult result = getXML(filePath, parser, resourceHints);
    return result;
}