org.apache.tika.metadata.Metadata Java Examples

The following examples show how to use org.apache.tika.metadata.Metadata. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DetectMimeTypeBuilder.java    From kite with Apache License 2.0 7 votes vote down vote up
/**
 * Detects the content type of the given input event. Returns
 * <code>application/octet-stream</code> if the type of the event cannot be
 * detected.
 * <p>
 * It is legal for the event headers or body to be empty. The detector may
 * read bytes from the start of the body stream to help in type detection.
 *
 * @return detected media type, or <code>application/octet-stream</code>
 */
private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
  final MediaType detected;
  try {
    detected = getDetector().detect(in, metadata);
  } catch (IOException e) {
    // Detection failures surface as runtime errors to the morphline pipeline.
    throw new MorphlineRuntimeException(e);
  }
  String result = detected.toString();
  if (excludeParameters) {
    // Keep only the "type/subtype" part, dropping any "; charset=..." parameters.
    int paramsStart = result.indexOf(';');
    if (paramsStart != -1) {
      result = result.substring(0, paramsStart);
    }
  }
  return result;
}
 
Example #2
Source File: StoredFile.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Initialises the stored name/extension from the given file name and detects
 * the MIME type of the supplied stream (best-effort).
 *
 * @param inName original file name, possibly with an extension
 * @param inExt explicit extension; when empty it is derived from inName
 * @param is content stream used for detection; may be null
 */
private void init(String inName, String inExt, InputStream is) {
	if (Strings.isEmpty(inExt)) {
		// No explicit extension given: split it off the file name (if any).
		int dot = inName.lastIndexOf('.');
		name = dot < 0 ? inName : inName.substring(0, dot);
		ext = getFileExt(inName);
	} else {
		name = inName;
		ext = inExt.toLowerCase(Locale.ROOT);
	}
	Metadata md = new Metadata();
	// Provide the reconstructed file name as a type-detection hint.
	md.add(RESOURCE_NAME_KEY, String.format(FILE_NAME_FMT, name, ext));
	try {
		TikaInputStream tis = is == null ? null : TikaInputStream.get(is);
		mime = tika.getDetector().detect(tis, md);
	} catch (Throwable e) {
		// Detection is best-effort; a failure leaves mime unset instead of propagating.
		mime = null;
		log.error("Unexpected exception while detecting mime type", e);
	}
}
 
Example #3
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Parses plain-text content into a string, capped at {@code maxStringLength}
 * characters.
 *
 * @param stream input to parse; always closed before returning
 * @param metadata Tika metadata handed to the parser
 * @return the extracted text (possibly truncated at the write limit)
 * @throws IOException if reading fails
 * @throws TikaException on an unexpected SAX failure
 */
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler textHandler = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext parseContext = new ParseContext();
		parseContext.set(Parser.class, txtParser);
		txtParser.parse(stream, new BodyContentHandler(textHandler), metadata, parseContext);
	} catch (SAXException e) {
		// Hitting the write limit is expected and benign; anything else is a
		// genuine processing failure.
		if (!textHandler.isWriteLimitReached(e)) {
			throw new TikaException("Unexpected SAX processing failure", e);
		}
	} finally {
		stream.close();
	}
	return textHandler.toString();
}
 
Example #4
Source File: TikaDocumentItemProcessor.java    From CogStack-Pipeline with Apache License 2.0 6 votes vote down vote up
/**
 * Copies the page count reported by Tika into the document's associative array
 * under the "X-TL-PAGE-COUNT" key.
 *
 * Different parsers report the page count under different metadata keys, so
 * the known keys are probed in order of preference; when none is present a
 * sentinel value is stored instead. (Replaces a copy-pasted if/else chain.)
 *
 * @param doc document whose associative array receives the page count
 * @param metaKeys the set of metadata keys present in {@code metadata}
 * @param metadata the Tika metadata extracted from the document
 */
private void extractPageCountMetadata(Document doc, Set<String> metaKeys,
                                      Metadata metadata) {
    // Known page-count keys, in order of preference.
    final String[] pageCountKeys = {"xmpTPg:NPages", "Page-Count", "meta:page-count"};
    String pageCount = "TL_PAGE_COUNT_UNKNOWN";
    for (String key : pageCountKeys) {
        if (metaKeys.contains(key)) {
            pageCount = metadata.get(key);
            break;
        }
    }
    doc.getAssociativeArray().put("X-TL-PAGE-COUNT", pageCount);
}
 
Example #5
Source File: CSVDetector.java    From data-prep with Apache License 2.0 6 votes vote down vote up
/**
 * A private utility used to decide whether the content is plain text and, if
 * so, to build a CSV format for it.
 *
 * @param metadata the specified TIKA {@link Metadata}
 * @param inputStream the specified input stream
 * @return a CSV {@link Format} when the content is text/plain, otherwise null
 * @throws IOException if media-type detection fails
 */
private Format detectText(Metadata metadata, InputStream inputStream) throws IOException {
    MediaType mediaType = mimeTypes.detect(inputStream, metadata);
    if (mediaType == null) {
        return null;
    }
    if (!StringUtils.startsWith(mediaType.toString(), TEXT_PLAIN)) {
        return null;
    }

    // Try to detect the character set; fall back to the default encoding when
    // detection is impossible.
    Charset charset = null;
    try {
        charset = encodingDetector.detect(inputStream, metadata);
    } catch (IOException e) {
        LOGGER.debug("Unable to detect the encoding for a data set in CSV format", e);
    }
    String encoding = charset != null ? charset.name() : FormatUtils.DEFAULT_ENCODING;
    return new Format(csvFormatFamily, encoding);
}
 
Example #6
Source File: ExtractorTest.java    From extract with MIT License 6 votes vote down vote up
// Verifies that embedded documents are skipped entirely when the extractor is
// configured with EmbedHandling.IGNORE.
@Test
public void testIgnoreEmbeds() throws Throwable {
	final Extractor extractor = new Extractor();

	// Switch embed handling to IGNORE and confirm the setter took effect.
	extractor.setEmbedHandling(Extractor.EmbedHandling.IGNORE);
	Assert.assertEquals(extractor.getEmbedHandling(), Extractor.EmbedHandling.IGNORE);

	// A PDF fixture (from the ocr test documents) that contains embedded content.
	TikaDocument tikaDocument = extractor.extract(Paths.get(getClass().getResource("/documents/ocr/embedded.pdf").getPath()));

	String text;

	try (final Reader reader = tikaDocument.getReader()) {
		text = Spewer.toString(reader);
	}

	// Only the container PDF's own (empty) text should remain; nothing from the embeds.
	Assert.assertEquals("application/pdf", tikaDocument.getMetadata().get(Metadata.CONTENT_TYPE));
	Assert.assertEquals("\n\n\n\n", text);
}
 
Example #7
Source File: TikaFormat.java    From gate-core with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Copies a single Tika metadata property into the feature map. The key is the
 * uppercased property name; a "TIKA_"-prefixed copy is always stored, while
 * the plain key is only used when it is not already taken by an existing
 * feature. Null or blank values are ignored.
 */
private void setTikaFeature(Metadata metadata, Property property, FeatureMap fmap) {
  String value = metadata.get(property);
  if (value == null) {
    return;
  }

  value = value.trim();
  if (value.isEmpty()) {
    return;
  }

  // NOTE(review): toUpperCase() is default-locale sensitive (e.g. Turkish
  // dotless i); confirm whether Locale.ROOT is wanted here.
  String key = property.getName().toUpperCase();
  // Never clobber an existing feature: the un-prefixed key is only used when free.
  if (!fmap.containsKey(key)) {
    fmap.put(key, value);
  }
  fmap.put("TIKA_" + key, value);
}
 
Example #8
Source File: ExtractorTest.java    From extract with MIT License 6 votes vote down vote up
// Verifies that an unparseable (garbage) file surfaces an IOException caused by
// a Tika "Parse error" from the reader, while the content type is still
// detected as application/octet-stream.
@Test
public void testGarbage() throws Throwable {
	final Extractor extractor = new Extractor();

	TikaDocument tikaDocument = extractor.extract(Paths.get(getClass().getResource("/documents/garbage.bin").getPath()));


	// Expected failure: an IOException whose cause is a TikaException("Parse error").
	thrown.expect(IOException.class);
	thrown.expectMessage("");
	thrown.expectCause(new CauseMatcher(TikaException.class, "Parse error"));

	final int read;

	try (final Reader reader = tikaDocument.getReader()) {
		read = reader.read();
	} catch (IOException e) {
		// The content type must have been resolved before the parse failed;
		// rethrow so the ExpectedException rule can match it.
		Assert.assertEquals("application/octet-stream", tikaDocument.getMetadata().get(Metadata.CONTENT_TYPE));
		throw e;
	}

	Assert.fail(String.format("Read \"%d\" while expecting exception.", read));
}
 
Example #9
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Extracts document metadata from the given URI and returns it serialised as
 * JSON.
 *
 * @param uri document location
 * @param contentType optional content-type hint for the parser
 * @return a JSON object mapping each metadata name to its array of values
 * @throws Exception if the stream cannot be opened or parsing fails
 */
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	// try-with-resources: the original leaked the stream when parse() threw.
	try (final TikaInputStream inputStream = createInputStream(uri, metadata)) {
		parser.parse(inputStream, new DefaultHandler(), metadata);
	}

	// Parameterised types instead of the original raw Map/HashMap.
	final Map<String, String[]> meta = new HashMap<>();
	for (String name : metadata.names()) {
		meta.put(name, metadata.getValues(name));
	}

	return new Gson().toJson(meta);
}
 
Example #10
Source File: ContentMetadataExtracterTagMappingTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Extends the extracted raw metadata with fixed test keywords (including a
 * nodeRef) plus whatever keywords the file itself yielded.
 *
 * @param reader source of the content to extract from
 * @return the raw metadata map with the merged KEYWORDS entry
 */
@SuppressWarnings("unchecked")
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
    Map<String, Serializable> rawMap = super.extractRaw(reader);

    // Add some test keywords to those actually extracted from the file including a nodeRef
    List<String> keywords = new ArrayList<>(Arrays.asList(
            existingTagNodeRef, TAG_2, TAG_3, TAG_NONEXISTENT_NODEREF));
    Serializable extractedKeywords = rawMap.get(Metadata.KEYWORDS);
    // instanceof is already null-safe, so the explicit null checks were redundant.
    if (extractedKeywords instanceof String)
    {
        keywords.add((String) extractedKeywords);
    }
    else if (extractedKeywords instanceof Collection<?>)
    {
        keywords.addAll((Collection<? extends String>) extractedKeywords);
    }
    putRawValue(Metadata.KEYWORDS, (Serializable) keywords, rawMap);
    return rawMap;
}
 
Example #11
Source File: TikaAutoMetadataExtracterTest.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
@Override
public void setUp() throws Exception
{
    super.setUp();

    // Build the extracter from the Spring-configured Tika setup.
    TikaConfig config = (TikaConfig) ctx.getBean("tikaConfig");
    extracter = new TikaAutoMetadataExtracter(config);
    extracter.setDictionaryService(dictionaryService);
    extracter.register();

    // Attach an extra mapping from the Tika content-type key to a test
    // property, using the Tika metadata-keys namespace; asserted on later.
    HashMap<String, Set<QName>> extendedMapping = new HashMap<>(extracter.getMapping());

    Set<QName> testProperties = new HashSet<>();
    testProperties.add(TIKA_MIMETYPE_TEST_PROPERTY);
    extendedMapping.put(Metadata.CONTENT_TYPE, testProperties);

    extracter.setMapping(extendedMapping);
}
 
Example #12
Source File: ReplayCrawl.java    From ache with Apache License 2.0 6 votes vote down vote up
/**
 * Converts the page's HTTP response headers into a Tika {@link Metadata}
 * object.
 *
 * Note: Metadata.set replaces any previous value, so a multi-valued header
 * collapses to its last value — presumably intentional here; verify against
 * callers if multi-value headers matter.
 *
 * @param page page whose response headers are converted
 * @return metadata holding one entry per header name
 */
private Metadata createHeadersMetadata(Page page) {
    Metadata metadata = new Metadata();
    for (Entry<String, List<String>> header : page.getResponseHeaders().entrySet()) {
        for (String value : header.getValue()) {
            metadata.set(header.getKey(), value);
        }
    }
    return metadata;
}
 
Example #13
Source File: TikaAutoMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Because some editors use JPEG_IMAGE_HEIGHT_TAG when saving JPEG images, a
 * more reliable source for image size are the values provided by Tika and not
 * the exif/tiff metadata read from the file. This will override the tiff:Image
 * size which gets embedded into the alfresco node properties for jpeg files
 * that contain such exif information.
 */
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
    // Only JPEGs carrying a full set of exif size/compression tags are rewritten.
    boolean isJpeg = MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE));
    boolean hasExifSize = metadata.get(EXIF_IMAGE_WIDTH_TAG) != null
            && metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null
            && metadata.get(COMPRESSION_TAG) != null;

    if (isJpeg && hasExifSize)
    {
        // Replace the exif size properties that will be embedded in the node
        // with the dimensions Tika itself determined.
        putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(metadata.get(EXIF_IMAGE_HEIGHT_TAG)), properties);
        putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(metadata.get(EXIF_IMAGE_WIDTH_TAG)), properties);
        putRawValue(JPEG_IMAGE_HEIGHT_TAG, metadata.get(EXIF_IMAGE_HEIGHT_TAG), properties);
        putRawValue(JPEG_IMAGE_WIDTH_TAG, metadata.get(EXIF_IMAGE_WIDTH_TAG), properties);
    }
    return properties;
}
 
Example #14
Source File: TikaPoweredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Returns the value of the given metadata key. Multi-valued entries are
 * de-duplicated (first occurrence wins) and joined with ", ".
 *
 * @param metadata the extracted Tika metadata
 * @param key the metadata key to read
 * @return the single value, the joined multi-value string, or null if absent
 */
private String getMetadataValue(Metadata metadata, String key)
{
    if (!metadata.isMultiValued(key))
    {
        return metadata.get(key);
    }

    // Use a LinkedHashSet to drop duplicates while keeping original order.
    Set<String> values = new LinkedHashSet<String>();
    for (String part : metadata.getValues(key))
    {
        values.add(part);
    }

    // String.join produces the same "a, b, c" output the old code derived by
    // stripping the brackets off AbstractCollection#toString — without relying
    // on that fragile representation.
    return String.join(", ", values);
}
 
Example #15
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Detects the content type of the resource at the given URI.
 *
 * @param uri location of the resource to sniff
 * @return the detected MIME type, or "application/octet-stream" when undetermined
 * @throws IOException if the resource cannot be read
 * @throws TikaException if detection fails
 */
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException {
	final Detector detector = config.getDetector();
	final Metadata metadata = new Metadata();

	// Set the file name. This provides some level of type-hinting.
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());

	// Detect the content type. try-with-resources closes the stream even when
	// detection throws (the original leaked it in that case).
	final String contentType;
	try (final TikaInputStream inputStream = createInputStream(uri)) {
		contentType = detector.detect(inputStream, metadata).toString();
	}

	// Return the default content-type if undetermined.
	if (contentType == null || contentType.isEmpty()) {
		return MediaType.OCTET_STREAM.toString();
	}

	return contentType;
}
 
Example #16
Source File: Page.java    From ache with Apache License 2.0 6 votes vote down vote up
/**
 * Copies HTTP response headers out of the given Tika metadata into this page,
 * remembering the first Content-Type value when present.
 */
private void parseResponseHeaders(Metadata headerAsMetadata) {
    Map<String, List<String>> parsedHeaders = new HashMap<>();
    String[] headerNames = headerAsMetadata.names();
    if (headerNames != null) {
        for (String headerName : headerNames) {
            List<String> headerValues = Arrays.asList(headerAsMetadata.getValues(headerName));
            if (headerValues.isEmpty()) {
                continue;
            }
            parsedHeaders.put(headerName, headerValues);
            if ("content-type".equalsIgnoreCase(headerName)) {
                this.contentType = headerValues.get(0);
            }
        }
    }
    this.responseHeaders = parsedHeaders;
}
 
Example #17
Source File: UniversalEncodingDetector.java    From onedev with MIT License 6 votes vote down vote up
/**
 * Statistically detects the charset of the given stream by feeding up to
 * LOOKAHEAD bytes into a UniversalEncodingListener.
 *
 * The stream must support mark/reset; its position is restored before
 * returning, whether or not reading succeeds.
 *
 * @param input markable stream positioned at the content to sniff
 * @return the charset the listener reports at data end
 * @throws IOException if reading the stream fails
 */
public static Charset detect(InputStream input) throws IOException {
    input.mark(LOOKAHEAD);
    try {
        UniversalEncodingListener listener =
                new UniversalEncodingListener(new Metadata());

        byte[] buffer = new byte[BUFSIZE];
        int total = 0;
        int read = input.read(buffer);
        // Keep feeding data until EOF, the lookahead budget is spent, or the
        // listener decides early.
        while (read != -1 && total < LOOKAHEAD && !listener.isDone()) {
            total += read;
            listener.handleData(buffer, 0, read);
            read = input.read(buffer, 0, Math.min(buffer.length, LOOKAHEAD - total));
        }

        return listener.dataEnd();
    } finally {
        // The original's "catch (IOException e) { throw e; }" was a no-op and
        // has been removed; the reset still always runs.
        input.reset();
    }
}
Example #18
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Generates the description in the form "Title - Album (Artist)", omitting any
 * missing parts.
 *
 * @param metadata     the metadata extracted from the file
 * @return          the description, or an empty string when there is no title
 */
@SuppressWarnings("deprecation")
private String generateDescription(Metadata metadata)
{
    String title = metadata.get(Metadata.TITLE);
    if (title == null)
    {
        // No title means no description at all, matching the original contract.
        return "";
    }

    StringBuilder description = new StringBuilder(title);

    String album = metadata.get(XMPDM.ALBUM);
    if (album != null)
    {
        description.append(" - ").append(album);
    }

    String artist = metadata.get(XMPDM.ARTIST);
    if (artist != null)
    {
        description.append(" (").append(artist).append(")");
    }

    return description.toString();
}
 
Example #19
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Generates the release date.
 *
 * A bare four-digit year is expanded to midnight on January 1st of that year;
 * anything else is handed to makeDate for normal parsing.
 */
private Date generateReleaseDate(Metadata metadata)
{
   String date = metadata.get(XMPDM.RELEASE_DATE);
   if (date == null || date.isEmpty())
   {
      return null;
   }

   // Is it just a year? If so, we need a full date: go for the 1st of the 1st.
   if (date.matches("\\d{4}"))
   {
      Calendar calendar = Calendar.getInstance();
      calendar.set(Integer.parseInt(date), Calendar.JANUARY, 1, 0, 0, 0);
      calendar.set(Calendar.MILLISECOND, 0);
      return calendar.getTime();
   }

   // Treat as a normal date
   return makeDate(date);
}
 
Example #20
Source File: TikaAudioMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Maps the audio-specific metadata (description, release date) onto the raw
 * properties; everything else is left to the default Tika -> Alfresco mapping.
 */
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
   // Most things can go with the default Tika -> Alfresco Mapping
   // Handle the few special cases here
   
   // The description is special (assembled from title/album/artist)
   putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
   
   // The release date can be fiddly (may be only a year); it is normalised
   // once and stored under both the Alfresco and the XMPDM key.
   Date releaseDate = generateReleaseDate(metadata);
   putRawValue(KEY_CREATED, releaseDate, properties);
   putRawValue(XMPDM.RELEASE_DATE.getName(), releaseDate, properties);
   
   // TODO Get the Lyrics from the content
   //putRawValue(KEY_LYRICS, getLyrics(), properties);
   
   // All done
   return properties;
}
 
Example #21
Source File: TikaLuceneContentExtractor.java    From cxf with Apache License 2.0 5 votes vote down vote up
/**
 * Runs Tika extraction over the stream and assembles a Lucene document from
 * the content and/or metadata, depending on the flags.
 *
 * @param in the stream to extract from
 * @param documentMetadata per-request metadata config; falls back to the default
 * @param extractContent whether to capture the body text as a content field
 * @param extractMetadata whether to map Tika metadata entries to fields
 * @return the populated Lucene document, or null when extraction yields nothing
 */
private Document extractAll(final InputStream in,
                            LuceneDocumentMetadata documentMetadata,
                            boolean extractContent,
                            boolean extractMetadata) {

    // Only build a text handler when body content is actually wanted.
    final TikaContent content =
        extractor.extract(in, extractContent ? new ToTextContentHandler() : null);
    if (content == null) {
        return null;
    }

    if (documentMetadata == null) {
        documentMetadata = defaultDocumentMetadata;
    }

    final Document document = new Document();
    if (content.getContent() != null) {
        document.add(getContentField(documentMetadata, content.getContent()));
    }

    if (extractMetadata) {
        final Metadata metadata = content.getMetadata();
        for (final String property : metadata.names()) {
            addField(document, documentMetadata, property, metadata.get(property));
        }
    }

    // Record the source as a stored field when one is configured.
    if (!StringUtils.isEmpty(documentMetadata.getSource())) {
        document.add(new StringField(documentMetadata.getSourceFieldName(),
            documentMetadata.getSource(), Store.YES));
    }

    return document;
}
 
Example #22
Source File: FileParser.java    From scava with Eclipse Public License 2.0 5 votes vote down vote up
/**
 * Extracts the text of the given file.
 *
 * @param file the file to read
 * @return null if the file is not supported
 * @throws Exception if opening, reading or parsing the file fails
 */
public static FileContent extractText(File file) throws Exception
{
	Metadata metadata = new Metadata();
	// The file name provides a type-detection hint.
	metadata.add(Metadata.RESOURCE_NAME_KEY, file.getName());
	// try-with-resources: the original leaked both streams when the inner
	// extractText call threw before the close() lines were reached.
	try (FileInputStream fis = fileToInputStream(file);
	     BufferedInputStream bif = new BufferedInputStream(fis))
	{
		return extractText(bif, metadata);
	}
}
 
Example #23
Source File: OpenDocumentMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Maps OpenDocument metadata onto Alfresco raw properties, including any
 * user-defined (custom-prefixed) properties declared in the mapping.
 */
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String, String> headers)
{
    putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties);
    putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties);
    putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties);
    putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties);
    putRawValue(KEY_GENERATOR, metadata.get("generator"), properties);
    putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties);
    putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties);
    putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties);
//  putRawValue(KEY_PRINT_DATE, getDateOrNull(metadata.get(Metadata.)), rawProperties);
//  putRawValue(KEY_PRINTED_BY, metadata.get(Metadata.), rawProperties);

    // Handle user-defined properties dynamically: any mapped key that appears
    // with the custom prefix in the Tika metadata is copied across.
    for (String key : super.getMapping().keySet())
    {
        String customValue = metadata.get(CUSTOM_PREFIX + key);
        if (customValue != null)
        {
            putRawValue(key, customValue, properties);
        }
    }

    return properties;
}
 
Example #24
Source File: ParsingReader.java    From extract with MIT License 5 votes vote down vote up
/**
 * Creates a reader for the content of the given binary stream
 * with the given document metadata. The given parser is used for the
 * parsing task that is run with the given executor.
 *
 * The created reader will be responsible for closing the given stream.
 * The stream and any associated resources will be closed at or before
 * the time when the {@link #close()} method is called on this reader.
 *
 * @param parser parser instance
 * @param input binary stream
 * @param metadata document metadata
 * @param context parsing context
 * @param handler supplies the content handler, given the writer end of the pipe
 * @throws IOException if the document can not be parsed
 */
public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext
		context, final Function<Writer, ContentHandler> handler) throws IOException {
	// The parser writes into the piped writer; this reader consumes the other
	// end of the pipe.
	final PipedReader pipedReader = new PipedReader();

	this.parser = parser;
	reader = new BufferedReader(pipedReader);

	try {
		writer = new PipedWriter(pipedReader);
	} catch (IOException e) {
		throw new IllegalStateException(e); // Should never happen.
	}

	this.input = input;
	this.metadata = metadata;
	this.context = context;

	// Generate the handler.
	this.handler = handler.apply(writer);

	// Start parsing; extracted text becomes available through the pipe.
	parse();
	
	// TIKA-203: Buffer first character to force metadata extraction.
	reader.mark(1);

	//noinspection ResultOfMethodCallIgnored
	reader.read();
	reader.reset();
}
 
Example #25
Source File: EmbedSpawner.java    From extract with MIT License 5 votes vote down vote up
/**
 * Writes an embedded document to {@code outputPath}, named by its hash so that
 * identical embeds are stored only once.
 *
 * @param tis the embed's stream, possibly backed by an open POIFS container
 * @param embed the embedded document being written
 * @param name display name of the embed, used for logging only
 * @throws IOException if the embed cannot be written
 */
private void writeEmbed(final TikaInputStream tis, final EmbeddedTikaDocument embed, final String name) throws IOException {
	final Path destination = outputPath.resolve(embed.getHash());
	final Path source;

	final Metadata metadata = embed.getMetadata();
	final Object container = tis.getOpenContainer();

	// If the input is a container, write it to a temporary file so that it can then be copied atomically.
	// This happens with, for example, an Outlook Message that is an attachment of another Outlook Message.
	if (container instanceof DirectoryEntry) {
		try (final TemporaryResources tmp = new TemporaryResources();
		     final POIFSFileSystem fs = new POIFSFileSystem()) {
			source = tmp.createTempFile();
			saveEntries((DirectoryEntry) container, fs.getRoot());

			try (final OutputStream output = Files.newOutputStream(source)) {
				fs.writeFilesystem(output);
			}
		}
	} else {
		source = tis.getPath();
	}

	// Set the content-length as it isn't (always?) set by Tika for embeds.
	if (null == metadata.get(Metadata.CONTENT_LENGTH)) {
		metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(source)));
	}

	// To prevent massive duplication and because the disk is only a storage for underlying data, save using the
	// straight hash as a filename.
	try {
		Files.copy(source, destination);
	} catch (final FileAlreadyExistsException e) {
		// Hash-named file already present: same size is taken to mean same
		// content, so the copy is skipped; otherwise overwrite.
		if (Files.size(source) != Files.size(destination)) {
			Files.copy(source, destination, StandardCopyOption.REPLACE_EXISTING);
		} else {
			logger.info("Temporary file for document \"{}\" in \"{}\" already exists.", name, root);
		}
	}
}
 
Example #26
Source File: CachingTesseractOCRParser.java    From extract with MIT License 5 votes vote down vote up
/**
 * Convenience overload: wraps the raw stream in a TikaInputStream and
 * delegates to the TikaInputStream-based cachedParse, converting interruption
 * into a TikaException.
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
		throws IOException, SAXException, TikaException {
	try (final TikaInputStream tis = TikaInputStream.get(in)) {
		cachedParse(tis, handler, metadata, context, config, inline);
	} catch (final InterruptedException e) {
		// Restore the interrupt flag so callers up the stack can still observe
		// it (the original swallowed the thread's interrupted status).
		Thread.currentThread().interrupt();
		throw new TikaException("Interrupted.", e);
	}
}
 
Example #27
Source File: ParseResultTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/** Builds the fixture metadata used by the parse-result tests. */
static Metadata getMetadata() {
  Metadata metadata = new Metadata();
  // "Author" is deliberately multi-valued to exercise getValues handling.
  metadata.add("Author", "BeamTikaUser");
  metadata.add("Author", "BeamTikaUser2");
  metadata.add("Date", "2017-09-01");
  return metadata;
}
 
Example #28
Source File: MediaTypeDisablingDocumentSelector.java    From alfresco-repository with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Decides whether a (sub-)document should be processed.
 *
 * @param metadata the document's metadata, read for its content type
 * @return true when the document should be processed (unknown/blank type, or
 *         no disabled set configured); false when its media type is disabled
 */
@Override
public boolean select(Metadata metadata)
{
    String contentType = metadata.get(Metadata.CONTENT_TYPE);
    // Unknown types are never filtered; use isEmpty() rather than equals("").
    if (contentType == null || contentType.isEmpty() || disabledMediaTypes == null)
    {
        return true;
    }
    return !disabledMediaTypes.contains(contentType);
}
 
Example #29
Source File: AttachAttribute.java    From entando-components with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Returns the indexable text for this attribute: the parent's value plus the
 * text extracted from the attached resource via Tika, when available.
 */
@Override
public String getIndexeableFieldValue() {
	StringBuilder buffer = new StringBuilder();
	if (null != super.getIndexeableFieldValue()) {
		buffer.append(super.getIndexeableFieldValue());
	}
	String extraValue = null;
	ResourceInterface resource = this.getResource();
	if (resource != null) {
		InputStream is = ((AttachResource) resource).getResourceStream();
		if (null != is) {
			AutoDetectParser parser = new AutoDetectParser();
			// -1 disables the handler's write limit so the whole body is kept.
			BodyContentHandler handler = new BodyContentHandler(-1);
			Metadata metadata = new Metadata();
			try {
				parser.parse(is, handler, metadata);
				extraValue = handler.toString();
			} catch (Throwable t) {
				// Extraction is best-effort: indexing proceeds without the
				// attachment text rather than failing the whole operation.
				_logger.error("Error while processing the parsing", t);
			} finally {
				try {
					is.close();
				} catch (IOException ex) {
					_logger.error("Error closing stream", ex);
				}
			}
		}
	}
	if (null != extraValue) {
		buffer.append(" ").append(extraValue);
	}
	return buffer.toString();
}
 
Example #30
Source File: NodeTika.java    From node-tika with MIT License 5 votes vote down vote up
/**
 * Populates the metadata with a resource name derived from the URI and a
 * normalised content type, when either is available.
 */
private static void fillMetadata(Metadata metadata, String contentType, String uri) {

		// Set the file name; it provides some level of type-hinting.
		if (uri != null) {
			metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName());
		}

		// Normalise, then record the content-type when one was supplied.
		final String normalized = normalizeContentType(contentType);
		if (normalized != null) {
			metadata.add(HttpHeaders.CONTENT_TYPE, normalized);
		}
	}