org.apache.tika.metadata.Metadata Java Examples
The following examples show how to use
org.apache.tika.metadata.Metadata.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage examples in the sidebar.
Example #1
Source File: DetectMimeTypeBuilder.java From kite with Apache License 2.0 | 7 votes |
/** * Detects the content type of the given input event. Returns * <code>application/octet-stream</code> if the type of the event can not be * detected. * <p> * It is legal for the event headers or body to be empty. The detector may * read bytes from the start of the body stream to help in type detection. * * @return detected media type, or <code>application/octet-stream</code> */ private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) { MediaType mediaType; try { mediaType = getDetector().detect(in, metadata); } catch (IOException e) { throw new MorphlineRuntimeException(e); } String mediaTypeStr = mediaType.toString(); if (excludeParameters) { int i = mediaTypeStr.indexOf(';'); if (i >= 0) { mediaTypeStr = mediaTypeStr.substring(0, i); } } return mediaTypeStr; }
Example #2
Source File: StoredFile.java From openmeetings with Apache License 2.0 | 6 votes |
private void init(String inName, String inExt, InputStream is) { if (Strings.isEmpty(inExt)) { int idx = inName.lastIndexOf('.'); name = idx < 0 ? inName : inName.substring(0, idx); ext = getFileExt(inName); } else { name = inName; ext = inExt.toLowerCase(Locale.ROOT); } Metadata md = new Metadata(); md.add(RESOURCE_NAME_KEY, String.format(FILE_NAME_FMT, name, ext)); try { mime = tika.getDetector().detect(is == null ? null : TikaInputStream.get(is), md); } catch (Throwable e) { mime = null; log.error("Unexpected exception while detecting mime type", e); } }
Example #3
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, txtParser); txtParser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
Example #4
Source File: TikaDocumentItemProcessor.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
private void extractPageCountMetadata(Document doc, Set<String> metaKeys, Metadata metadata) { if (metaKeys.contains("xmpTPg:NPages")) { doc.getAssociativeArray().put("X-TL-PAGE-COUNT", metadata.get("xmpTPg:NPages")); } else if (metaKeys.contains("Page-Count")) { doc.getAssociativeArray().put("X-TL-PAGE-COUNT", metadata.get("Page-Count")); } else if (metaKeys.contains("meta:page-count")) { doc.getAssociativeArray().put("X-TL-PAGE-COUNT", metadata.get("meta:page-count")); } else { doc.getAssociativeArray().put("X-TL-PAGE-COUNT", "TL_PAGE_COUNT_UNKNOWN"); } }
Example #5
Source File: CSVDetector.java From data-prep with Apache License 2.0 | 6 votes |
/** * A private utility class used to detect format. * * @param metadata the specified TIKA {@link Metadata} * @param inputStream the specified input stream * @return either null or an CSV format * @throws IOException */ private Format detectText(Metadata metadata, InputStream inputStream) throws IOException { MediaType mediaType = mimeTypes.detect(inputStream, metadata); if (mediaType != null) { String mediaTypeName = mediaType.toString(); if (StringUtils.startsWith(mediaTypeName, TEXT_PLAIN)) { Charset charset = null; try { charset = encodingDetector.detect(inputStream, metadata); } catch (IOException e) { LOGGER.debug("Unable to detect the encoding for a data set in CSV format", e); } if (charset != null) { return new Format(csvFormatFamily, charset.name()); } else { return new Format(csvFormatFamily, FormatUtils.DEFAULT_ENCODING); } } } return null; }
Example #6
Source File: ExtractorTest.java From extract with MIT License | 6 votes |
@Test public void testIgnoreEmbeds() throws Throwable { final Extractor extractor = new Extractor(); extractor.setEmbedHandling(Extractor.EmbedHandling.IGNORE); Assert.assertEquals(extractor.getEmbedHandling(), Extractor.EmbedHandling.IGNORE); TikaDocument tikaDocument = extractor.extract(Paths.get(getClass().getResource("/documents/ocr/embedded.pdf").getPath())); String text; try (final Reader reader = tikaDocument.getReader()) { text = Spewer.toString(reader); } Assert.assertEquals("application/pdf", tikaDocument.getMetadata().get(Metadata.CONTENT_TYPE)); Assert.assertEquals("\n\n\n\n", text); }
Example #7
Source File: TikaFormat.java From gate-core with GNU Lesser General Public License v3.0 | 6 votes |
private void setTikaFeature(Metadata metadata, Property property, FeatureMap fmap) { String value = metadata.get(property); if (value == null) { return; } value = value.trim(); if (value.length() == 0) { return; } String key = property.getName().toUpperCase(); if (fmap.containsKey(key)) { fmap.put("TIKA_" + key, value); } else { fmap.put(key, value); fmap.put("TIKA_" + key, value); } }
Example #8
Source File: ExtractorTest.java From extract with MIT License | 6 votes |
@Test public void testGarbage() throws Throwable { final Extractor extractor = new Extractor(); TikaDocument tikaDocument = extractor.extract(Paths.get(getClass().getResource("/documents/garbage.bin").getPath())); thrown.expect(IOException.class); thrown.expectMessage(""); thrown.expectCause(new CauseMatcher(TikaException.class, "Parse error")); final int read; try (final Reader reader = tikaDocument.getReader()) { read = reader.read(); } catch (IOException e) { Assert.assertEquals("application/octet-stream", tikaDocument.getMetadata().get(Metadata.CONTENT_TYPE)); throw e; } Assert.fail(String.format("Read \"%d\" while expecting exception.", read)); }
Example #9
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
public static String extractMeta(String uri, String contentType) throws Exception { final AutoDetectParser parser = createParser(); final Metadata metadata = new Metadata(); fillMetadata(parser, metadata, contentType, uri); final TikaInputStream inputStream = createInputStream(uri, metadata); parser.parse(inputStream, new DefaultHandler(), metadata); Map meta = new HashMap(); for (String name : metadata.names()) { String[] values = metadata.getValues(name); meta.put(name, values); } inputStream.close(); return new Gson().toJson(meta); }
Example #10
Source File: ContentMetadataExtracterTagMappingTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
@SuppressWarnings("unchecked") public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable { Map<String, Serializable> rawMap = super.extractRaw(reader); // Add some test keywords to those actually extracted from the file including a nodeRef List<String> keywords = new ArrayList<String>(Arrays.asList( new String[] { existingTagNodeRef, TAG_2, TAG_3, TAG_NONEXISTENT_NODEREF })); Serializable extractedKeywords = rawMap.get(Metadata.KEYWORDS); if (extractedKeywords != null && extractedKeywords instanceof String) { keywords.add((String) extractedKeywords); } else if (extractedKeywords != null && extractedKeywords instanceof Collection<?>) { keywords.addAll((Collection<? extends String>) extractedKeywords); } putRawValue(Metadata.KEYWORDS, (Serializable) keywords, rawMap); return rawMap; }
Example #11
Source File: TikaAutoMetadataExtracterTest.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
@Override public void setUp() throws Exception { super.setUp(); TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig"); extracter = new TikaAutoMetadataExtracter(config); extracter.setDictionaryService(dictionaryService); extracter.register(); // Attach some extra mappings, using the Tika // metadata keys namespace // These will be tested later HashMap<String, Set<QName>> newMap = new HashMap<String, Set<QName>>( extracter.getMapping() ); Set<QName> tlaSet = new HashSet<QName>(); tlaSet.add(TIKA_MIMETYPE_TEST_PROPERTY); newMap.put( Metadata.CONTENT_TYPE, tlaSet ); extracter.setMapping(newMap); }
Example #12
Source File: ReplayCrawl.java From ache with Apache License 2.0 | 6 votes |
private Metadata createHeadersMetadata(Page page) { Map<String, List<String>> headers = page.getResponseHeaders(); Metadata metadata = new Metadata(); for (Entry<String, List<String>> header : headers.entrySet()) { for (String value : header.getValue()) { metadata.set(header.getKey(), value); } } return metadata; }
Example #13
Source File: TikaAutoMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/** * Because some editors use JPEG_IMAGE_HEIGHT_TAG when * saving JPEG images , a more reliable source for * image size are the values provided by Tika * and not the exif/tiff metadata read from the file * This will override the tiff:Image size * which gets embedded into the alfresco node properties * for jpeg files that contain such exif information */ @Override protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties, Map<String,String> headers) { if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE))) { //check if the image has exif information if(metadata.get(EXIF_IMAGE_WIDTH_TAG) != null && metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null && metadata.get(COMPRESSION_TAG) != null) { //replace the exif size properties that will be embedded in the node with //the guessed dimensions from Tika putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(metadata.get(EXIF_IMAGE_HEIGHT_TAG)), properties); putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(metadata.get(EXIF_IMAGE_WIDTH_TAG)), properties); putRawValue(JPEG_IMAGE_HEIGHT_TAG, metadata.get(EXIF_IMAGE_HEIGHT_TAG), properties); putRawValue(JPEG_IMAGE_WIDTH_TAG, metadata.get(EXIF_IMAGE_WIDTH_TAG), properties); } } return properties; }
Example #14
Source File: TikaPoweredMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
private String getMetadataValue(Metadata metadata, String key) { if (metadata.isMultiValued(key)) { String[] parts = metadata.getValues(key); // use Set to prevent duplicates Set<String> value = new LinkedHashSet<String>(parts.length); for (int i = 0; i < parts.length; i++) { value.add(parts[i]); } String valueStr = value.toString(); // remove leading/trailing braces [] return valueStr.substring(1, valueStr.length() - 1); } else { return metadata.get(key); } }
Example #15
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
public static String detectContentType(String uri) throws FileNotFoundException, IOException, TikaException { final Detector detector = config.getDetector(); final TikaInputStream inputStream = createInputStream(uri); final Metadata metadata = new Metadata(); // Set the file name. This provides some level of type-hinting. metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); // Detect the content type. String contentType = detector.detect(inputStream, metadata).toString(); inputStream.close(); // Return the default content-type if undetermined. if (contentType == null || contentType.isEmpty()) { return MediaType.OCTET_STREAM.toString(); } return contentType; }
Example #16
Source File: Page.java From ache with Apache License 2.0 | 6 votes |
private void parseResponseHeaders(Metadata headerAsMetadata) { Map<String, List<String>> responseHeaders = new HashMap<>(); String[] names = headerAsMetadata.names(); if(names != null && names.length > 0) { for(String name : names) { List<String> values = Arrays.asList(headerAsMetadata.getValues(name)); if(values.isEmpty()) { continue; } responseHeaders.put(name, values); if("content-type".compareToIgnoreCase(name) == 0) { this.contentType = values.get(0); } } } this.responseHeaders = responseHeaders; }
Example #17
Source File: UniversalEncodingDetector.java From onedev with MIT License | 6 votes |
public static Charset detect(InputStream input) throws IOException { input.mark(LOOKAHEAD); try { UniversalEncodingListener listener = new UniversalEncodingListener(new Metadata()); byte[] b = new byte[BUFSIZE]; int n = 0; int m = input.read(b); while (m != -1 && n < LOOKAHEAD && !listener.isDone()) { n += m; listener.handleData(b, 0, m); m = input.read(b, 0, Math.min(b.length, LOOKAHEAD - n)); } return listener.dataEnd(); } catch (IOException e) { throw e; } finally { input.reset(); } }
Example #18
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/** * Generate the description * * @param metadata the metadata extracted from the file * @return the description */ @SuppressWarnings("deprecation") private String generateDescription(Metadata metadata) { StringBuilder result = new StringBuilder(); if (metadata.get(Metadata.TITLE) != null) { result.append(metadata.get(Metadata.TITLE)); if (metadata.get(XMPDM.ALBUM) != null) { result .append(" - ") .append(metadata.get(XMPDM.ALBUM)); } if (metadata.get(XMPDM.ARTIST) != null) { result .append(" (") .append(metadata.get(XMPDM.ARTIST)) .append(")"); } } return result.toString(); }
Example #19
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
/** * Generates the release date */ private Date generateReleaseDate(Metadata metadata) { String date = metadata.get(XMPDM.RELEASE_DATE); if(date == null || date.length() == 0) { return null; } // Is it just a year? if(date.matches("\\d\\d\\d\\d")) { // Just a year, we need a full date // Go for the 1st of the 1st Calendar c = Calendar.getInstance(); c.set( Integer.parseInt(date), Calendar.JANUARY, 1, 0, 0, 0 ); c.set(Calendar.MILLISECOND, 0); return c.getTime(); } // Treat as a normal date return makeDate(date); }
Example #20
Source File: TikaAudioMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 6 votes |
@Override protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties, Map<String,String> headers) { // Most things can go with the default Tika -> Alfresco Mapping // Handle the few special cases here // The description is special putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties); // The release date can be fiddly Date releaseDate = generateReleaseDate(metadata); putRawValue(KEY_CREATED, releaseDate, properties); putRawValue(XMPDM.RELEASE_DATE.getName(), releaseDate, properties); // TODO Get the Lyrics from the content //putRawValue(KEY_LYRICS, getLyrics(), properties); // All done return properties; }
Example #21
Source File: TikaLuceneContentExtractor.java From cxf with Apache License 2.0 | 5 votes |
private Document extractAll(final InputStream in, LuceneDocumentMetadata documentMetadata, boolean extractContent, boolean extractMetadata) { TikaContent content = extractor.extract(in, extractContent ? new ToTextContentHandler() : null); if (content == null) { return null; } final Document document = new Document(); if (documentMetadata == null) { documentMetadata = defaultDocumentMetadata; } if (content.getContent() != null) { document.add(getContentField(documentMetadata, content.getContent())); } if (extractMetadata) { Metadata metadata = content.getMetadata(); for (final String property: metadata.names()) { addField(document, documentMetadata, property, metadata.get(property)); } } if (!StringUtils.isEmpty(documentMetadata.getSource())) { document.add(new StringField(documentMetadata.getSourceFieldName(), documentMetadata.getSource(), Store.YES)); } return document; }
Example #22
Source File: FileParser.java From scava with Eclipse Public License 2.0 | 5 votes |
/** * * @param file * @return Null if the file is not supported * @throws Exception */ public static FileContent extractText(File file) throws Exception { FileInputStream fis = fileToInputStream(file); BufferedInputStream bif = new BufferedInputStream(fis); Metadata metadata = new Metadata(); metadata.add(Metadata.RESOURCE_NAME_KEY, file.getName()); FileContent fileContent = extractText(bif, metadata); bif.close(); fis.close(); return fileContent; }
Example #23
Source File: OpenDocumentMetadataExtracter.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@SuppressWarnings("deprecation") @Override protected Map<String, Serializable> extractSpecific(Metadata metadata, Map<String, Serializable> properties, Map<String, String> headers) { putRawValue(KEY_CREATION_DATE, getDateOrNull(metadata.get(Metadata.CREATION_DATE)), properties); putRawValue(KEY_CREATOR, metadata.get(Metadata.CREATOR), properties); putRawValue(KEY_DATE, getDateOrNull(metadata.get(Metadata.DATE)), properties); putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.DESCRIPTION), properties); putRawValue(KEY_GENERATOR, metadata.get("generator"), properties); putRawValue(KEY_INITIAL_CREATOR, metadata.get("initial-creator"), properties); putRawValue(KEY_KEYWORD, metadata.get(Metadata.KEYWORDS), properties); putRawValue(KEY_LANGUAGE, metadata.get(Metadata.LANGUAGE), properties); // putRawValue(KEY_PRINT_DATE, getDateOrNull(metadata.get(Metadata.)), rawProperties); // putRawValue(KEY_PRINTED_BY, metadata.get(Metadata.), rawProperties); // Handle user-defined properties dynamically Map<String, Set<QName>> mapping = super.getMapping(); for (String key : mapping.keySet()) { if (metadata.get(CUSTOM_PREFIX + key) != null) { putRawValue(key, metadata.get(CUSTOM_PREFIX + key), properties); } } return properties; }
Example #24
Source File: ParsingReader.java From extract with MIT License | 5 votes |
/** * Creates a reader for the content of the given binary stream * with the given document metadata. The given parser is used for the * parsing task that is run with the given executor. * * The created reader will be responsible for closing the given stream. * The stream and any associated resources will be closed at or before * the time when the {@link #close()} method is called on this reader. * * @param parser parser instance * @param input binary stream * @param metadata document metadata * @param context parsing context * @throws IOException if the document can not be parsed */ public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext context, final Function<Writer, ContentHandler> handler) throws IOException { final PipedReader pipedReader = new PipedReader(); this.parser = parser; reader = new BufferedReader(pipedReader); try { writer = new PipedWriter(pipedReader); } catch (IOException e) { throw new IllegalStateException(e); // Should never happen. } this.input = input; this.metadata = metadata; this.context = context; // Generate the handler. this.handler = handler.apply(writer); parse(); // TIKA-203: Buffer first character to force metadata extraction. reader.mark(1); //noinspection ResultOfMethodCallIgnored reader.read(); reader.reset(); }
Example #25
Source File: EmbedSpawner.java From extract with MIT License | 5 votes |
private void writeEmbed(final TikaInputStream tis, final EmbeddedTikaDocument embed, final String name) throws IOException { final Path destination = outputPath.resolve(embed.getHash()); final Path source; final Metadata metadata = embed.getMetadata(); final Object container = tis.getOpenContainer(); // If the input is a container, write it to a temporary file so that it can then be copied atomically. // This happens with, for example, an Outlook Message that is an attachment of another Outlook Message. if (container instanceof DirectoryEntry) { try (final TemporaryResources tmp = new TemporaryResources(); final POIFSFileSystem fs = new POIFSFileSystem()) { source = tmp.createTempFile(); saveEntries((DirectoryEntry) container, fs.getRoot()); try (final OutputStream output = Files.newOutputStream(source)) { fs.writeFilesystem(output); } } } else { source = tis.getPath(); } // Set the content-length as it isn't (always?) set by Tika for embeds. if (null == metadata.get(Metadata.CONTENT_LENGTH)) { metadata.set(Metadata.CONTENT_LENGTH, Long.toString(Files.size(source))); } // To prevent massive duplication and because the disk is only a storage for underlying data, save using the // straight hash as a filename. try { Files.copy(source, destination); } catch (final FileAlreadyExistsException e) { if (Files.size(source) != Files.size(destination)) { Files.copy(source, destination, StandardCopyOption.REPLACE_EXISTING); } else { logger.info("Temporary file for document \"{}\" in \"{}\" already exists.", name, root); } } }
Example #26
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata, final ParseContext context, TesseractOCRConfig config, final boolean inline) throws IOException, SAXException, TikaException { try (final TikaInputStream tis = TikaInputStream.get(in)) { cachedParse(tis, handler, metadata, context, config, inline); } catch (final InterruptedException e) { throw new TikaException("Interrupted.", e); } }
Example #27
Source File: ParseResultTest.java From beam with Apache License 2.0 | 5 votes |
static Metadata getMetadata() { Metadata m = new Metadata(); m.add("Author", "BeamTikaUser"); m.add("Author", "BeamTikaUser2"); m.add("Date", "2017-09-01"); return m; }
Example #28
Source File: MediaTypeDisablingDocumentSelector.java From alfresco-repository with GNU Lesser General Public License v3.0 | 5 votes |
@Override public boolean select(Metadata metadata) { String contentType = metadata.get(Metadata.CONTENT_TYPE); if (contentType == null || contentType.equals("") || disabledMediaTypes == null) { return true; } return !disabledMediaTypes.contains(contentType); }
Example #29
Source File: AttachAttribute.java From entando-components with GNU Lesser General Public License v3.0 | 5 votes |
@Override public String getIndexeableFieldValue() { StringBuilder buffer = new StringBuilder(); if (null != super.getIndexeableFieldValue()) { buffer.append(super.getIndexeableFieldValue()); } String extraValue = null; ResourceInterface resource = this.getResource(); if (resource != null) { InputStream is = ((AttachResource) resource).getResourceStream(); if (null != is) { AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(-1); Metadata metadata = new Metadata(); try { parser.parse(is, handler, metadata); extraValue = handler.toString(); } catch (Throwable t) { _logger.error("Error while processing the parsing", t); } finally { try { is.close(); } catch (IOException ex) { _logger.error("Error closing stream", ex); } } } } if (null != extraValue) { buffer.append(" ").append(extraValue); } return buffer.toString(); }
Example #30
Source File: NodeTika.java From node-tika with MIT License | 5 votes |
private static void fillMetadata(Metadata metadata, String contentType, String uri) { // Set the file name. if (uri != null) { metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(uri).getName()); } // Normalise the content-type. contentType = normalizeContentType(contentType); // Set the content-type. if (contentType != null) { metadata.add(HttpHeaders.CONTENT_TYPE, contentType); } }