Java Code Examples for org.apache.tika.metadata.Metadata#names()

The following examples show how to use org.apache.tika.metadata.Metadata#names(). They are taken from open-source projects; the source file, originating project, and license are noted above each example.
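Before turning to the project examples, here is a minimal standalone sketch of the method itself: names() returns each distinct property name stored in a Metadata object, and getValues() returns all values for a name, which matters for multi-valued properties. The class name and property values below are illustrative.

import org.apache.tika.metadata.Metadata;

public class MetadataNamesDemo {
    public static void main(String[] args) {
        Metadata metadata = new Metadata();
        metadata.set("title", "Example document");   // single-valued property
        metadata.add("keywords", "tika");             // multi-valued property
        metadata.add("keywords", "metadata");

        // names() returns each distinct property name exactly once
        for (String name : metadata.names()) {
            // getValues() returns every value stored under the name
            System.out.println(name + " = " + String.join(", ", metadata.getValues(name)));
        }
    }
}
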
Example 1
Source File: TikaExtractor.java    From ache with Apache License 2.0
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler textHandler = new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    ParseContext context = new ParseContext();
    try {
        parser.parse(stream, textHandler, metadata, context);
        
        Map<String, String> metadataMap = new HashMap<String, String>();
        for (String propertyName : metadata.names()) {
            metadataMap.put(propertyName, metadata.get(propertyName));
        }
        
        return new ParsedData(handler.toString(), metadataMap);
        
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
 
Example 2
Source File: NodeTika.java    From node-tika with MIT License
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	parser.parse(inputStream, new DefaultHandler(), metadata);

	Map<String, String[]> meta = new HashMap<>();
	for (String name : metadata.names()) {
		String[] values = metadata.getValues(name);
		meta.put(name, values);
	}

	inputStream.close();

	return new Gson().toJson(meta);
}
 
Example 3
Source File: TikaTest.java    From tika-server with Apache License 2.0
/**
 * Test that at least one item in metadataList contains all of the keys and
 * values in minExpected.
 * <p>
 * Each value in minExpected is tested for whether it is contained
 * within a value in the target.  If minExpected=&quot;text/vbasic&quot; and
 * what was actually found in the target within metadataList is
 * &quot;text/vbasic; charset=windows-1252&quot;,
 * that is counted as a hit.
 *
 * @param minExpected metadata whose keys and values must all be matched
 * @param metadataList list of metadata items to search
 */
public static void assertContainsAtLeast(Metadata minExpected, List<Metadata> metadataList) {

    for (Metadata m : metadataList) {
        int foundPropertyCount = 0;
        for (String n : minExpected.names()) {
            int foundValCount = 0;
            for (String foundVal : m.getValues(n)) {
                for (String expectedVal : minExpected.getValues(n)) {
                    if (foundVal.contains(expectedVal)) {
                        foundValCount++;
                    }
                }
            }
            if (foundValCount == minExpected.getValues(n).length) {
                foundPropertyCount++;
            }
        }
        if (foundPropertyCount == minExpected.names().length) {
            //found everything!
            return;
        }
    }
    //TODO: figure out how to have more informative error message
    fail("Couldn't find everything within a single metadata item");
}
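
As the javadoc notes, values are matched by containment. A hedged usage sketch of the assertion above; the property name and values are illustrative, and java.util.Collections is assumed to be imported:

Metadata minExpected = new Metadata();
minExpected.set("Content-Type", "text/vbasic");

Metadata found = new Metadata();
found.set("Content-Type", "text/vbasic; charset=windows-1252");

// Passes: the expected value is contained within the value actually found.
assertContainsAtLeast(minExpected, Collections.singletonList(found));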
 
Example 4
Source File: TikaContentExtractor.java    From baleen with Apache License 2.0
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    AutoDetectParser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, metadata, context);

    jCas.setDocumentText(textHandler.toString());

    for (String name : metadata.names()) {
      addMetadata(jCas, name, metadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}
 
Example 5
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0
/**
 * Extracts the plain text and metadata of a document with Tika.
 *
 * @param Bytes input stream containing the document content
 * @return the extracted full text of the document
 * @throws PDException if the content cannot be extracted
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);
        FileMetadata = "";
        for (String key : metadata.names()) {
            FileMetadata += key + "=" + metadata.get(key) + "\n";
        }
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
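
A hedged usage sketch of Convert(); it assumes the call is made from within the FTConnector class hierarchy (the method is protected), the file path is illustrative, and the checked exceptions are handled by the caller:

String fullText = Convert(new FileInputStream("/tmp/contract.pdf"));
// After the call, FileMetadata holds one "key=value" line per entry in metadata.names().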
 
Example 6
Source File: TikaIO.java    From beam with Apache License 2.0
/** Sets the input metadata for {@link Parser#parse}. */
public ParseFiles withInputMetadata(Metadata metadata) {
  Metadata inputMetadata = this.getInputMetadata();
  if (inputMetadata != null) {
    for (String name : metadata.names()) {
      inputMetadata.set(name, metadata.get(name));
    }
  } else {
    inputMetadata = metadata;
  }
  return toBuilder().setInputMetadata(inputMetadata).build();
}
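
A hedged usage sketch of the builder above, following the pattern in Beam's TikaIO documentation; the content-type hint and the upstream PCollection of readable files are assumptions:

Metadata inputMetadata = new Metadata();
inputMetadata.set("Content-Type", "application/pdf");   // hint forwarded to Parser#parse

// "files" is an assumed PCollection<FileIO.ReadableFile>, e.g. produced by FileIO.readMatches()
PCollection<ParseResult> results =
    files.apply(TikaIO.parseFiles().withInputMetadata(inputMetadata));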
 
Example 7
Source File: TikaProcessor.java    From jesterj with Apache License 2.0
@Override
public Document[] processDocument(Document document) {
  try {
    byte[] rawData = document.getRawData();
    if (rawData == null) {
      log.debug("Skipping document without data in " + getName());
      return new Document[]{document};
    }
    Tika tika = new Tika(tikaConfig);
    tika.setMaxStringLength(document.getRawData().length);
    Metadata metadata = new Metadata();
    try (ByteArrayInputStream bais = new ByteArrayInputStream(rawData)) {
      String textContent = tika.parseToString(bais, metadata, maxLength);
      if (replaceRaw) {
        document.setRawData(textContent.getBytes(StandardCharsets.UTF_8));
      }
      if (destField != null) {
        document.put(destField, textContent);
      }
      for (String name : metadata.names()) {
        document.put(sanitize(name) + plusSuffix(), metadata.get(name));
      }
    } catch (IOException | TikaException e) {
      log.debug("Tika processing failure!", e);
      // if tika can't parse it we certainly don't want random binary crap in the index
      document.setStatus(Status.ERROR);
    }
  } catch (Throwable t) {
    boolean isAccessControl = t instanceof AccessControlException;
    boolean isSecurity = t instanceof SecurityException;
    if (!isAccessControl && !isSecurity) {
      throw t;
    }
  }
  return new Document[]{document};
}
 
Example 8
Source File: TikaContentExtractor.java    From cxf with Apache License 2.0
/**
 * Extract the metadata only from the input stream. Depending on media type validation,
 * the detector could be run against input stream in order to ensure that parser supports this
 * type of content.
 * @param in input stream to extract the metadata from
 * @return the extracted metadata converted to SearchBean or null if extraction is not possible
 *         or was unsuccessful
 */
public SearchBean extractMetadataToSearchBean(final InputStream in) {
    TikaContent tc = extractMetadata(in);
    if (tc == null) {
        return null;
    }
    Metadata metadata = tc.getMetadata();
    SearchBean bean = new SearchBean();
    for (final String property: metadata.names()) {
        bean.set(property, metadata.get(property));
    }
    return bean;
}
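
A hedged usage sketch of the method above; it assumes an already configured TikaContentExtractor named extractor, that java.nio.file.Files and Paths are imported, and that the enclosing method declares throws IOException. The file path is illustrative:

try (InputStream in = Files.newInputStream(Paths.get("document.pdf"))) {
    SearchBean bean = extractor.extractMetadataToSearchBean(in);
    if (bean == null) {
        // extraction was not possible or was unsuccessful
    }
}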
 
Example 9
Source File: TearlineContentExtractor.java    From baleen with Apache License 2.0
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    AutoDetectParser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, metadata, context);

    String fullContent = textHandler.toString();
    Matcher m = tearlinePattern.matcher(fullContent);
    if (m.find()) {
      jCas.setDocumentText(removeBoilerplate(fullContent.substring(0, m.start())).trim());
    } else {
      jCas.setDocumentText(removeBoilerplate(fullContent).trim());
    }

    for (String name : metadata.names()) {
      addMetadata(jCas, name, metadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}
 
Example 10
Source File: TikaLuceneContentExtractor.java    From cxf with Apache License 2.0
private Document extractAll(final InputStream in,
                            LuceneDocumentMetadata documentMetadata,
                            boolean extractContent,
                            boolean extractMetadata) {

    TikaContent content =
        extractor.extract(in, extractContent ? new ToTextContentHandler() : null);

    if (content == null) {
        return null;
    }
    final Document document = new Document();

    if (documentMetadata == null) {
        documentMetadata = defaultDocumentMetadata;
    }
    if (content.getContent() != null) {
        document.add(getContentField(documentMetadata, content.getContent()));
    }

    if (extractMetadata) {
        Metadata metadata = content.getMetadata();
        for (final String property: metadata.names()) {
            addField(document, documentMetadata, property, metadata.get(property));
        }
    }

    if (!StringUtils.isEmpty(documentMetadata.getSource())) {
        document.add(new StringField(documentMetadata.getSourceFieldName(),
            documentMetadata.getSource(), Store.YES));
    }

    return document;

}
 
Example 11
Source File: ParseResult.java    From beam with Apache License 2.0
private ParseResult(String fileLocation, String content, Metadata metadata, Throwable error) {
  checkArgument(fileLocation != null, "fileLocation can not be null");
  checkArgument(content != null, "content can not be null");
  checkArgument(metadata != null, "metadata can not be null");
  this.fileLocation = fileLocation;
  this.content = content;
  this.metadata = metadata;
  this.metadataNames = metadata.names();
  this.error = (error == null) ? null : new SerializableThrowable(error);
}
 
Example 12
Source File: ElasticsearchSpewerTest.java    From datashare with GNU Affero General Public License v3.0
private Map<String, Object> convert(Metadata metadata) {
    Map<String, Object> map = new HashMap<>();
    for (String name: metadata.names()) {
        map.put(name, metadata.get(name));
    }
    return map;
}
 
Example 13
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0
private static Map<String, String> makeMap(Metadata metadata) {
    Map<String, String> result = new HashMap<String, String>();

    for (String key : metadata.names()) {
        result.put(key, metadata.get(key));
    }

    return result;
}
 
Example 14
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0
@Override
public JSONObject toJson(String filePath) throws OperationException {

    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        parser.parse(stream, handler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = handler.toString();
    if(fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    JSONObject jsonObject = new JSONObject();
    jsonObject.put("_txt", fileText);

    String[] metadataNames = metadata.names();
    for(String name : metadataNames) {
        jsonObject.put(name, metadata.get(name));
    }

    return jsonObject;
}
 
Example 15
Source File: TikaTest.java    From tika-server with Apache License 2.0
public static void debug(Metadata metadata) {
    for (String n : metadata.names()) {
        for (String v : metadata.getValues(n)) {
            System.out.println(n + " : "+v);
        }
    }
}
 
Example 16
Source File: TikaTest.java    From tika-server with Apache License 2.0
public static void debug(List<Metadata> list) {
    int i = 0;
    for (Metadata m : list) {
        for (String n : m.names()) {
            for (String v : m.getValues(n)) {
                System.out.println(i + ": "+n + " : "+v);
            }
        }
        i++;
    }
}
 
Example 17
Source File: ExtractMediaMetadata.java    From nifi with Apache License 2.0
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs,
                                       Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream);
    try {
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    } finally {
        tikaInputStream.close();
    }

    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();
    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                if (dataBuilder.length() > 1) {
                    dataBuilder.append(", ");
                }
                if (dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }
        if (prefix == null) {
            results.put(key, dataBuilder.toString().trim());
        } else {
            results.put(prefix + key, dataBuilder.toString().trim());
        }

        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}
 
Example 18
Source File: MP3Reader.java    From red5-io with Apache License 2.0
/**
 * Creates reader from file input stream
 * 
 * @param file
 *            file input
 * @throws IOException
 *             on IO error
 */
public MP3Reader(File file) throws IOException {
    this.file = file;
    fis = new FileInputStream(file);
    try {
        // parse the ID3 info
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // MP3 parser
        Mp3Parser parser = new Mp3Parser();
        parser.parse(fis, handler, metadata, null);
        log.debug("Contents of the document: {}", handler.toString());
        // create meta data holder
        metaData = new MetaData();
        String val = null;
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            val = metadata.get(name);
            log.debug("Meta name: {} value: {}", name, val);
            if ("xmpDM:artist".equals(name)) {
                metaData.setArtist(val);
            } else if ("xmpDM:album".equals(name)) {
                metaData.setAlbum(val);
            } else if ("title".equals(name)) {
                metaData.setSongName(val);
            } else if ("xmpDM:genre".equals(name)) {
                metaData.setGenre(val);
            } else if ("xmpDM:logComment".equals(name)) {
                metaData.setComment(val);
            } else if ("xmpDM:trackNumber".equals(name)) {
                metaData.setTrack(val);
            } else if ("xmpDM:releaseDate".equals(name)) {
                metaData.setYear(val);
            } else if ("xmpDM:duration".equals(name) || "duration".equals(name)) {
                metaData.setDuration(val);
            } else if ("xmpDM:audioSampleRate".equals(name) || "samplerate".equals(name)) {
                metaData.setSampleRate(val);
            } else if ("channels".equals(name)) {
                metaData.setChannels(val);
            }
        }
        /*
         * //send album image if included List<Artwork> tagFieldList = idTag.getArtworkList(); if (tagFieldList == null || tagFieldList.isEmpty()) { log.debug("No cover art was found"); }
         * else { Artwork imageField = tagFieldList.get(0); log.debug("Picture type: {}", imageField.getPictureType()); FrameBodyAPIC imageFrameBody = new FrameBodyAPIC();
         * imageFrameBody.setImageData(imageField.getBinaryData()); if (!imageFrameBody.isImageUrl()) { byte[] imageBuffer = (byte[])
         * imageFrameBody.getObjectValue(DataTypes.OBJ_PICTURE_DATA); //set the cover image on the metadata metaData.setCovr(imageBuffer); // Create tag for onImageData event IoBuffer buf
         * = IoBuffer.allocate(imageBuffer.length); buf.setAutoExpand(true); Output out = new Output(buf); out.writeString("onImageData"); Map<Object, Object> props = new HashMap<Object,
         * Object>(); props.put("trackid", 1); props.put("data", imageBuffer); out.writeMap(props); buf.flip(); //Ugh i hate flash sometimes!! //Error #2095: flash.net.NetStream was unable
         * to invoke callback onImageData. ITag result = new Tag(IoConstants.TYPE_METADATA, 0, buf.limit(), null, 0); result.setBody(buf); //add to first frames firstTags.add(result); } }
         * } else { log.info("File did not contain ID3v2 data: {}", file.getName()); }
         */
    } catch (Exception e) {
        log.error("MP3Reader {}", e);
    }
    // ensure we have a valid sample rate
    checkValidHeader();
    // get the total bytes / file size
    fileSize = file.length();
    log.debug("File size: {}", fileSize);
    // analyze keyframes data
    analyzeKeyFrames();
    // create file metadata object
    firstTags.addFirst(createFileMeta());
    log.trace("File input stream - open: {} position: {}", fis.getChannel().isOpen(), fis.getChannel().position());
    // create a channel for reading
    fileChannel = fis.getChannel();
}
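
A hedged usage sketch of the constructor documented above; the file path is illustrative:

// Parses the ID3 metadata and prepares the reader for streaming.
MP3Reader reader = new MP3Reader(new File("/tmp/track.mp3"));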
 
Example 19
Source File: S3River.java    From es-amazon-s3-river with Apache License 2.0
/** Index an Amazon S3 file by retrieving its content and building the suitable Json content. */
private String indexFile(S3ObjectSummary summary){
   if (logger.isDebugEnabled()){
      logger.debug("Trying to index '{}'", summary.getKey());
   }
   
   try{
      // Build a unique id from S3 unique summary key.
      String fileId = buildIndexIdFromS3Key(summary.getKey());

      if (feedDefinition.isJsonSupport()){
         esIndex(indexName, typeName, summary.getKey(), s3.getContent(summary));
      } else {
         byte[] fileContent = s3.getContent(summary);

         if (fileContent != null) {
            // Compute number of chars to index.
            // see https://github.com/lbroudoux/es-amazon-s3-river/issues/36
            int indexedChars = 100000;
            if (feedDefinition.getIndexedCharsRatio() > 0) {
               indexedChars = (int) Math.round(fileContent.length * feedDefinition.getIndexedCharsRatio());
            }

            // Parse content using Tika directly.
            Metadata fileMetadata = new Metadata();
            String parsedContent = TikaHolder.tika().parseToString(
                  new BytesStreamInput(fileContent), fileMetadata, indexedChars);

            // Store Tika metadatas into a map.
            Map<String, Object> fileMetadataMap = new HashMap<String, Object>();
            for (String key : fileMetadata.names()) {
               fileMetadataMap.put(key, fileMetadata.get(key));
            }

            esIndex(indexName, typeName, fileId,
                  jsonBuilder()
                        .startObject()
                           .field(S3RiverUtil.DOC_FIELD_TITLE, summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                           .field(S3RiverUtil.DOC_FIELD_MODIFIED_DATE, summary.getLastModified().getTime())
                           .field(S3RiverUtil.DOC_FIELD_SOURCE_URL, s3.getDownloadUrl(summary, feedDefinition))
                           .field(S3RiverUtil.DOC_FIELD_METADATA, s3.getS3UserMetadata(summary.getKey()))
                           .startObject("file")
                              .field("_name", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                              .field("title", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                              .field("file", parsedContent)
                              .field("metadata", fileMetadataMap)
                           .endObject()
                        .endObject()
            );
            return fileId;
         }
      }
   } catch (Exception e) {
      logger.warn("Can not index " + summary.getKey() + " : " + e.getMessage());
   }
   return null;
}
 
Example 20
Source File: DocUtils.java    From geoportal-server-harvester with Apache License 2.0
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {
	
	// Input & Output Variables
	ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
	byte[]               xml_bytes  = null;
	
	// Tika Parser Objects
    Parser               parser     = new AutoDetectParser();
    BodyContentHandler   handler    = new BodyContentHandler();
    Metadata             metadata   = new Metadata();
    ParseContext         context    = new ParseContext();
	  
    try {
    	// Populate Metadata Object with Tika Parser
    	parser.parse(base_input, handler, metadata, context);
    	
    	// Container & Writer for Metadata
    	Properties   meta_props = new Properties();
    	StringWriter sw         = new StringWriter();
    	
    	// Put Tika Metadata in Properties
    	for(String name : metadata.names()) {
    		if (!metadata.get(name).isEmpty()) {
    			meta_props.put(name, metadata.get(name));
    		}
    	}
    	meta_props.store(sw, "Tika Values");

    	// Expected Harvester Properties
    	String     meta_descr  = metadata.get(TikaCoreProperties.DESCRIPTION);
    	String     meta_modif  = metadata.get(TikaCoreProperties.MODIFIED);
    	String     meta_title  = metadata.get(TikaCoreProperties.TITLE);
    	
    	// Default Label for Undefined Tika Properties
    	DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
    	Date       date        = new Date();
    	String     date_today  = date_format.format(date);
    	String     tika_label  = String.format("TIKA_%s", date_today);
    	
    	// Check For Null Values & Set Defaults
    	if (meta_descr == null) {
    		meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString());
    	} else {
    		meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr);
    	}
    	
    	if (meta_modif == null) {
    		meta_props.put(WKAConstants.WKA_MODIFIED, tika_label);
    	} else {
    		meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif);
    	}
    	
    	if (meta_title == null) {
    		meta_props.put(WKAConstants.WKA_TITLE, file_name);
    	} else {
    		meta_props.put(WKAConstants.WKA_TITLE, meta_title);
    	}
 	
    	// Build XML as Bytes
    	MapAttribute attr = AttributeUtils.fromProperties(meta_props);
		Document document = new SimpleDcMetaBuilder().create(attr);
		xml_bytes = XmlUtils.toString(document).getBytes("UTF-8");
    		
    } catch (Exception ex) {
      LOG.error(String.format("Error reading data."), ex);
    } finally {
    	base_input.close();
    }
	
	return xml_bytes;
	
}
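
A hedged usage sketch of the helper above; the file path is illustrative and the java.nio.file imports are assumed:

byte[] fileBytes = Files.readAllBytes(Paths.get("report.pdf"));
byte[] dcXml = DocUtils.generateMetadataXML(fileBytes, "report.pdf");
if (dcXml != null) {
    Files.write(Paths.get("report-metadata.xml"), dcXml);
}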