Java Code Examples for org.apache.tika.metadata.Metadata

The following are top-voted examples showing how to use org.apache.tika.metadata.Metadata. These examples are extracted from open-source projects. You can vote up the examples you find useful; your votes help our system surface more helpful examples.
Example 1
Project: scipio-erp   File: TikaUtil.java   Source Code and License 10 votes vote down vote up
/**
 * Finds the media type (through the Apache Tika library), based on the file
 * name hint and the magic numbers read from the stream.
 *
 * @param is       stream to sniff; wrapped in a BufferedInputStream which is
 *                 closed before returning
 * @param fileName file name used as a detection hint
 * @return the detected media type
 * @throws IOException if reading or closing the stream fails
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    // try-with-resources replaces the manual try/finally whose empty catch
    // silently swallowed close() failures.
    try (BufferedInputStream bis = new BufferedInputStream(is)) {
        AutoDetectParser parser = new AutoDetectParser();
        Detector detector = parser.getDetector();
        Metadata md = new Metadata();
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);
        return detector.detect(bis, md);
    }
}
 
Example 2
Project: Equella   File: MsWordExtracter.java   Source Code and License 7 votes vote down vote up
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	try
	{
		// Let Tika auto-detect the format and extract the plain-text body.
		ContentHandler handler = new BodyContentHandler();
		Metadata meta = new Metadata();
		new AutoDetectParser(new TikaConfig(getClass().getClassLoader()))
			.parse(input, handler, meta, new ParseContext());

		// Clip the extracted text to the caller-supplied limit.
		String text = handler.toString();
		String clipped = text.length() > maxSize ? text.substring(0, maxSize) : text;
		outputText.append(clipped);

		if( LOGGER.isDebugEnabled() )
		{
			LOGGER.debug("Word Summary:" + clipped); //$NON-NLS-1$
		}
	}
	catch( Exception e )
	{
		throw new RuntimeException(e);
	}
}
 
Example 3
Project: extract   File: ExtractorTest.java   Source Code and License 7 votes vote down vote up
@Test
public void testDisableOcrOnEmbed() throws Throwable {
	// Verify that disabling OCR on the extractor also applies to documents
	// embedded inside a container (here: a PDF under /documents/ocr/).
	final Extractor extractor = new Extractor();
	extractor.disableOcr();

	final Document document = factory.create(getClass().getResource("/documents/ocr/embedded.pdf"));

	String text;

	// The reader is closed via try-with-resources once drained.
	try (final Reader reader = extractor.extract(document)) {
		text = Spewer.toString(reader);
	}

	Assert.assertEquals("application/pdf", document.getMetadata().get(Metadata.CONTENT_TYPE));
	// With OCR disabled, the embedded content yields only whitespace.
	Assert.assertEquals("\n\n\n\n", text);
}
 
Example 4
Project: spring-boot-word-to-html-example   File: WordToHtmlConverter.java   Source Code and License 7 votes vote down vote up
/**
 * Converts a .docx document into HTML markup. This code
 * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer.
 *
 * @param wordDocument  The converted .docx document.
 * @return a DTO holding the original file name and the generated HTML markup
 * @throws DocumentConversionException if parsing or transformation fails
 */
public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) {
    LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename());
    // try-with-resources: the original leaked the multipart InputStream.
    try (InputStream input = wordDocument.getInputStream()) {
        Parser parser = new OOXMLParser();

        StringWriter sw = new StringWriter();
        SAXTransformerFactory factory = (SAXTransformerFactory)
                SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        // Emit indented HTML encoded as UTF-8.
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8");
        parser.parse(input, handler, metadata, new ParseContext());
        return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString());
    }
    catch (IOException | SAXException | TransformerException | TikaException ex) {
        LOGGER.error("Conversion failed because an exception was thrown", ex);
        throw new DocumentConversionException(ex.getMessage(), ex);
    }
}
 
Example 5
Project: elasticsearch_my   File: TikaImpl.java   Source Code and License 6 votes vote down vote up
/**
 * parses with tika, throwing any exception hit while parsing the document
 */
// only package private for testing!
static String parse(final byte content[], final Metadata metadata, final int limit) throws TikaException, IOException {
    // check that its not unprivileged code like a script
    SpecialPermission.check();

    try {
        // Run Tika inside the restricted AccessControlContext so document
        // parsing cannot use the JVM's ambient permissions.
        return AccessController.doPrivileged((PrivilegedExceptionAction<String>)
            () -> TIKA_INSTANCE.parseToString(new ByteArrayInputStream(content), metadata, limit), RESTRICTED_CONTEXT);
    } catch (PrivilegedActionException e) {
        // checked exception from tika: unbox it
        Throwable cause = e.getCause();
        if (cause instanceof TikaException) {
            throw (TikaException) cause;
        } else if (cause instanceof IOException) {
            throw (IOException) cause;
        } else {
            // the privileged action can only throw the two checked types
            // above, so anything else indicates a programming error
            throw new AssertionError(cause);
        }
    }
}
 
Example 6
Project: alfresco-repository   File: ArchiveContentTransformer.java   Source Code and License 6 votes vote down vote up
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  // Start from the default context built by the superclass.
  ParseContext context = super.buildParseContext(metadata, targetMimeType, options);

  // Transformation options, when set, override the extractor-level default.
  Boolean embedded = options.getIncludeEmbedded();
  boolean recurse = (embedded != null) ? embedded : includeContents;

  if(recurse)
  {
     // Use an auto detect parser to handle the contents
     if(tikaConfig == null)
     {
         tikaConfig = TikaConfig.getDefaultConfig();
     }
     context.set(Parser.class, new AutoDetectParser(tikaConfig));
  }

  return context;
}
 
Example 7
Project: alfresco-repository   File: TikaAudioMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Adds the audio-specific mappings on top of the default Tika -> Alfresco
 * mapping applied by the superclass.
 *
 * @param metadata   the metadata extracted by Tika
 * @param properties the Alfresco property map being built up
 * @param headers    additional headers (not used here)
 * @return the enriched property map
 */
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
   // Most things can go with the default Tika -> Alfresco Mapping
   // Handle the few special cases here
   
   // The description is special: synthesized via generateDescription
   putRawValue(KEY_DESCRIPTION, generateDescription(metadata), properties);
   
   // The release date can be fiddly (may be just a bare year)
   Date releaseDate = generateReleaseDate(metadata);
   putRawValue(KEY_CREATED, releaseDate, properties);
   putRawValue(XMPDM.RELEASE_DATE.getName(), releaseDate, properties);
   
   // TODO Get the Lyrics from the content
   //putRawValue(KEY_LYRICS, getLyrics(), properties);
   
   // All done
   return properties;
}
 
Example 8
Project: alfresco-repository   File: TikaAudioMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Generates the release date from the XMPDM release-date metadata value.
 * A bare four-digit year is expanded to midnight on January 1st of that
 * year; anything else is parsed as a full date string.
 */
private Date generateReleaseDate(Metadata metadata)
{
   String raw = metadata.get(XMPDM.RELEASE_DATE);
   if (raw == null || raw.isEmpty())
   {
      return null;
   }

   // Is it just a year?
   if (raw.matches("\\d{4}"))
   {
      // Expand to a full date: 1st of January, midnight.
      Calendar cal = Calendar.getInstance();
      cal.set(Integer.parseInt(raw), Calendar.JANUARY, 1, 0, 0, 0);
      cal.set(Calendar.MILLISECOND, 0);
      return cal.getTime();
   }

   // Treat as a normal date
   return makeDate(raw);
}
 
Example 9
Project: alfresco-repository   File: TikaAudioMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Generate the description in the form "title - album (artist)", where the
 * album and artist parts are included only when present. Returns the empty
 * string when there is no title.
 * 
 * @param metadata     the metadata extracted from the file
 * @return          the description
 */
@SuppressWarnings("deprecation")
private String generateDescription(Metadata metadata)
{
    String title = metadata.get(Metadata.TITLE);
    if (title == null)
    {
        return "";
    }

    StringBuilder description = new StringBuilder(title);

    String album = metadata.get(XMPDM.ALBUM);
    if (album != null)
    {
        description.append(" - ").append(album);
    }

    String artist = metadata.get(XMPDM.ARTIST);
    if (artist != null)
    {
        description.append(" (").append(artist).append(")");
    }

    return description.toString();
}
 
Example 10
Project: alfresco-repository   File: OfficeMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Maps the Office document dates and statistics from the Tika metadata onto
 * the Alfresco property keys.
 */
@SuppressWarnings("deprecation")
    @Override
    protected Map<String, Serializable> extractSpecific(Metadata metadata,
         Map<String, Serializable> properties, Map<String,String> headers) 
    {
       putRawValue(KEY_CREATE_DATETIME, metadata.get(Metadata.CREATION_DATE), properties); 
       putRawValue(KEY_LAST_SAVE_DATETIME, metadata.get(Metadata.LAST_SAVED), properties);
       putRawValue(KEY_EDIT_TIME, metadata.get(Metadata.EDIT_TIME), properties);
       putRawValue(KEY_FORMAT, metadata.get(Metadata.FORMAT), properties);
       putRawValue(KEY_KEYWORDS, metadata.get(Metadata.KEYWORDS), properties);
       putRawValue(KEY_LAST_AUTHOR, metadata.get(Metadata.LAST_AUTHOR), properties);
       putRawValue(KEY_LAST_PRINTED, metadata.get(Metadata.LAST_PRINTED), properties);
       // OS version and thumbnail mappings are intentionally disabled:
//       putRawValue(KEY_OS_VERSION, metadata.get(Metadata.OS_VERSION), properties);
//       putRawValue(KEY_THUMBNAIL, metadata.get(Metadata.THUMBNAIL), properties);
       putRawValue(KEY_PAGE_COUNT, metadata.get(Metadata.PAGE_COUNT), properties);
       putRawValue(KEY_PARAGRAPH_COUNT, metadata.get(Metadata.PARAGRAPH_COUNT), properties);
       putRawValue(KEY_WORD_COUNT, metadata.get(Metadata.WORD_COUNT), properties);
       return properties;
    }
 
Example 11
Project: alfresco-repository   File: MailMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Maps email-specific Tika metadata (originator, subject, recipients) onto
 * the Alfresco mail-model property keys.
 */
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
   putRawValue(KEY_ORIGINATOR, metadata.get(Metadata.AUTHOR), properties);
   putRawValue(KEY_SUBJECT, metadata.get(Metadata.TITLE), properties);
   putRawValue(KEY_DESCRIPTION, metadata.get(Metadata.SUBJECT), properties);
   putRawValue(KEY_SENT_DATE, metadata.get(Metadata.LAST_SAVED), properties);
   
   // Store the TO, but not cc/bcc in the addressee field
   putRawValue(KEY_ADDRESSEE, metadata.get(Metadata.MESSAGE_TO), properties); 
   
   // Store each of To, CC and BCC in their own fields
   putRawValue(KEY_TO_NAMES, metadata.getValues(Metadata.MESSAGE_TO), properties);
   putRawValue(KEY_CC_NAMES, metadata.getValues(Metadata.MESSAGE_CC), properties);
   putRawValue(KEY_BCC_NAMES, metadata.getValues(Metadata.MESSAGE_BCC), properties);
   
   // But store all email addresses (to/cc/bcc) in the addresses field
   putRawValue(KEY_ADDRESSEES, metadata.getValues(Metadata.MESSAGE_RECIPIENT_ADDRESS), properties); 
   
   return properties;
}
 
Example 12
Project: alfresco-repository   File: MP3MetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Maps MP3 metadata: first applies the standard audio mappings from the
 * superclass, then re-adds the legacy MP3-specific property keys kept for
 * backwards compatibility with pre-audio-model mapping configurations.
 */
@SuppressWarnings("deprecation")
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
   // Do the normal Audio mappings
   super.extractSpecific(metadata, properties, headers);
   
   // Now do the compatibility ones
   // We only need these for people who had pre-existing mapping
   //  properties from before the proper audio model was added
   putRawValue(KEY_ALBUM_TITLE, metadata.get(XMPDM.ALBUM), properties);
   putRawValue(KEY_SONG_TITLE, metadata.get(Metadata.TITLE), properties);
   putRawValue(KEY_ARTIST, metadata.get(XMPDM.ARTIST), properties);
   putRawValue(KEY_COMMENT, metadata.get(XMPDM.LOG_COMMENT), properties);
   putRawValue(KEY_TRACK_NUMBER, metadata.get(XMPDM.TRACK_NUMBER), properties);
   putRawValue(KEY_GENRE, metadata.get(XMPDM.GENRE), properties);
   putRawValue(KEY_YEAR_RELEASED, metadata.get(XMPDM.RELEASE_DATE), properties);
   putRawValue(KEY_COMPOSER, metadata.get(XMPDM.COMPOSER), properties);

   // All done
   return properties;
}
 
Example 13
Project: alfresco-repository   File: TikaPoweredMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Collects all values recorded for {@code key}, splitting each raw value on
 * {@code metadataSeparator}, trimming the parts and de-duplicating them
 * while keeping first-seen order.
 *
 * @param metadata the Tika metadata to read from
 * @param key      the metadata key to look up
 * @return null when no values exist, the single value when there is exactly
 *         one, otherwise a String[] of the distinct values
 */
private Serializable getMetadataValues(Metadata metadata, String key)
{
    // Use Set to prevent duplicates (LinkedHashSet keeps insertion order).
    Set<String> valuesSet = new LinkedHashSet<String>();

    for (String rawValue : metadata.getValues(key))
    {
        for (String part : rawValue.split(metadataSeparator))
        {
            valuesSet.add(part.trim());
        }
    }

    // toArray(new String[0]) replaces the Object[] + Arrays.copyOf round trip.
    String[] values = valuesSet.toArray(new String[0]);

    return values.length == 0 ? null : (values.length == 1 ? values[0] : values);
}
 
Example 14
Project: alfresco-repository   File: TikaPoweredMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Returns the value for {@code key}. Multi-valued keys are de-duplicated
 * (first-seen order kept) and joined with ", ", matching the output of the
 * previous LinkedHashSet.toString()/substring brace-stripping approach.
 *
 * @param metadata the Tika metadata to read from
 * @param key      the metadata key to look up
 * @return the single value, the joined values, or null when absent
 */
private String getMetadataValue(Metadata metadata, String key)
{
    if (metadata.isMultiValued(key))
    {
        // use Set to prevent duplicates; LinkedHashSet keeps insertion order
        Set<String> values = new LinkedHashSet<String>(Arrays.asList(metadata.getValues(key)));

        // String.join produces the same "a, b, c" text as the old
        // toString()-minus-braces hack, without relying on toString format.
        return String.join(", ", values);
    }
    else
    {
        return metadata.get(key);
    }
}
 
Example 15
Project: alfresco-repository   File: TikaAutoMetadataExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Because some editors use JPEG_IMAGE_HEIGHT_TAG when
 * saving JPEG images , a more reliable source for
 * image size are the values provided by Tika
 * and not the exif/tiff metadata read from the file
 * This will override the tiff:Image size 
 * which gets embedded into the alfresco node properties
 * for jpeg files that contain such exif information
 */
@Override
protected Map<String, Serializable> extractSpecific(Metadata metadata,
     Map<String, Serializable> properties, Map<String,String> headers) 
{
    
    // Only JPEGs need the size override described above.
    if(MimetypeMap.MIMETYPE_IMAGE_JPEG.equals(metadata.get(Metadata.CONTENT_TYPE)))                
    {
        //check if the image has exif information
        if(metadata.get(EXIF_IMAGE_WIDTH_TAG) != null && metadata.get(EXIF_IMAGE_HEIGHT_TAG) != null )
        {    
            //replace the exif size properties that will be embedded in the node with
            //the guessed dimensions from Tika
            // NOTE(review): the replacement values are read from the JPEG_*
            // tags — presumably Tika's own guess; confirm against the tag
            // constant definitions elsewhere in this class.
            putRawValue(TIFF.IMAGE_LENGTH.getName(), extractSize(metadata.get(JPEG_IMAGE_HEIGHT_TAG)), properties);
            putRawValue(TIFF.IMAGE_WIDTH.getName(), extractSize(metadata.get(JPEG_IMAGE_WIDTH_TAG)), properties);
        }
    }
    return properties;
}
 
Example 16
Project: alfresco-repository   File: DWGMetadataExtracterTest.java   Source Code and License 6 votes vote down vote up
@SuppressWarnings("deprecation")
@Override
public void setUp() throws Exception
{
    super.setUp();
    extracter = new DWGMetadataExtracter();
    extracter.setDictionaryService(dictionaryService);
    extracter.register();

    // Copy the default mapping and bolt on two extra Tika-namespace keys;
    // later tests rely on these custom mappings being present.
    HashMap<String, Set<QName>> mapping =
          new HashMap<String, Set<QName>>(extracter.getMapping());

    Set<QName> lastAuthorProps = new HashSet<QName>();
    lastAuthorProps.add(TIKA_LAST_AUTHOR_TEST_PROPERTY);
    mapping.put(Metadata.LAST_AUTHOR, lastAuthorProps);

    Set<QName> customProps = new HashSet<QName>();
    customProps.add(TIKA_CUSTOM_TEST_PROPERTY);
    mapping.put(TIKA_CUSTOM_KEY, customProps);

    extracter.setMapping(mapping);
}
 
Example 17
Project: alfresco-repository   File: TikaAutoMetadataExtracterTest.java   Source Code and License 6 votes vote down vote up
@Override
public void setUp() throws Exception
{
    super.setUp();

    // Build the extracter from the Spring-configured Tika beans.
    TikaConfig config = (TikaConfig)ctx.getBean("tikaConfig");
    extracter = new TikaAutoMetadataExtracter(config);
    extracter.setDictionaryService(dictionaryService);
    MetadataExtracterConfig metadataExtracterConfig = (MetadataExtracterConfig)ctx.getBean("metadataExtracterConfig");
    extracter.setMetadataExtracterConfig(metadataExtracterConfig);
    extracter.register();

    // Copy the default mapping and register an extra Tika-namespace key
    // (the content type); later tests rely on this mapping being present.
    HashMap<String, Set<QName>> mapping =
          new HashMap<String, Set<QName>>(extracter.getMapping());

    Set<QName> mimetypeProps = new HashSet<QName>();
    mimetypeProps.add(TIKA_MIMETYPE_TEST_PROPERTY);
    mapping.put(Metadata.CONTENT_TYPE, mimetypeProps);

    extracter.setMapping(mapping);
}
 
Example 18
Project: alfresco-repository   File: ContentMetadataExtracterTagMappingTest.java   Source Code and License 6 votes vote down vote up
/**
 * Extends the raw extraction with a fixed set of test keywords (including a
 * node ref) merged with whatever keywords were extracted from the file.
 *
 * @param reader source content to extract from
 * @return the raw property map with the merged keyword list installed
 */
@SuppressWarnings("unchecked")
public Map<String, Serializable> extractRaw(ContentReader reader) throws Throwable
{
    Map<String, Serializable> rawMap = super.extractRaw(reader);

    // Add some test keywords to those actually extracted from the file including a nodeRef
    List<String> keywords = new ArrayList<String>(Arrays.asList(
            existingTagNodeRef, TAG_2, TAG_3, TAG_NONEXISTENT_NODEREF));
    Serializable extractedKeywords = rawMap.get(Metadata.KEYWORDS);
    // instanceof is already null-safe, so the explicit null checks were redundant.
    if (extractedKeywords instanceof String)
    {
        keywords.add((String) extractedKeywords);
    }
    else if (extractedKeywords instanceof Collection<?>)
    {
        keywords.addAll((Collection<? extends String>) extractedKeywords);
    }
    putRawValue(Metadata.KEYWORDS, (Serializable) keywords, rawMap);
    return rawMap;
}
 
Example 19
Project: gate-core   File: TikaFormat.java   Source Code and License 6 votes vote down vote up
/**
 * Copies one Tika metadata property into the feature map. The key is the
 * upper-cased property name; when that key is already taken, only the
 * "TIKA_"-prefixed variant is written so existing features are preserved.
 * Null or blank (after trimming) values are skipped entirely.
 */
private void setTikaFeature(Metadata metadata, Property property, FeatureMap fmap) {
  String value = metadata.get(property);
  if (value == null) {
    return;
  }

  value = value.trim();
  if (value.isEmpty()) {
    return;
  }

  String key = property.getName().toUpperCase();
  // Both branches wrote the prefixed copy, so it is hoisted out; the bare
  // key is only written when it is still free.
  if (!fmap.containsKey(key)) {
    fmap.put(key, value);
  }
  fmap.put("TIKA_" + key, value);
}
 
Example 20
Project: Equella   File: PdfExtracter.java   Source Code and License 6 votes vote down vote up
/**
 * Extracts the text of a PDF into outputText, limited to maxSize. When Tika
 * aborts because the write limit was reached, the text gathered so far is
 * still appended instead of failing the whole extraction.
 */
@Override
public void extractText(String mimeType, InputStream input, StringBuilder outputText, int maxSize)
	throws IOException
{
	// The wrapped handler enforces the size limit and lets us recognise
	// the "limit reached" exception in the catch block below.
	WriteOutContentHandler wrapped = new WriteOutContentHandler(maxSize);
	ContentHandler handler = new BodyContentHandler(wrapped);
	try
	{
		Metadata meta = new Metadata();
		Parser parser = new AutoDetectParser(new TikaConfig(getClass().getClassLoader()));
		parser.parse(input, handler, meta, new ParseContext());

		appendText(handler, outputText, maxSize);
	}
	catch( Exception t )
	{
		if( wrapped.isWriteLimitReached(t) )
		{
			// keep going: the truncated text is still worth indexing
			LOGGER.debug("PDF size limit reached.  Indexing truncated text");
			appendText(handler, outputText, maxSize);
			return;
		}
		throw Throwables.propagate(t);
	}
}
 
Example 21
Project: gitplex-mit   File: UniversalEncodingDetector.java   Source Code and License 6 votes vote down vote up
/**
 * Detects the charset of the stream by feeding up to LOOKAHEAD bytes into a
 * UniversalEncodingListener. The stream is reset to its marked position
 * before returning so the caller can re-read it from the start.
 *
 * @param input a mark-supporting stream to sniff
 * @return the detected charset
 * @throws IOException if reading or resetting the stream fails
 */
public static Charset detect(InputStream input) throws IOException {
    input.mark(LOOKAHEAD);
    // The pointless `catch (IOException e) { throw e; }` was removed; the
    // finally block alone guarantees the reset.
    try {
        UniversalEncodingListener listener =
                new UniversalEncodingListener(new Metadata());

        byte[] b = new byte[BUFSIZE];
        int n = 0;
        int m = input.read(b);
        // Keep feeding until EOF, the lookahead budget is spent, or the
        // detector reports it has seen enough.
        while (m != -1 && n < LOOKAHEAD && !listener.isDone()) {
            n += m;
            listener.handleData(b, 0, m);
            m = input.read(b, 0, Math.min(b.length, LOOKAHEAD - n));
        }

        return listener.dataEnd();
    } finally {
        input.reset();
    }
}
 
Example 22
Project: ctsms   File: ExecUtil.java   Source Code and License 6 votes vote down vote up
/**
 * Detects the MIME type of in-memory data, using the file name as a hint.
 *
 * @param data     the raw bytes to sniff
 * @param fileName file name hint added to the detection metadata
 * @return the detected MIME type string
 * @throws Throwable any failure raised by Tika detection
 */
public static String getMimeType(byte[] data, String fileName) throws Throwable {
	TikaInputStream tikaStream = null;
	Metadata metadata = new Metadata();
	metadata.add(Metadata.RESOURCE_NAME_KEY, fileName);
	// The no-op `catch (Throwable t) { throw t; }` was removed; the finally
	// block still guarantees the stream is closed.
	try {
		tikaStream = TikaInputStream.get(data, metadata);
		return detector.detect(tikaStream, metadata).toString();
	} finally {
		if (tikaStream != null) {
			try {
				tikaStream.close();
			} catch (IOException e) {
				// best-effort close: a close failure must not mask the result
			}
		}
	}
}
 
Example 23
Project: ctsms   File: ExecUtil.java   Source Code and License 6 votes vote down vote up
/**
 * Detects the MIME type of a file, using its name as a detection hint.
 *
 * @param file the file to sniff
 * @return the detected MIME type string
 * @throws Throwable any failure raised by Tika detection
 */
public static String getMimeType(File file) throws Throwable {
	TikaInputStream tikaStream = null;
	Metadata metadata = new Metadata();
	metadata.add(Metadata.RESOURCE_NAME_KEY, file.getName());
	// The no-op `catch (Throwable t) { throw t; }` was removed; the finally
	// block still guarantees the stream is closed.
	try {
		tikaStream = TikaInputStream.get(file, metadata);
		return detector.detect(tikaStream, metadata).toString();
	} finally {
		if (tikaStream != null) {
			try {
				tikaStream.close();
			} catch (IOException e) {
				// best-effort close: a close failure must not mask the result
			}
		}
	}
}
 
Example 24
Project: SimpleRESTClientHandler   File: FileManager.java   Source Code and License 6 votes vote down vote up
/**
 * Detects the MIME type of {@code file} using Tika's default detector.
 *
 * @return the detected MIME type as a string
 * @throws IOException if the file cannot be read or the stream fails to close
 */
public String getMimeType() throws IOException
{
	// try-with-resources replaces the manual null-checked finally block.
	try (TikaInputStream tikaIS = TikaInputStream.get(file))
	{
		return new DefaultDetector(MimeTypes.getDefaultMimeTypes()).detect(tikaIS, new Metadata()).toString();
	}
}
 
Example 25
Project: service-authorization   File: AbstractUserReplicator.java   Source Code and License 6 votes vote down vote up
/**
 * Detects the content type of raw image data; falls back to the
 * octet-stream type when detection fails.
 *
 * @param data raw bytes to sniff
 * @return the detected content type, or application/octet-stream on failure
 */
private String resolveContentType(byte[] data) {
    AutoDetectParser parser = new AutoDetectParser(new ImageParser());
    // try-with-resources: the original never closed the TikaInputStream.
    try (TikaInputStream tis = TikaInputStream.get(data)) {
        return parser.getDetector().detect(tis, new Metadata()).toString();
    } catch (IOException e) {
        return MediaType.OCTET_STREAM.toString();
    }
}
 
Example 26
Project: beam   File: TikaIO.java   Source Code and License 6 votes vote down vote up
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  super.populateDisplayData(builder);

  // Surface the optional Tika config path in the transform's display data.
  if (getTikaConfigPath() != null) {
    builder.add(
        DisplayData.item("tikaConfigPath", getTikaConfigPath()).withLabel("TikaConfig Path"));
  }
  Metadata metadata = getInputMetadata();
  if (metadata != null) {
    //TODO: use metadata.toString() only without a trim() once Apache Tika 1.17 gets released
    builder.add(
        DisplayData.item("inputMetadata", metadata.toString().trim())
            .withLabel("Input Metadata"));
  }
  // addIfNotNull skips the item when no content-type hint is configured.
  builder.addIfNotNull(
      DisplayData.item("contentTypeHint", getContentTypeHint()).withLabel("Content type hint"));
}
 
Example 27
Project: mesh   File: ImgscalrImageManipulator.java   Source Code and License 6 votes vote down vote up
/**
 * Parses the stream with Tika's auto-detect parser and emits all discovered
 * metadata as a name -> value map via the returned Single. Parse failures
 * are delivered through onError.
 */
@Override
public Single<Map<String, String>> getMetadata(InputStream ins) {
	return Single.create(sub -> {
		Parser parser = new AutoDetectParser();
		BodyContentHandler handler = new BodyContentHandler();
		Metadata metadata = new Metadata();
		ParseContext context = new ParseContext();
		try {
			parser.parse(ins, handler, metadata, context);
			Map<String, String> map = new HashMap<>();
			String[] metadataNames = metadata.names();

			for (String name : metadataNames) {
				map.put(name, metadata.get(name));
			}

			sub.onSuccess(map);

		} catch (Exception e) {
			sub.onError(e);
		}
		// ins.close();
		// NOTE(review): the stream is deliberately not closed here —
		// presumably the caller owns it, but confirm; otherwise this leaks.
	});
}
 
Example 28
Project: theSemProject   File: SemEmbeddedDocumentsExtractor.java   Source Code and License 6 votes vote down vote up
/**
 * Processes embedded content: stores the raw bytes keyed by a generated
 * name and, for image attachments larger than 32x32, also keeps a decoded
 * BufferedImage.
 *
 * @since 1.1: added handling of TIFF files
 * @param stream binary stream of the embedded content
 * @param handler SAX handler (required by the overridden signature, unused)
 * @param metadata metadata of the embedded document
 * @param outputHtml required by the overridden signature but never used
 * @throws SAXException on SAX failures
 * @throws IOException on input/output failures
 */
@Override
public void parseEmbedded(InputStream stream, org.xml.sax.ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException {
    String name = "Content" + fileCount++;
    MediaType contentType = detector.detect(stream, metadata);
    if (contentType != null) {
        try {
            // Append the canonical extension for the detected type.
            name += config.getMimeRepository().forName(contentType.toString()).getExtension();
        } catch (MimeTypeException e) {
            LogGui.printException(e);
        }
    }
    byte[] bytes = IOUtils.toByteArray(stream);
    embedded.put(name, bytes);

    // Lower-case once instead of five times per call.
    String lowerName = name.toLowerCase();
    if (lowerName.endsWith("jpg") || lowerName.endsWith("tiff") || lowerName.endsWith("tif")
            || lowerName.endsWith("png") || lowerName.endsWith("gif")) {
        BufferedImage image = ImageIO.read(new ByteArrayInputStream(bytes));
        // Skip anything 32x32 or smaller: those are assumed to be icons.
        if (image != null && image.getWidth() > 32 && image.getHeight() > 32) {
            embeddedImages.put(name, image);
        }
    }
}
 
Example 29
Project: theSemProject   File: DocumentParser.java   Source Code and License 6 votes vote down vote up
/**
 * Returns an HTML view of the parsed file. Plain-text extraction loses the
 * formatting of a Word or PDF document; the HTML rendering stays
 * machine-processable while preserving the layout. Embedded images are
 * stripped from the output.
 *
 * @param file the file to parse
 * @return formatted HTML view of the document content (without images),
 *         or an "!ERROR: ..." string when parsing fails
 */
public String getHtmlFromFile(File file) {
    try {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        handler.setResult(new StreamResult(out));
        ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);
        // try-with-resources: the original leaked the stream when parse() threw.
        try (InputStream is = file.toURI().toURL().openStream()) {
            adp.parse(is, handler1, new Metadata());
        }
        return new String(out.toByteArray(), "UTF-8").replaceAll("<img .*?</img>", "").replaceAll("<img .*?/>", "");
    } catch (TransformerConfigurationException | IllegalArgumentException | IOException | SAXException | TikaException ex) {
        return "!ERROR: " + ex.getLocalizedMessage();
    }
}
 
Example 30
Project: nexus-public   File: DefaultMimeSupport.java   Source Code and License 6 votes vote down vote up
@Nonnull
@Override
public List<String> detectMimeTypes(final InputStream input, @Nullable final String fileName) throws IOException {
  checkNotNull(input);

  // The file name, when supplied, acts as a detection hint.
  Metadata metadata = new Metadata();
  if (fileName != null) {
    metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
  }

  MediaType mediaType;
  try (final TikaInputStream tis = TikaInputStream.get(input)) {
    mediaType = detector.detect(tis, metadata);
  }

  // unravel to least specific
  List<String> detected = Lists.newArrayList();
  unravel(detected, mediaType);

  // Guarantee a non-empty result by falling back to octet-stream.
  if (detected.isEmpty()) {
    detected.add(MimeTypes.OCTET_STREAM);
  }

  return detected;
}
 
Example 31
Project: bygle-ldp   File: FileInfoReader.java   Source Code and License 6 votes vote down vote up
/**
 * Parses the stream with Tika's auto-detect parser and returns all
 * extracted metadata as a sorted name -> value map. The input stream is
 * always closed. Parse failures are printed and yield a possibly partial map.
 *
 * @param input the stream to parse; closed before returning
 * @return sorted map of metadata names to whitespace-stripped values
 * @throws IOException if closing the stream fails
 */
public static TreeMap<String, String> extractMetaData(InputStream input) throws IOException {
	TreeMap<String, String> treeMap = new TreeMap<String, String>();
	try {
		ContentHandler handler = new DefaultHandler();
		Metadata metadata = new Metadata();
		Parser parser = new AutoDetectParser();
		parser.parse(input, handler, metadata, new ParseContext());
		// Fetch names() once instead of re-fetching it on every iteration.
		for (String name : metadata.names()) {
			treeMap.put(name, stripWhiteSpace(metadata.get(name)));
		}
	} catch (Exception e) {
		// NOTE(review): consider a proper logger; kept to preserve behavior.
		e.printStackTrace();
	} finally {
		if (input != null)
			input.close();
	}
	return treeMap;
}
 
Example 32
Project: bygle-ldp   File: FileInfoReader.java   Source Code and License 6 votes vote down vote up
/**
 * Parses the stream with Tika and renders every metadata entry as a
 * "NAME : value" line. The input stream is always closed; parse failures
 * are deliberately swallowed and yield whatever was collected so far.
 *
 * @param input the stream to parse; closed before returning
 * @return newline-separated "NAME : value" lines (possibly empty)
 * @throws IOException if closing the stream fails
 */
public static String extractStringMetaData(InputStream input) throws IOException {
	// StringBuilder avoids O(n^2) string concatenation in the loop.
	StringBuilder result = new StringBuilder();
	try {
		ContentHandler handler = new DefaultHandler();
		Metadata metadata = new Metadata();
		Parser parser = new AutoDetectParser();
		parser.parse(input, handler, metadata, new ParseContext());
		// Fetch names() once instead of re-fetching it on every iteration.
		for (String name : metadata.names()) {
			result.append(name.toUpperCase()).append(" : ")
					.append(stripWhiteSpace(metadata.get(name))).append("\n");
		}
	} catch (Exception e) {
		// deliberately best-effort: parse errors leave a partial result
	} finally {
		if (input != null)
			input.close();
	}
	return result.toString();
}
 
Example 33
Project: solr-hadoop-common   File: TikaParsing.java   Source Code and License 6 votes vote down vote up
public static LWDocument generateDocument(LWDocument document, Metadata metadata, ContentHandler content) {
  // Copy every metadata entry onto the document as a field.
  if (metadata != null) {
    for (String name : metadata.names()) {
      document.addField(name, metadata.get(name));
    }
  }
  // Attach the extracted text, truncated to the maximum indexable length.
  if (content != null) {
    String body = content.toString();
    String clipped = body.length() > MAX_TERM_LENGTH_UTF
        ? body.substring(0, MAX_TERM_LENGTH_UTF)
        : body;
    document.addField("body", clipped);
  }
  return document;
}
 
Example 34
Project: search   File: SolrCellMorphlineTest.java   Source Code and License 6 votes vote down vote up
/**
 * Test that the ContentHandler properly strips the illegal characters
 */
@Test
public void testTransformValue() {
  String fieldName = "user_name";
  // Sanity check: the fixture string really does contain non-characters.
  assertFalse("foobar".equals(getFoobarWithNonChars()));

  Metadata metadata = new Metadata();
  // load illegal char string into a metadata field and generate a new document,
  // which will cause the ContentHandler to be invoked.
  metadata.set(fieldName, getFoobarWithNonChars());
  StripNonCharSolrContentHandlerFactory contentHandlerFactory =
    new StripNonCharSolrContentHandlerFactory(DateUtil.DEFAULT_DATE_FORMATS);
  IndexSchema schema = h.getCore().getLatestSchema();
  SolrContentHandler contentHandler =
    contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
  SolrInputDocument doc = contentHandler.newDocument();
  // After stripping, only the plain "foobar" characters should remain.
  String foobar = doc.getFieldValue(fieldName).toString();
  assertTrue("foobar".equals(foobar));
}
 
Example 35
Project: storm-trident-example   File: TikaParserTest.java   Source Code and License 6 votes vote down vote up
/**
 * Fetches each URL, lets Tika auto-detect the format, and logs the
 * detected MIME type. Failures for one URL are printed and do not stop
 * the remaining URLs from being processed.
 */
public static void main(String[] args) {
	String[] urls = {"http://t.co/hP5PM6fm", "http://t.co/xSFteG23"};
	for (String url : urls) {
		try {
			Parser parser = new AutoDetectParser();
			Metadata metadata = new Metadata();
			ParseContext parseContext = new ParseContext();
			// Cap the in-memory body at 10 MB.
			ContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
			// openStream() always yields the raw byte stream; the previous
			// (InputStream) getContent() cast could throw ClassCastException
			// for content types the URL machinery decodes itself.
			parser.parse(new URL(url).openStream(), handler, metadata, parseContext);
			String[] mimeDetails = metadata.get("Content-Type").split(";");
			logger.info("execute: url = " + url + ", mimeDetails = " + Arrays.asList(mimeDetails));
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}
}
 
Example 36
Project: extract   File: PrintStreamSpewer.java   Source Code and License 6 votes vote down vote up
@Override
public void writeMetadata(final Document document) throws IOException {
	final Metadata metadata = document.getMetadata();

	// Emit the path field when one is configured.
	final String pathField = fields.forPath();
	if (pathField != null) {
		stream.println(pathField + ": " + document.getPath().toString());
	}

	// Emit the parent path field when configured and the document is not at the root.
	final String parentPathField = fields.forParentPath();
	if (parentPathField != null && document.getPath().getNameCount() > 1) {
		stream.println(parentPathField + ": " + document.getPath().getParent().toString());
	}

	// Print every metadata field; multi-valued fields are comma-joined.
	new MetadataTransformer(metadata, fields).transform(
			(name, value) -> stream.println(name + ": " + value),
			(name, values) -> stream.println(name + ": " + String.join(", ", values)));

	// A blank line marks the end of this document's metadata.
	stream.println();
}
 
Example 37
Project: spring-usc — File: GenericRDFGenerator.java — Source Code and License
/**
 * Maps the Content-Type recorded in the Tika metadata to an {@link InputType}.
 *
 * @param metadata metadata whose {@code Content-Type} entry is inspected
 * @return the matching input type, or {@code null} when the content type is
 *         missing or unrecognised
 */
private InputType getInputType(Metadata metadata) {
	// metadata.get(...) may return null when no Content-Type was detected; the
	// original code then threw a NullPointerException on split().
	String contentType = metadata.get(Metadata.CONTENT_TYPE);
	if (contentType == null) {
		return null;
	}
	// Strip any parameters (e.g. "; charset=UTF-8") before matching the MIME type.
	String mimeType = contentType.split(";")[0];
	switch (mimeType) {
		case "application/json":
			return InputType.JSON;
		case "application/xml":
			return InputType.XML;
		case "text/csv":
			return InputType.CSV;
		case "text/excel":
		case "text/x-excel":
			// Both variants denote Excel; fall-through replaces the duplicated cases.
			return InputType.EXCEL;
	}
	return null;
}
 
Example 38
Project: extract — File: EmbedSpawner.java — Source Code and License
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {

	final String resourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
	final boolean inline = TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(resourceType);

	// Non-inline embeds are spawned as separate documents.
	if (!inline) {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
		return;
	}

	// Inline embeds, like images in PDFs, are not spawned; their content is
	// concatenated onto the main document as usual.
	final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, embedHandler, metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}
 
Example 39
Project: extract — File: SQLDocumentQueueCodec.java — Source Code and License
@Override
public Map<String, Object> encodeValue(final Object o) {
	final Document document = (Document) o;
	final Map<String, Object> row = new HashMap<>();

	// Optional identifier columns are only written when configured.
	if (idKey != null) {
		row.put(idKey, document.getId());
	}
	if (foreignIdKey != null) {
		row.put(foreignIdKey, document.getForeignId());
	}

	row.put(pathKey, document.getPath().toString());
	row.put(statusKey, waitingStatus);

	// The size column is populated only when the metadata carries a content length.
	final String contentLength = document.getMetadata().get(Metadata.CONTENT_LENGTH);
	if (contentLength != null) {
		row.put(sizeKey, Long.valueOf(contentLength));
	}

	return row;
}
 
Example 40
Project: extract — File: SpewerTest.java — Source Code and License
@Test
public void testWritesISO8601Dates() throws IOException {
	final SpewerStub spewer = new SpewerStub();
	final Document document = factory.create("test.txt");
	final Metadata metadata = document.getMetadata();
	final FieldNames fields = spewer.getFields();

	// TODO: this should go in a separate test for the MetadataTransformer.
	final String[] dates = {"2011-12-03+01:00", "2015-06-03", "Tue Jan 27 17:03:21 2004", "19106-06-07T08:00:00Z"};
	final String[] isoDates = {"2011-12-03T12:00:00Z", "2015-06-03T12:00:00Z", "2004-01-27T17:03:21Z",
			"+19106-06-07T08:00:00Z"};

	// Walk both arrays in lockstep: each raw date must be written through
	// unchanged and also emitted in its ISO-8601 form.
	for (int i = 0; i < dates.length; i++) {
		metadata.set(Office.CREATION_DATE, dates[i]);
		spewer.writeMetadata(document);

		Assert.assertEquals(dates[i], spewer.metadata.get(fields.forMetadata(Office.CREATION_DATE.getName())));
		Assert.assertEquals(isoDates[i],
				spewer.metadata.get(fields.forMetadataISODate(Office.CREATION_DATE.getName())));

		// Reset the store of written metadata on each iteration.
		spewer.close();
	}
}