Java Code Examples for org.apache.tika.parser.Parser#parse()

The following examples show how to use org.apache.tika.parser.Parser#parse(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Extracts the text and the metadata of a document with Tika.
 * <p>
 * When parsing succeeds, {@code FileMetadata} receives one {@code key=value}
 * line per metadata name and {@code FullText} receives the extracted text.
 *
 * @param Bytes stream holding the content of the document to parse
 * @return the value of {@code FullText} when the method finishes
 * @throws PDException declared by the method; failures during extraction are
 *         handed to {@code PDException.GenPDException}
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        // A write limit of -1 removes the handler's character cap.
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Accumulate in a builder rather than re-allocating a String per entry.
        StringBuilder metadataLines = new StringBuilder();
        for (String key : metadata.names()) {
            metadataLines.append(key).append('=').append(metadata.get(key)).append('\n');
        }
        FileMetadata = metadataLines.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }

    return FullText;
}
 
Example 2
Source File: ImageConverter.java    From openmeetings with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the width and height of an image file through Tika's image parser
 * and stores them on the given file item.
 *
 * @param f    item whose id is used in the process label and which receives
 *             the width and height
 * @param img  image file to read
 * @param mime content type handed to the parser through the metadata
 * @return a result whose exit code is {@code ZERO} on success, or {@code -1}
 *         together with an error text and the exception message on failure
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
	final ProcessResult outcome = new ProcessResult();
	outcome.setProcess("get image dimensions :: " + f.getId());
	final Parser imageParser = new ImageParser();
	try (InputStream imageStream = new FileInputStream(img)) {
		final Metadata imageMeta = new Metadata();
		imageMeta.set(CONTENT_TYPE, mime);
		imageParser.parse(imageStream, new DefaultHandler(), imageMeta, new ParseContext());

		// Width is stored before height is even read, so a failure on the
		// height leaves the width already set on the item.
		final String widthText = imageMeta.get(TIFF.IMAGE_WIDTH);
		f.setWidth(Integer.valueOf(widthText));
		final String heightText = imageMeta.get(TIFF.IMAGE_LENGTH);
		f.setHeight(Integer.valueOf(heightText));

		outcome.setExitCode(ZERO);
	} catch (Exception e) {
		log.error("Error while getting dimensions", e);
		outcome.setError("Error while getting dimensions");
		outcome.setException(e.getMessage());
		outcome.setExitCode(-1);
	}
	return outcome;
}
 
Example 3
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the supplied parser over the stream and returns the collected text.
 * <p>
 * The text is gathered by a {@code BodyContentHandler} created with a write
 * limit of 1000000. The input stream is closed once parsing has finished,
 * whether it completed normally or threw.
 *
 * @param is       stream to parse; closed by this method
 * @param parser   parser to run
 * @param context  parse context passed to the parser
 * @param metadata metadata object passed to the parser
 * @return the string form of the content handler after parsing
 * @throws Exception anything thrown while parsing or closing the stream
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception {
    final ContentHandler bodyText = new BodyContentHandler(1000000);
    try {
        parser.parse(is, bodyText, metadata, context);
    }
    finally {
        is.close();
    }
    final String extracted = bodyText.toString();
    return extracted;
}
 
Example 4
Source File: TikaTest.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the supplied parser over the stream, collecting the output through an
 * {@code OriginalBodyContentHandler}, and returns that handler's string form.
 * <p>
 * The input stream is closed once parsing has finished, whether it completed
 * normally or threw.
 *
 * @param is       stream to parse; closed by this method
 * @param parser   parser to run
 * @param context  parse context passed to the parser
 * @param metadata metadata object passed to the parser
 * @return the string form of the content handler after parsing
 * @throws Exception anything thrown while parsing or closing the stream
 */
public String getTextWoDoublebreaks(InputStream is, Parser parser,
                                    ParseContext context, Metadata metadata) throws Exception {
    final ContentHandler bodyText = new OriginalBodyContentHandler();
    try {
        parser.parse(is, bodyText, metadata, context);
    }
    finally {
        is.close();
    }
    final String extracted = bodyText.toString();
    return extracted;
}
 
Example 5
Source File: TikaIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the current element's file with Tika and outputs a {@code ParseResult}.
 * <p>
 * A parse failure does not propagate: it is turned into a failure result that
 * carries the text and metadata gathered up to that point plus the exception.
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  final ReadableFile readable = c.element();
  final InputStream rawStream = Channels.newInputStream(readable.open());
  try (InputStream tikaStream = TikaInputStream.get(rawStream)) {
    // Use the configured Tika setup when one was supplied, defaults otherwise.
    final Parser parser;
    if (tikaConfig == null) {
      parser = new AutoDetectParser();
    } else {
      parser = new AutoDetectParser(tikaConfig);
    }

    final ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    // NOTE(review): when the spec supplies input metadata, that same object is
    // written to below and by the parser — confirm it is not shared between elements.
    final Metadata tikaMetadata;
    if (spec.getInputMetadata() != null) {
      tikaMetadata = spec.getInputMetadata();
    } else {
      tikaMetadata = new Metadata();
    }
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    final String location = readable.getMetadata().resourceId().toString();
    final ContentHandler textCollector = new ToTextContentHandler();
    ParseResult outcome;
    try {
      parser.parse(tikaStream, textCollector, tikaMetadata, parseContext);
      outcome = ParseResult.success(location, textCollector.toString(), tikaMetadata);
    } catch (Exception e) {
      outcome = ParseResult.failure(location, textCollector.toString(), tikaMetadata, e);
    }

    c.output(outcome);
  }
}
 
Example 6
Source File: CachingTesseractOCRParserTest.java    From extract with MIT License 5 votes vote down vote up
/**
 * Parses the same file twice with a caching OCR parser and checks that the
 * first parse is counted as a cache miss and the second as a cache hit, with
 * identical text written out both times.
 */
@Test
public void testWriteToCache() throws Throwable {
	final Path samplePath = Paths.get(this.simple.toURI());

	final AtomicInteger hits = new AtomicInteger();
	final AtomicInteger misses = new AtomicInteger();

	// Subclass the parser only to count how often each cache callback fires.
	final Parser cachingParser = new CachingTesseractOCRParser(tmpDir) {

		private static final long serialVersionUID = 6551690243986921730L;

		@Override
		public void cacheHit() {
			hits.incrementAndGet();
		}

		@Override
		public void cacheMiss() {
			misses.incrementAndGet();
		}
	};

	// First pass: nothing is cached yet.
	final Writer firstPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(samplePath)) {
		cachingParser.parse(in, new WriteOutContentHandler(firstPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", firstPass.toString().trim());
	Assert.assertEquals(0, hits.get());
	Assert.assertEquals(1, misses.get());

	// Second pass: the result should now come from the cache.
	final Writer secondPass = new StringWriter();
	try (final InputStream in = Files.newInputStream(samplePath)) {
		cachingParser.parse(in, new WriteOutContentHandler(secondPass), new Metadata(), new ParseContext());
	}

	Assert.assertEquals("HEAVY\nMETAL", secondPass.toString().trim());
	Assert.assertEquals(1, hits.get());
	Assert.assertEquals(1, misses.get());
}
 
Example 7
Source File: TikaAnalysis.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Parses the stream with an {@code AutoDetectParser} and returns the text
 * collected by a default {@code BodyContentHandler}.
 *
 * @param stream content to parse; not closed by this method
 * @return the string form of the content handler after parsing
 * @throws IOException   declared by the parse call
 * @throws TikaException declared by the parse call
 * @throws SAXException  declared by the parse call
 */
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    final Parser autoParser = new AutoDetectParser();
    final ContentHandler bodyCollector = new BodyContentHandler();

    // Metadata and context are required by the call but not used afterwards.
    autoParser.parse(stream, bodyCollector, new Metadata(), new ParseContext());

    final String extracted = bodyCollector.toString();
    return extracted;
}
 
Example 8
Source File: TikaAnalysis.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Parses the stream with an {@code AutoDetectParser} and returns the metadata
 * object that was passed to the parser.
 *
 * @param stream content to parse; not closed by this method
 * @return the metadata object handed to the parse call
 * @throws IOException   declared by the parse call
 * @throws SAXException  declared by the parse call
 * @throws TikaException declared by the parse call
 */
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
    final Parser autoParser = new AutoDetectParser();
    final Metadata collected = new Metadata();

    // The handler and context are required by the call; only the metadata is returned.
    autoParser.parse(stream, new BodyContentHandler(), collected, new ParseContext());

    return collected;
}
 
Example 9
Source File: HTMLRenderingEngine.java    From alfresco-repository with GNU Lesser General Public License v3.0 4 votes vote down vote up
/**
 * Asks Tika to translate the contents into HTML and writes the result
 * through the rendering context's content writer with mimetype
 * {@code text/html}.
 * <p>
 * When the {@code PARAM_BODY_CONTENTS_ONLY} parameter is true, the XML
 * declaration, the XHTML namespace attributes on {@code p}, {@code hN},
 * {@code div} and {@code table} opening tags, and {@code &#13;} entities
 * are stripped from the generated markup.
 *
 * @param p       the parser to run over the source content
 * @param context the rendering context supplying the reader, writer,
 *                source node and parameters
 * @throws RenditionServiceException if the parse call throws
 */
private void generateHTML(Parser p, RenderingContext context)
{
   ContentReader contentReader = context.makeContentReader();
   
   // Setup things to parse with
   StringWriter sw = new StringWriter();
   ContentHandler handler = buildContentHandler(sw, context);
   
   // Tell Tika what we're dealing with
   Metadata metadata = new Metadata();
   metadata.set(
         Metadata.CONTENT_TYPE, 
         contentReader.getMimetype()
   );
   metadata.set(
         Metadata.RESOURCE_NAME_KEY, 
         nodeService.getProperty( 
               context.getSourceNode(),
               ContentModel.PROP_NAME
         ).toString()
   );

   // Our parse context needs to extract images
   ParseContext parseContext = new ParseContext();
   parseContext.set(Parser.class, new TikaImageExtractingParser(context));
   
   // Parse
   try {
      p.parse(
            contentReader.getContentInputStream(),
            handler, metadata, parseContext
      );
   } catch(Exception e) {
      throw new RenditionServiceException("Tika HTML Conversion Failed", e);
   }
   
   // As a string
   String html = sw.toString();
   
   // If we're doing body-only, remove all the html namespaces
   //  that will otherwise clutter up the document
   boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
   if(bodyOnly) {
      html = html.replaceAll("<\\?xml.*?\\?>", "");
      html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p");
      // Java replacement strings reference groups with $1; a backslash only
      //  escapes the next character, which would turn every heading into <h1
      html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h$1");
      html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div");
      html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table");
      html = html.replaceAll("&#13;","");
   }
   
   // Save it
   ContentWriter contentWriter = context.makeContentWriter();
   contentWriter.setMimetype("text/html");
   contentWriter.putContent( html );
}
 
Example 10
Source File: DocUtils.java    From geoportal-server-harvester with Apache License 2.0 4 votes vote down vote up
/**
 * Parses a document with Tika and renders its metadata as an XML document
 * encoded in UTF-8.
 * <p>
 * Every non-empty Tika metadata value is copied into a property set. The
 * description, modification date and title properties are then filled from
 * the Tika metadata, falling back to the property dump, a {@code TIKA_<date>}
 * label and the file name respectively when Tika did not supply them.
 *
 * @param file_bytes content of the document to parse
 * @param file_name  name used as the title when the metadata has none
 * @return the XML bytes, or {@code null} when parsing or XML generation
 *         failed (the failure is logged)
 * @throws IOException declared by the method; closing the in-memory input
 *         stream is the only statement outside the catch-all
 */
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {

    // Input & Output Variables
    ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
    byte[]               xml_bytes  = null;

    // Tika Parser Objects
    // The default BodyContentHandler stops at 100k characters by throwing,
    // which would discard the metadata of large documents; -1 lifts the limit.
    Parser               parser     = new AutoDetectParser();
    BodyContentHandler   handler    = new BodyContentHandler(-1);
    Metadata             metadata   = new Metadata();
    ParseContext         context    = new ParseContext();

    try {
        // Populate Metadata Object with Tika Parser
        parser.parse(base_input, handler, metadata, context);

        // Container & Writer for Metadata
        Properties   meta_props = new Properties();
        StringWriter sw         = new StringWriter();

        // Put Tika Metadata in Properties, skipping missing or empty values
        for (String name : metadata.names()) {
            String value = metadata.get(name);
            if (value != null && !value.isEmpty()) {
                meta_props.put(name, value);
            }
        }
        // Dump taken before the harvester properties below are added
        meta_props.store(sw, "Tika Values");

        // Expected Harvester Properties
        String     meta_descr  = metadata.get(TikaCoreProperties.DESCRIPTION);
        String     meta_modif  = metadata.get(TikaCoreProperties.MODIFIED);
        String     meta_title  = metadata.get(TikaCoreProperties.TITLE);

        // Default Label for Undefined Tika Properties
        DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
        String     date_today  = date_format.format(new Date());
        String     tika_label  = String.format("TIKA_%s", date_today);

        // Check For Null Values & Set Defaults
        meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr != null ? meta_descr : sw.toString());
        meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif != null ? meta_modif : tika_label);
        meta_props.put(WKAConstants.WKA_TITLE, meta_title != null ? meta_title : file_name);

        // Build XML as Bytes
        MapAttribute attr = AttributeUtils.fromProperties(meta_props);
        Document document = new SimpleDcMetaBuilder().create(attr);
        xml_bytes = XmlUtils.toString(document).getBytes("UTF-8");

    } catch (Exception ex) {
        // Best effort: the failure is logged and null is returned to the caller
        LOG.error("Error reading data.", ex);
    } finally {
        base_input.close();
    }

    return xml_bytes;

}