org.apache.tika.sax.BodyContentHandler Java Examples

The following examples show how to use org.apache.tika.sax.BodyContentHandler. They are drawn from open source projects; the source file, project, and license for each example are listed above the code.
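Before the project examples, here is a minimal, self-contained sketch of the typical pattern: wrap the parse call with a BodyContentHandler, let an AutoDetectParser identify and parse the document, then read the extracted plain text from handler.toString(). This snippet is not taken from any of the projects below, and the file path is a placeholder.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class BodyContentHandlerQuickStart {
    public static void main(String[] args) throws Exception {
        // -1 disables the default write limit of 100,000 characters
        BodyContentHandler handler = new BodyContentHandler(-1);
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();

        // "document.pdf" is a placeholder path
        try (InputStream stream = Files.newInputStream(Paths.get("document.pdf"))) {
            parser.parse(stream, handler, metadata, new ParseContext());
        }

        System.out.println(handler.toString());            // extracted plain text
        System.out.println(metadata.get("Content-Type"));  // detected MIME type
    }
}

The no-argument constructor keeps the 100,000-character write limit; other constructors accept an int limit, a java.io.Writer, an OutputStream, or another ContentHandler to decorate, and the examples below show most of these variants.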
Example #1
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext context = new ParseContext();
		context.set(Parser.class, txtParser);
		txtParser.parse(stream, new BodyContentHandler(handler), metadata, context);
	} catch (SAXException e) {
		if (!handler.isWriteLimitReached(e)) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", e);
		}
	} finally {
		stream.close();
	}
	return handler.toString();
}
 
Example #2
Source File: TikaExtractor.java    From ache with Apache License 2.0
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    BodyContentHandler handler = new BodyContentHandler(MAX_CHARACTERS);
    BoilerpipeContentHandler textHandler = new BoilerpipeContentHandler(handler, KeepEverythingExtractor.INSTANCE);
    Metadata metadata = createMetadata(fileName, contentType);
    ParseContext context = new ParseContext();
    try {
        parser.parse(stream, textHandler, metadata, context);
        
        Map<String, String> metadataMap = new HashMap<String, String>();
        for (String propertyName : metadata.names()) {
            metadataMap.put(propertyName, metadata.get(propertyName));
        }
        
        return new ParsedData(handler.toString(), metadataMap);
        
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
 
Example #3
Source File: TikaContentExtractor.java    From baleen with Apache License 2.0
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    AutoDetectParser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, metadata, context);

    jCas.setDocumentText(textHandler.toString());

    for (String name : metadata.names()) {
      addMetadata(jCas, name, metadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}
 
Example #4
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0
/**
 * Extracts the full text and metadata of a document with Tika.
 *
 * @param Bytes stream with the document content
 * @return the extracted full text
 * @throws PDException if the content cannot be extracted
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);
        FileMetadata = "";
        for (String key : metadata.names())
            FileMetadata += key + "=" + metadata.get(key) + "\n";
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
 
Example #5
Source File: EmbedSpawner.java    From extract with MIT License
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {

	// There's no need to spawn inline embeds, like images in PDFs. These should be concatenated to the main
	// document as usual.
	if (TikaCoreProperties.EmbeddedResourceType.INLINE.toString().equals(metadata
			.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
		final ContentHandler embedHandler = new EmbeddedContentHandler(new BodyContentHandler(handler));

		if (outputHtml) {
			writeStart(handler, metadata);
		}

		delegateParsing(input, embedHandler, metadata);

		if (outputHtml) {
			writeEnd(handler);
		}
	} else {
		try (final TikaInputStream tis = TikaInputStream.get(input)) {
			spawnEmbedded(tis, metadata);
		}
	}
}
 
Example #6
Source File: EmbedParser.java    From extract with MIT License
@Override
public void parseEmbedded(final InputStream input, final ContentHandler handler, final Metadata metadata,
                          final boolean outputHtml) throws SAXException, IOException {
	if (outputHtml) {
		writeStart(handler, metadata);
	}

	delegateParsing(input, new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata);

	if (outputHtml) {
		writeEnd(handler);
	}
}
 
Example #7
Source File: TikaAnalysis.java    From tutorials with MIT License
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    parser.parse(stream, handler, metadata, context);
    return metadata;
}
 
Example #8
Source File: TikaAnalysis.java    From tutorials with MIT License
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    parser.parse(stream, handler, metadata, context);
    return handler.toString();
}
 
Example #9
Source File: AttachAttribute.java    From entando-components with GNU Lesser General Public License v3.0
@Override
public String getIndexeableFieldValue() {
	StringBuilder buffer = new StringBuilder();
	if (null != super.getIndexeableFieldValue()) {
		buffer.append(super.getIndexeableFieldValue());
	}
	String extraValue = null;
	ResourceInterface resource = this.getResource();
	if (resource != null) {
		InputStream is = ((AttachResource) resource).getResourceStream();
		if (null != is) {
			AutoDetectParser parser = new AutoDetectParser();
			BodyContentHandler handler = new BodyContentHandler(-1);
			Metadata metadata = new Metadata();
			try {
				parser.parse(is, handler, metadata);
				extraValue = handler.toString();
			} catch (Throwable t) {
				_logger.error("Error while processing the parsing", t);
			} finally {
				try {
					is.close();
				} catch (IOException ex) {
					_logger.error("Error closing stream", ex);
				}
			}
		}
	}
	if (null != extraValue) {
		buffer.append(" ").append(extraValue);
	}
	return buffer.toString();
}
 
Example #10
Source File: JATEUtil.java    From jate with GNU Lesser General Public License v3.0
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    String rawContent = "";

    try {
        parser.parse(fileStream, handler, metadata);
        rawContent = handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. "
                + e.toString());
    }
    return rawContent;
}
 
Example #11
Source File: TearlineContentExtractor.java    From baleen with Apache License 2.0
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    AutoDetectParser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, metadata, context);

    String fullContent = textHandler.toString();
    Matcher m = tearlinePattern.matcher(fullContent);
    if (m.find()) {
      jCas.setDocumentText(removeBoilerplate(fullContent.substring(0, m.start())).trim());
    } else {
      jCas.setDocumentText(removeBoilerplate(fullContent).trim());
    }

    for (String name : metadata.names()) {
      addMetadata(jCas, name, metadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}
 
Example #12
Source File: PdfParser.java    From superword with Apache License 2.0
/**
 * Parses a PDF file into plain text.
 * @param file relative or absolute path of the local PDF file
 * @return the extracted text
 */
public static String parsePdfFileToPlainText(String file) {
    try(InputStream stream = new FileInputStream(file)) {
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        PARSER.parse(stream, handler, metadata);
        return handler.toString();
    } catch (Exception e){
        e.printStackTrace();
    }
    return "";
}
 
Example #13
Source File: EmbeddedDocumentMemoryExtractor.java    From extract with MIT License
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    parser.parse(new FileInputStream(rootDocument.getPath().toFile()), handler, rootDocument.getMetadata(), context);

    return extractor.getDocument();
}
 
Example #14
Source File: TikaTest.java    From tika-server with Apache License 2.0
/**
 * Basic text extraction.
 * <p>
 * Tries to close input stream after processing.
 */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
    ContentHandler handler = new BodyContentHandler(1000000);
    try {
        parser.parse(is, handler, metadata, context);
    } finally {
        is.close();
    }
    return handler.toString();
}
 
Example #15
Source File: WidgetMacroLibraryTests.java    From scipio-erp with Apache License 2.0
public void testFopMacroLibrary() throws Exception {
    String screentextUrl = screenUrl.concat("Fop");
    HttpClient http = initHttpClient();
    http.setUrl(screentextUrl.concat(authentificationQuery));
    //FIXME need to check that the stream is an application/pdf that doesn't contain an FTL stack trace
    InputStream screenInputStream = (InputStream) http.postStream();
    assertNotNull("Response failed from ofbiz", screenInputStream);
    assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType());

    String screenOutString = "";
    try {
        BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
        Metadata metadata = new Metadata();
        new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext());
        screenOutString = handler.toString();
    } finally {
        screenInputStream.close();
    }
    //Test if a ftl macro error is present
    assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:"));
}
 
Example #16
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testParseRequiringNotRequiringOCR() throws Exception {
    System.out.println("parse");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        parser.parse(stream, body, metadata);
    } finally {
        stream.close();
    }
    assertTrue(body.toString().contains("An Example Paper"));
}
 
Example #17
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testEncryptedWordDoc() throws Exception {
    System.out.println("testEncryptedWordDoc");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        parser.parse(stream, body, metadata);
    } catch (Exception ex) {
        // do nothing; parsing an encrypted document is expected to fail
    }
    assertFalse(body.toString().contains("Word doc Encrypted"));
}
 
Example #18
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testEncryptedPDFDoc() throws Exception {
    System.out.println("testEncryptedPDFDoc");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        parser.parse(stream, body, metadata);
    } catch (Exception ex) {
        // do nothing; parsing an encrypted document is expected to fail
    }
    assertFalse(body.toString().contains("PDF Encrypted"));
}
 
Example #19
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Ignore
@Test
public void testMassiveOCRDoc() throws Exception {
    System.out.println("testMassiveOCRDoc");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    parser.parse(stream, body, metadata);
    assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using"));
}
 
Example #20
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testParseRequiringOCR() throws Exception {
    System.out.println("parse");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    parser.parse(stream, body, metadata);
    String parsedString = body.toString();
    // From first page
    assertTrue(parsedString.contains("Father or mother"));
    // From second (last) page
    assertTrue(parsedString.contains("how you have determined who is the Nearest"));
}
 
Example #21
Source File: TikaPoweredContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0
/**
 * Returns an appropriate Tika ContentHandler for the
 *  requested content type. Normally you'll let this
 *  work as default, but if you need fine-grained
 *  control of how the Tika events become text then
 *  override and supply your own.
 */
protected ContentHandler getContentHandler(String targetMimeType, Writer output) 
               throws TransformerConfigurationException
{
   if(MimetypeMap.MIMETYPE_TEXT_PLAIN.equals(targetMimeType)) 
   {
      return new BodyContentHandler(output);
   }
   
   SAXTransformerFactory factory = (SAXTransformerFactory)
         SAXTransformerFactory.newInstance();
   TransformerHandler handler = factory.newTransformerHandler();
   handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
   handler.setResult(new StreamResult(output));
   
   if(MimetypeMap.MIMETYPE_HTML.equals(targetMimeType))
   {
      handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
      return new ExpandedTitleContentHandler(handler);
   }
   else if(MimetypeMap.MIMETYPE_XHTML.equals(targetMimeType) ||
           MimetypeMap.MIMETYPE_XML.equals(targetMimeType))
   {
      handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
   }
   else
   {
      throw new TransformerInfoException(
            WRONG_FORMAT_MESSAGE_ID,
            new IllegalArgumentException("Requested target type " + targetMimeType + " not supported")
      );
   }
   return handler;
}
 
Example #22
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0
@Override
public JSONObject toJson(String filePath) throws OperationException {

    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        parser.parse(stream, handler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = handler.toString();
    if(fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    JSONObject jsonObject = new JSONObject();
    jsonObject.put("_txt", fileText);

    String[] metadataNames = metadata.names();
    for(String name : metadataNames) {
        jsonObject.put(name, metadata.get(name));
    }

    return jsonObject;
}
 
Example #23
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0
@Override
public String toText(String filePath) throws OperationException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        parser.parse(stream, handler, metadata);
        return handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }
}
 
Example #24
Source File: ParsingReader.java    From extract with MIT License
public ParsingReader(final Parser parser, final InputStream input, final Metadata metadata, final ParseContext
		context) throws IOException {
	this(parser, input, metadata, context, BodyContentHandler::new);
}
 
Example #25
Source File: TikaEntityProcessor.java    From lucene-solr with Apache License 2.0
private static ContentHandler getTextContentHandler(Writer writer) {
  return new BodyContentHandler(writer);
}
 
Example #26
Source File: DocUtils.java    From geoportal-server-harvester with Apache License 2.0
public static byte[] generateMetadataXML(byte[] file_bytes, String file_name) throws IOException {
	
	// Input & Output Variables
	ByteArrayInputStream base_input = new ByteArrayInputStream(file_bytes);
	byte[]               xml_bytes  = null;
	
	// Tika Parser Objects
    Parser               parser     = new AutoDetectParser();
    BodyContentHandler   handler    = new BodyContentHandler();
    Metadata             metadata   = new Metadata();
    ParseContext         context    = new ParseContext();
	  
    try {
    	// Populate Metadata Object with Tika Parser
    	parser.parse(base_input, handler, metadata, context);
    	
    	// Container & Writer for Metadata
    	Properties   meta_props = new Properties();
    	StringWriter sw         = new StringWriter();
    	
    	// Put Tika Metadata in Properties
    	for(String name : metadata.names()) {
    		if (!metadata.get(name).isEmpty()) {
    			meta_props.put(name, metadata.get(name));
    		}
    	}
    	meta_props.store(sw, "Tika Values");

    	// Expected Harvester Properties
    	String     meta_descr  = metadata.get(TikaCoreProperties.DESCRIPTION);
    	String     meta_modif  = metadata.get(TikaCoreProperties.MODIFIED);
    	String     meta_title  = metadata.get(TikaCoreProperties.TITLE);
    	
    	// Default Label for Undefined Tika Properties
    	DateFormat date_format = new SimpleDateFormat("yyyy/MM/dd");
    	Date       date        = new Date();
    	String     date_today  = date_format.format(date);
    	String     tika_label  = String.format("TIKA_%s", date_today);
    	
    	// Check For Null Values & Set Defaults
    	if (meta_descr == null) {
    		meta_props.put(WKAConstants.WKA_DESCRIPTION, "" + sw.toString());
    	} else {
    		meta_props.put(WKAConstants.WKA_DESCRIPTION, meta_descr);
    	}
    	
    	if (meta_modif == null) {
    		meta_props.put(WKAConstants.WKA_MODIFIED, tika_label);
    	} else {
    		meta_props.put(WKAConstants.WKA_MODIFIED, meta_modif);
    	}
    	
    	if (meta_title == null) {
    		meta_props.put(WKAConstants.WKA_TITLE, file_name);
    	} else {
    		meta_props.put(WKAConstants.WKA_TITLE, meta_title);
    	}
 	
    	// Build XML as Bytes
    	MapAttribute attr = AttributeUtils.fromProperties(meta_props);
		Document document = new SimpleDcMetaBuilder().create(attr);
		xml_bytes = XmlUtils.toString(document).getBytes("UTF-8");
    		
    } catch (Exception ex) {
      LOG.error(String.format("Error reading data."), ex);
    } finally {
    	base_input.close();
    }
	
	return xml_bytes;
	
}
 
Example #27
Source File: PDFPreprocessorParser.java    From CogStack-Pipeline with Apache License 2.0
@Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        ImageMagickConfig config = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);

        // If ImageMagick is not on the path with the current config, do not try to run OCR
        // getSupportedTypes shouldn't have listed us as handling it, so this should only
        //  occur if someone directly calls this parser, not via DefaultParser or similar
//        TemporaryResources tmp = new TemporaryResources();
        //TikaInputStream pdfStream = TikaInputStream.get(stream);
        PDFParser pdfParser = new PDFParser();

        //create temp handlers to investigate object
        BodyContentHandler body = new BodyContentHandler();
        Metadata pdfMetadata = new Metadata();

        //needed to reset stream
        if (stream.markSupported()) {
            stream.mark(Integer.MAX_VALUE);
        }

        //first do an initial parse to see if there's substantial content in the pdf metadata already
        pdfParser.parse(stream, body, pdfMetadata, context);
        stream.reset();
        //if there's enough content, re-parse with the official handler/metadata; also check that ImageMagick is available

        if (body.toString().length() > 100 || !hasImageMagick(config)) {
            pdfParser.parse(stream, handler, metadata, context);
            metadata.set("X-PDFPREPROC-OCR-APPLIED", "NA");
            return;
        }

        metadata.set("X-PDFPREPROC-ORIGINAL", body.toString());
        metadata.set("X-PDFPREPROC-OCR-APPLIED", "FAIL");
        // "FAIL" will be overwritten if it succeeds later

        //add the PDF metadata to the official metadata object
        Arrays.asList(pdfMetadata.names()).stream().forEach(name -> {
            metadata.add(name, pdfMetadata.get(name));
        });


        //objects to hold file references for manipulation outside of Java
        File tiffFileOfPDF = null;
        File pdfFileFromStream = File.createTempFile("tempPDF", ".pdf");
        try {

            FileUtils.copyInputStreamToFile(stream, pdfFileFromStream);
            tiffFileOfPDF = File.createTempFile("tempTIFF", ".tiff");
            makeTiffFromPDF(pdfFileFromStream,tiffFileOfPDF, config);
            if (tiffFileOfPDF.exists()) {
                long tessStartTime = System.currentTimeMillis();
                TesseractOCRParser tesseract = new TesseractOCRParser();

                tesseract.parse(FileUtils.openInputStream(tiffFileOfPDF), handler, metadata, context);
                metadata.set("X-PDFPREPROC-OCR-APPLIED", "SUCCESS");

                LOG.debug("Document parsing -- OCR processing time: {} ms", System.currentTimeMillis() - tessStartTime);
            }
        } finally {
            if (tiffFileOfPDF.exists()) {
                tiffFileOfPDF.delete();
            }
            if (pdfFileFromStream.exists()) {
                pdfFileFromStream.delete();
            }
        }
    }
 
Example #28
Source File: NodeTika.java    From node-tika with MIT License
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		Object option;

		option = options.get("outputEncoding");
		if (option != null) {
			outputEncoding = option.toString();
		}

		option = options.get("contentType");
		if (option != null) {
			contentType = option.toString();
		}

		option = options.get("maxLength");
		if (option != null) {
			maxLength = (int)Float.parseFloat(option.toString());
		}
	}

	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
	final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding);
	final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context);
	} catch (Throwable e) {
		if (!contentHandler.isWriteLimitReached(e)) {
			throw e;
		} else {
			writer.close();
		}
	} finally {
		inputStream.close();
	}

	return outputStream.toString(outputEncoding);
}
 
Example #29
Source File: MP3Reader.java    From red5-io with Apache License 2.0
/**
 * Creates reader from file input stream
 * 
 * @param file
 *            file input
 * @throws IOException
 *             on IO error
 */
public MP3Reader(File file) throws IOException {
    this.file = file;
    fis = new FileInputStream(file);
    try {
        // parse the ID3 info
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // MP3 parser
        Mp3Parser parser = new Mp3Parser();
        parser.parse(fis, handler, metadata, null);
        log.debug("Contents of the document: {}", handler.toString());
        // create meta data holder
        metaData = new MetaData();
        String val = null;
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            val = metadata.get(name);
            log.debug("Meta name: {} value: {}", name, val);
            if ("xmpDM:artist".equals(name)) {
                metaData.setArtist(val);
            } else if ("xmpDM:album".equals(name)) {
                metaData.setAlbum(val);
            } else if ("title".equals(name)) {
                metaData.setSongName(val);
            } else if ("xmpDM:genre".equals(name)) {
                metaData.setGenre(val);
            } else if ("xmpDM:logComment".equals(name)) {
                metaData.setComment(val);
            } else if ("xmpDM:trackNumber".equals(name)) {
                metaData.setTrack(val);
            } else if ("xmpDM:releaseDate".equals(name)) {
                metaData.setYear(val);
            } else if ("xmpDM:duration".equals(name) || "duration".equals(name)) {
                metaData.setDuration(val);
            } else if ("xmpDM:audioSampleRate".equals(name) || "samplerate".equals(name)) {
                metaData.setSampleRate(val);
            } else if ("channels".equals(name)) {
                metaData.setChannels(val);
            }
        }
        /*
         * //send album image if included List<Artwork> tagFieldList = idTag.getArtworkList(); if (tagFieldList == null || tagFieldList.isEmpty()) { log.debug("No cover art was found"); }
         * else { Artwork imageField = tagFieldList.get(0); log.debug("Picture type: {}", imageField.getPictureType()); FrameBodyAPIC imageFrameBody = new FrameBodyAPIC();
         * imageFrameBody.setImageData(imageField.getBinaryData()); if (!imageFrameBody.isImageUrl()) { byte[] imageBuffer = (byte[])
         * imageFrameBody.getObjectValue(DataTypes.OBJ_PICTURE_DATA); //set the cover image on the metadata metaData.setCovr(imageBuffer); // Create tag for onImageData event IoBuffer buf
         * = IoBuffer.allocate(imageBuffer.length); buf.setAutoExpand(true); Output out = new Output(buf); out.writeString("onImageData"); Map<Object, Object> props = new HashMap<Object,
         * Object>(); props.put("trackid", 1); props.put("data", imageBuffer); out.writeMap(props); buf.flip(); //Ugh i hate flash sometimes!! //Error #2095: flash.net.NetStream was unable
         * to invoke callback onImageData. ITag result = new Tag(IoConstants.TYPE_METADATA, 0, buf.limit(), null, 0); result.setBody(buf); //add to first frames firstTags.add(result); } }
         * } else { log.info("File did not contain ID3v2 data: {}", file.getName()); }
         */
    } catch (Exception e) {
        log.error("MP3Reader {}", e);
    }
    // ensure we have a valid sample rate
    checkValidHeader();
    // get the total bytes / file size
    fileSize = file.length();
    log.debug("File size: {}", fileSize);
    // analyze keyframes data
    analyzeKeyFrames();
    // create file metadata object
    firstTags.addFirst(createFileMeta());
    log.trace("File input stream - open: {} position: {}", fis.getChannel().isOpen(), fis.getChannel().position());
    // create a channel for reading
    fileChannel = fis.getChannel();
}