Java Code Examples for org.apache.tika.parser.AutoDetectParser#parse()

The following examples show how to use org.apache.tika.parser.AutoDetectParser#parse(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TikaContentExtractor.java    From baleen with Apache License 2.0 6 votes vote down vote up
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    // Unlimited write limit so large documents are not truncated by the handler.
    BodyContentHandler contentHandler = new BodyContentHandler(Integer.MAX_VALUE);
    ParseContext parseContext = new ParseContext();
    Metadata docMetadata = new Metadata();

    AutoDetectParser tikaParser = new AutoDetectParser();
    tikaParser.parse(stream, contentHandler, docMetadata, parseContext);

    jCas.setDocumentText(contentHandler.toString());

    // Copy every Tika metadata entry onto the CAS.
    for (String key : docMetadata.names()) {
      addMetadata(jCas, key, docMetadata.get(key));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}
 
Example 2
Source File: NodeTika.java    From node-tika with MIT License 6 votes vote down vote up
/**
 * Extracts the metadata of the resource at {@code uri} and returns it as JSON.
 *
 * @param uri location of the resource to inspect
 * @param contentType optional content-type hint forwarded to Tika (may be null)
 * @return a JSON object mapping each metadata name to its array of values
 * @throws Exception if the resource cannot be read or parsed
 */
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	// try-with-resources: the original leaked the stream if parse() threw.
	try (TikaInputStream inputStream = createInputStream(uri, metadata)) {
		parser.parse(inputStream, new DefaultHandler(), metadata);
	}

	// Parameterized types instead of the original raw Map/HashMap.
	final Map<String, String[]> meta = new HashMap<>();
	for (String name : metadata.names()) {
		meta.put(name, metadata.getValues(name));
	}

	return new Gson().toJson(meta);
}
 
Example 3
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0 5 votes vote down vote up
@Override
public String toText(String filePath) throws OperationException {
    // Auto-detect the document type and extract its plain-text body.
    final AutoDetectParser tikaParser = new AutoDetectParser();
    final BodyContentHandler textHandler = new BodyContentHandler();
    final Metadata documentMetadata = new Metadata();
    try (InputStream documentStream = new FileInputStream(new File(filePath))) {
        tikaParser.parse(documentStream, textHandler, documentMetadata);
        return textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }
}
 
Example 4
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0 5 votes vote down vote up
@Override
public JSONObject toJson(String filePath) throws OperationException {

    // First pass: let Tika auto-detect the format and collect text plus metadata.
    final AutoDetectParser tikaParser = new AutoDetectParser();
    final BodyContentHandler textHandler = new BodyContentHandler();
    final Metadata documentMetadata = new Metadata();
    try (InputStream documentStream = new FileInputStream(new File(filePath))) {
        tikaParser.parse(documentStream, textHandler, documentMetadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String extractedText = textHandler.toString();
    if (extractedText == null || extractedText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    // Document body goes under "_txt"; each metadata entry becomes its own key.
    final JSONObject result = new JSONObject();
    result.put("_txt", extractedText);
    for (String metadataName : documentMetadata.names()) {
        result.put(metadataName, documentMetadata.get(metadataName));
    }

    return result;
}
 
Example 5
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
@Test
public void testParseRequiringOCR() throws Exception {
    System.out.println("parse");
    AutoDetectParser parser = new AutoDetectParser(config);
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // try-with-resources: the original never closed the resource stream.
    try (InputStream stream =
            getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf")) {
        parser.parse(stream, body, metadata);
    }
    String parsedString = body.toString();
    // From first page
    assertTrue(parsedString.contains("Father or mother"));
    // From second (last) page
    assertTrue(parsedString.contains("how you have determined who is the Nearest"));
}
 
Example 6
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
@Ignore
@Test
public void testMassiveOCRDoc() throws Exception {
    System.out.println("testMassiveOCRDoc");
    AutoDetectParser parser = new AutoDetectParser(config);
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // try-with-resources: the original never closed the resource stream.
    try (InputStream stream =
            getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf")) {
        parser.parse(stream, body, metadata);
    }
    assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using"));
}
 
Example 7
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
@Test
public void testEncryptedPDFDoc() throws Exception {
    System.out.println("testEncryptedPDFDoc");
    AutoDetectParser parser = new AutoDetectParser(config);
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // try-with-resources: the original never closed the resource stream.
    try (InputStream stream =
            getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf")) {
        parser.parse(stream, body, metadata);
    } catch (Exception ignored) {
        // Parsing an encrypted PDF is expected to fail; the assertion below only
        // checks that no content leaked into the handler.
    }
    assertFalse(body.toString().contains("PDF Encrypted"));
}
 
Example 8
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
@Test
public void testEncryptedWordDoc() throws Exception {
    System.out.println("testEncryptedWordDoc");
    AutoDetectParser parser = new AutoDetectParser(config);
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // try-with-resources: the original never closed the resource stream.
    try (InputStream stream =
            getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx")) {
        parser.parse(stream, body, metadata);
    } catch (Exception ignored) {
        // Parsing an encrypted document is expected to fail; the assertion below
        // only checks that no content leaked into the handler.
    }
    assertFalse(body.toString().contains("Word doc Encrypted"));
}
 
Example 9
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0 5 votes vote down vote up
@Test
public void testParseRequiringNotRequiringOCR() throws Exception {
    System.out.println("parse");
    InputStream pdfStream =
            getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf");
    AutoDetectParser tikaParser = new AutoDetectParser(config);
    BodyContentHandler textBody = new BodyContentHandler();
    Metadata docMetadata = new Metadata();
    try {
        tikaParser.parse(pdfStream, textBody, docMetadata);
    } finally {
        // Always release the resource stream, even when parsing fails.
        pdfStream.close();
    }
    assertTrue(textBody.toString().contains("An Example Paper"));
}
 
Example 10
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0 5 votes vote down vote up
/**
 * Runs Tika text extraction over the S3 object stream and assembles the result payload.
 *
 * @param bucket     S3 bucket the object came from (forwarded to the result assemblers)
 * @param key        object key; a key ending in "tika.exception.testing.pdf" forces a
 *                   TikaException so synthetic transactions can exercise the failure path
 * @param objectData raw object content to extract text from
 * @return the assembled extraction (or exception) result
 */
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  // Identity transformer that serializes SAX events as plain text into sw.
  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  // Register the parser in its own context so embedded documents are parsed recursively.
  parseContext.set(Parser.class, parser);

  // NOTE: the unused "Tika tika = new Tika()" instance from the original was removed;
  // nothing referenced it.
  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
Example 11
Source File: TearlineContentExtractor.java    From baleen with Apache License 2.0 5 votes vote down vote up
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    // No write limit: tearline documents can be arbitrarily large.
    BodyContentHandler contentHandler = new BodyContentHandler(Integer.MAX_VALUE);
    ParseContext parseContext = new ParseContext();
    Metadata docMetadata = new Metadata();

    AutoDetectParser tikaParser = new AutoDetectParser();
    tikaParser.parse(stream, contentHandler, docMetadata, parseContext);

    // Keep only the text above the tearline marker, when one is present.
    String extracted = contentHandler.toString();
    Matcher tearlineMatcher = tearlinePattern.matcher(extracted);
    String selected =
        tearlineMatcher.find() ? extracted.substring(0, tearlineMatcher.start()) : extracted;
    jCas.setDocumentText(removeBoilerplate(selected).trim());

    for (String key : docMetadata.names()) {
      addMetadata(jCas, key, docMetadata.get(key));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}
 
Example 12
Source File: JATEUtil.java    From jate with GNU Lesser General Public License v3.0 5 votes vote down vote up
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    String rawContent = "";

    try {
        parser.parse(fileStream, handler, metadata);
        rawContent = handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. "
                + e.toString());
    }
    return rawContent;
}
 
Example 13
Source File: AttachAttribute.java    From entando-components with GNU Lesser General Public License v3.0 5 votes vote down vote up
@Override
public String getIndexeableFieldValue() {
	StringBuilder indexedText = new StringBuilder();
	if (null != super.getIndexeableFieldValue()) {
		indexedText.append(super.getIndexeableFieldValue());
	}
	String attachmentText = null;
	ResourceInterface resource = this.getResource();
	if (resource != null) {
		InputStream is = ((AttachResource) resource).getResourceStream();
		if (null != is) {
			// Extract the attachment body with Tika; -1 disables the handler's write limit.
			AutoDetectParser parser = new AutoDetectParser();
			BodyContentHandler handler = new BodyContentHandler(-1);
			Metadata metadata = new Metadata();
			try {
				parser.parse(is, handler, metadata);
				attachmentText = handler.toString();
			} catch (Throwable t) {
				_logger.error("Error while processing the parsing", t);
			} finally {
				// Close the resource stream regardless of the parse outcome.
				try {
					is.close();
				} catch (IOException ex) {
					_logger.error("Error closing stream", ex);
				}
			}
		}
	}
	if (null != attachmentText) {
		indexedText.append(" ").append(attachmentText);
	}
	return indexedText.toString();
}
 
Example 14
Source File: NodeTika.java    From node-tika with MIT License 4 votes vote down vote up
/**
 * Extracts the text content of the resource at {@code uri}.
 *
 * Supported options: "outputEncoding" (default UTF-8), "contentType" (detection hint)
 * and "maxLength" (truncate the output after this many characters; -1 = unlimited).
 *
 * @param uri     location of the resource to read
 * @param options optional extraction settings, may be null
 * @return the extracted text in the requested output encoding
 * @throws Exception if the resource cannot be read or parsed (a failure caused solely
 *                   by reaching the write limit is not treated as an error)
 */
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		Object option;

		option = options.get("outputEncoding");
		if (option != null) {
			outputEncoding = option.toString();
		}

		option = options.get("contentType");
		if (option != null) {
			contentType = option.toString();
		}

		option = options.get("maxLength");
		if (option != null) {
			// Accept numeric strings such as "100.0" as well as plain integers.
			maxLength = (int) Float.parseFloat(option.toString());
		}
	}

	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
	final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding);
	final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context);
	} catch (Throwable e) {
		// Reaching the write limit aborts the parse but is not an error.
		if (!contentHandler.isWriteLimitReached(e)) {
			throw e;
		}
	} finally {
		// Close the writer on every path so buffered characters are flushed into
		// outputStream before it is read (the original closed it only on the
		// write-limit path), then release the input stream.
		try {
			writer.close();
		} finally {
			inputStream.close();
		}
	}

	return outputStream.toString(outputEncoding);
}