org.apache.tika.parser.AutoDetectParser Java Examples

The following examples show how to use org.apache.tika.parser.AutoDetectParser. Each example is taken from an open-source project; the source file, project, and license are noted above it.
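Before looking at the project code, here is a minimal, self-contained sketch of the typical AutoDetectParser workflow: the parser auto-detects the document type, a BodyContentHandler collects the body text, and the Metadata object is filled in during parsing. The class name, file path, and the -1 (unlimited) write limit below are illustrative assumptions, not taken from any of the projects that follow.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class AutoDetectParserSketch {

    public static void main(String[] args) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        // -1 removes the default 100,000 character write limit
        BodyContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        // "document.pdf" is a placeholder path
        try (InputStream stream = Files.newInputStream(Paths.get("document.pdf"))) {
            parser.parse(stream, handler, metadata, context);
        }

        System.out.println("Detected type: " + metadata.get(Metadata.CONTENT_TYPE));
        System.out.println(handler.toString());
    }
}
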
Example #1
Source File: NodeTika.java    From node-tika with MIT License
public static String extractMeta(String uri, String contentType) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();

	fillMetadata(parser, metadata, contentType, uri);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Ensure the stream is closed even if parsing fails
	try {
		parser.parse(inputStream, new DefaultHandler(), metadata);
	} finally {
		inputStream.close();
	}

	final Map<String, String[]> meta = new HashMap<>();
	for (String name : metadata.names()) {
		meta.put(name, metadata.getValues(name));
	}

	return new Gson().toJson(meta);
}
 
Example #2
Source File: FileParserSingleton.java    From scava with Eclipse Public License 2.0
private FileParserSingleton()
{
	logger = (OssmeterLogger) OssmeterLogger.getLogger("nlp.tools.preprocessor.fileparser");
	BufferedReader fileList;
	try
	{
		fileList = loadFile();
		readSupportedFilesList(fileList);
		logger.info("List of supported files has been sucessfully loaded");
		parser = new AutoDetectParser();
	}
	catch (IOException e)
	{
		logger.error("Error while loading the list of supported files:", e);
	}
}
 
Example #3
Source File: TikaDocumentItemProcessor.java    From CogStack-Pipeline with Apache License 2.0
@PostConstruct
public void init() throws IOException, SAXException, TikaException {
    setFieldName(tikaFieldName);

    // load tika configuration
    tikaConfig = new TikaConfig(this.getClass().getClassLoader()
                            .getResourceAsStream("tika-config.xml"));

    // load tesseract ocr configuration
    tesseractConfig = new TesseractOCRConfig();
    if (tesseractTimeout > 0) {
        tesseractConfig.setTimeout(tesseractTimeout);
    }

    // load image magick configuration -- used for tiff conversion
    imgConfig = new ImageMagickConfig();
    if (convertTimeout > 0) {
        imgConfig.setTimeout(convertTimeout);
    }

    parser = new AutoDetectParser(tikaConfig);
}
 
Example #4
Source File: TikaUtil.java    From scipio-erp with Apache License 2.0
/**
 * Finds media type (through Apache Tika library), based on filename and magic numbers.
 * @throws IOException
 */
public static MediaType findMediaType(InputStream is, String fileName) throws IOException {
    BufferedInputStream bis = new BufferedInputStream(is);
    try {
        AutoDetectParser parser = new AutoDetectParser();
        Detector detector = parser.getDetector();
        Metadata md = new Metadata();
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);
        MediaType mediaType = detector.detect(bis, md);
        return mediaType;
    } finally {
        try {
            bis.close();
        } catch (IOException e) {
            // ignore failures when closing the stream
        }
    }
}
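If only the media type is required, constructing a full AutoDetectParser (as above) is optional: the same detection can be done with the Detector obtained from a TikaConfig. A minimal sketch of that alternative, using the default configuration and the same Metadata.RESOURCE_NAME_KEY hint as the example above; the class name is a placeholder.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

public class MediaTypeDetectionSketch {

    public static MediaType detect(InputStream is, String fileName) throws IOException {
        Detector detector = TikaConfig.getDefaultConfig().getDetector();
        Metadata md = new Metadata();
        // The resource name gives the detector a file-extension hint
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);
        // Detection needs a stream that supports mark/reset
        try (BufferedInputStream bis = new BufferedInputStream(is)) {
            return detector.detect(bis, md);
        }
    }
}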
 
Example #5
Source File: TikaContentExtractor.java    From baleen with Apache License 2.0
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    AutoDetectParser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, metadata, context);

    jCas.setDocumentText(textHandler.toString());

    for (String name : metadata.names()) {
      addMetadata(jCas, name, metadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }
}
 
Example #6
Source File: FTConnector.java    From openprodoc with GNU Affero General Public License v3.0
/**
 * Extracts the full text and metadata of the supplied content using Tika.
 *
 * @param Bytes stream containing the content to convert
 * @return the extracted full text
 * @throws PDException if the content cannot be extracted
 */
protected String Convert(InputStream Bytes) throws PDException
{
    try {
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);
        FileMetadata = "";
        for (String key : metadata.names())
            FileMetadata += key + "=" + metadata.get(key) + "\n";
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
 
Example #7
Source File: HTMLRenderingEngine.java    From alfresco-repository with GNU Lesser General Public License v3.0
@Override
protected void render(RenderingContext context)
{
    ContentReader contentReader = context.makeContentReader();
    String sourceMimeType = contentReader.getMimetype();
    
    // Check that Tika supports the supplied file
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    MediaType sourceMediaType = MediaType.parse(sourceMimeType);
    if(! p.getParsers().containsKey(sourceMediaType))
    {
       throw new RenditionServiceException(
             "Source mime type of " + sourceMimeType + 
             " is not supported by Tika for HTML conversions"
       );
    }
    
    // Make the HTML Version using Tika
    // This will also extract out any images as found
    generateHTML(p, context);
}
 
Example #8
Source File: TikaAutoMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0
private static ArrayList<String> buildMimeTypes(TikaConfig tikaConfig)
{
   config = tikaConfig;
   parser = new AutoDetectParser(config);

   SUPPORTED_MIMETYPES = new ArrayList<String>();
   for(MediaType mt : parser.getParsers().keySet()) 
   {
      // Add the canonical mime type
      SUPPORTED_MIMETYPES.add( mt.toString() );
      
      // And add any aliases of the mime type too - Alfresco uses some
      //  non canonical forms of various mimetypes, so we need all of them
      for(MediaType alias : config.getMediaTypeRegistry().getAliases(mt)) 
      {
          SUPPORTED_MIMETYPES.add( alias.toString() );
      }
   }
   return SUPPORTED_MIMETYPES;
}
 
Example #9
Source File: ExtractingDocumentLoader.java    From lucene-solr with Apache License 2.0
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
                         TikaConfig config, ParseContextConfig parseContextConfig,
                                SolrContentHandlerFactory factory) {
  this.params = req.getParams();
  this.core = req.getCore();
  this.config = config;
  this.parseContextConfig = parseContextConfig;
  this.processor = processor;

  templateAdd = new AddUpdateCommand(req);
  templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
  templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

  //this is lightweight
  autoDetectParser = new AutoDetectParser(config);
  this.factory = factory;
  
  ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
 
Example #10
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
Example #11
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testEncryptedWordDoc() throws Exception {
    System.out.println("testEncryptedWordDoc");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("encryptedWordDocx.docx");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        parser.parse(stream, body, metadata);
    } catch (Exception ex) {
        // do nothing - a parse failure is expected for an encrypted document
    }
    assertFalse(body.toString().contains("Word doc Encrypted"));
}
 
Example #12
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testParseNotRequiringOCR() throws Exception {
    System.out.println("parse");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_nonOCR_test.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        parser.parse(stream, body, metadata);
    } finally {
        stream.close();
    }
    assertTrue(body.toString().contains("An Example Paper"));
}
 
Example #13
Source File: SimplePageParser.java    From flink-crawler with Apache License 2.0
@Override
public void open(RuntimeContext context) throws Exception {
    super.open(context);

    _parser = new AutoDetectParser();
    _linkExtractor.setLinkTags(getParserPolicy().getLinkTags());
    _linkExtractor.setLinkAttributeTypes(getParserPolicy().getLinkAttributeTypes());
}
 
Example #14
Source File: TikaIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  InputStream stream = Channels.newInputStream(file.open());
  try (InputStream tikaStream = TikaInputStream.get(stream)) {
    Parser parser =
        tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);

    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Metadata tikaMetadata =
        spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = file.getMetadata().resourceId().toString();
    ParseResult res;
    ContentHandler tikaHandler = new ToTextContentHandler();
    try {
      parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
      res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
    }

    c.output(res);
  }
}
 
Example #15
Source File: TikaAnalysis.java    From tutorials with MIT License
public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    parser.parse(stream, handler, metadata, context);
    return metadata;
}
 
Example #16
Source File: TearlineContentExtractor.java    From baleen with Apache License 2.0
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
  super.doProcessStream(stream, source, jCas);

  try {
    BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    AutoDetectParser autoParser = new AutoDetectParser();
    autoParser.parse(stream, textHandler, metadata, context);

    String fullContent = textHandler.toString();
    Matcher m = tearlinePattern.matcher(fullContent);
    if (m.find()) {
      jCas.setDocumentText(removeBoilerplate(fullContent.substring(0, m.start())).trim());
    } else {
      jCas.setDocumentText(removeBoilerplate(fullContent).trim());
    }

    for (String name : metadata.names()) {
      addMetadata(jCas, name, metadata.get(name));
    }
  } catch (SAXException | TikaException e) {
    getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
  }
}
 
Example #17
Source File: JATEUtil.java    From jate with GNU Lesser General Public License v3.0
public static String parseToPlainText(InputStream fileStream) {
    BodyContentHandler handler = new BodyContentHandler();
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    String rawContent = "";

    try {
        parser.parse(fileStream, handler, metadata);
        rawContent = handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        LOG.debug("Parsing Exception while extracting content from current file. "
                + e.toString());
    }
    return rawContent;
}
 
Example #18
Source File: AttachAttribute.java    From entando-components with GNU Lesser General Public License v3.0
@Override
public String getIndexeableFieldValue() {
	StringBuilder buffer = new StringBuilder();
	if (null != super.getIndexeableFieldValue()) {
		buffer.append(super.getIndexeableFieldValue());
	}
	String extraValue = null;
	ResourceInterface resource = this.getResource();
	if (resource != null) {
		InputStream is = ((AttachResource) resource).getResourceStream();
		if (null != is) {
			AutoDetectParser parser = new AutoDetectParser();
			BodyContentHandler handler = new BodyContentHandler(-1);
			Metadata metadata = new Metadata();
			try {
				parser.parse(is, handler, metadata);
				extraValue = handler.toString();
			} catch (Throwable t) {
				_logger.error("Error while processing the parsing", t);
			} finally {
				try {
					is.close();
				} catch (IOException ex) {
					_logger.error("Error closing stream", ex);
				}
			}
		}
	}
	if (null != extraValue) {
		buffer.append(" ").append(extraValue);
	}
	return buffer.toString();
}
 
Example #19
Source File: ExtractMediaMetadata.java    From nifi with Apache License 2.0
@SuppressWarnings("unused")
@OnScheduled
public void onScheduled(ProcessContext context) {
    String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue();
    if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) {
        metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput));
    } else {
        metadataKeyFilterRef.set(null);
    }

    autoDetectParser = new AutoDetectParser();
}
 
Example #20
Source File: TikaAnalysis.java    From tutorials with MIT License
public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
    Parser parser = new AutoDetectParser();
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    parser.parse(stream, handler, metadata, context);
    return handler.toString();
}
 
Example #21
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Ignore
@Test
public void testMassiveOCRDoc() throws Exception {
    System.out.println("testMassiveOCRDoc");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("long_OCR_doc.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    parser.parse(stream, body, metadata);
    assertTrue(body.toString().contains("Saliva-derived genomic DNA samples were genotyped using"));
}
 
Example #22
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testParseRequiringOCR() throws Exception {
    System.out.println("parse");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("tika/testdocs/pdf_ocr_test.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    parser.parse(stream, body, metadata);
    String parsedString = body.toString();
    // From first page
    assertTrue(parsedString.contains("Father or mother"));
    // From second (last) page
    assertTrue(parsedString.contains("how you have determined who is the Nearest"));
}
 
Example #23
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void testEncryptedPDFDoc() throws Exception {
    System.out.println("testEncryptedPDFDoc");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("pdf_encrypted_test.pdf");
    AutoDetectParser parser = new AutoDetectParser(config);
    //PDFPreprocessorParser parser = new PDFPreprocessorParser();
    BodyContentHandler body = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try {
        parser.parse(stream, body, metadata);
    } catch (Exception ex) {
        // do nothing - a parse failure is expected for an encrypted document
    }
    assertFalse(body.toString().contains("PDF Encrypted"));
}
 
Example #24
Source File: ArchiveContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
  
  boolean recurse = includeContents;
  if(options.getIncludeEmbedded() != null)
  {
     recurse = options.getIncludeEmbedded();
  }
  
  if(recurse)
  {
     // Use an auto detect parser to handle the contents
     if(tikaConfig == null)
     {
         tikaConfig = TikaConfig.getDefaultConfig();
     }
     context.set(Parser.class, new AutoDetectParser(tikaConfig));
  }
  else
  {
      // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096
      // so we need to specify an empty one if we don't want the recurse parsing to happen
      context.set(Parser.class, new EmptyParser());
  }
  return context;
}
 
Example #25
Source File: TikaPoweredContainerExtractor.java    From alfresco-repository with GNU Lesser General Public License v3.0
/**
 * Injects the TikaConfig to use
 * 
 * @param tikaConfig The Tika Config to use 
 */
public void setTikaConfig(TikaConfig tikaConfig)
{
    this.config = tikaConfig;
    
    // Setup the detector and parser
    detector = new DefaultDetector(config.getMimeRepository());
    parser = new AutoDetectParser(detector);
}
 
Example #26
Source File: TikaTest.java    From tika-server with Apache License 2.0
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
    RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));

    try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
        wrapper.parse(is, handler, metadata, context);
    }
    return handler.getMetadataList();
}
 
Example #27
Source File: TikaTest.java    From tika-server with Apache License 2.0
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);

    RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
        wrapper.parse(is, handler, new Metadata(), context);
    }
    return handler.getMetadataList();
}
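As a usage note for the two helpers above: RecursiveParserWrapperHandler.getMetadataList() returns one Metadata object per parsed document, container first and embedded documents after it, and the extracted content is typically exposed through the "X-TIKA:content" metadata key. A hedged illustration of a call site; the file name is a placeholder that would need to exist under /test-documents/.

// Hypothetical call site in the same test class
List<Metadata> metadataList = getRecursiveMetadata("embedded.docx", new ParseContext());
// Index 0 is the container document; embedded documents follow it
String containerContent = metadataList.get(0).get("X-TIKA:content");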
 
Example #28
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0
@Override
public JSONObject toJson(String filePath) throws OperationException {

    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        parser.parse(stream, handler, metadata);
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }

    final String fileText = handler.toString();
    if(fileText == null || fileText.isEmpty()) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Attempting to import an empty document");
    }

    JSONObject jsonObject = new JSONObject();
    jsonObject.put("_txt", fileText);

    String[] metadataNames = metadata.names();
    for(String name : metadataNames) {
        jsonObject.put(name, metadata.get(name));
    }

    return jsonObject;
}
 
Example #29
Source File: TikaAutoInterpreter.java    From db with GNU Affero General Public License v3.0
@Override
public String toText(String filePath) throws OperationException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = new FileInputStream(new File(filePath))) {
        parser.parse(stream, handler, metadata);
        return handler.toString();
    } catch (IOException | SAXException | TikaException e) {
        throw new OperationException(ErrorCode.UNRECOGNISED_DOCUMENT_FORMAT, "Could not auto-detect document for reading");
    }
}
 
Example #30
Source File: ExtractMediaMetadata.java    From localization_nifi with Apache License 2.0
@SuppressWarnings("unused")
@OnScheduled
public void onScheduled(ProcessContext context) {
    String metadataKeyFilterInput = context.getProperty(METADATA_KEY_FILTER).getValue();
    if (metadataKeyFilterInput != null && metadataKeyFilterInput.length() > 0) {
        metadataKeyFilterRef.set(Pattern.compile(metadataKeyFilterInput));
    } else {
        metadataKeyFilterRef.set(null);
    }

    autoDetectParser = new AutoDetectParser();
}