Java Code Examples for org.apache.tika.parser.ParseContext#set()

The following examples show how to use org.apache.tika.parser.ParseContext#set(). They are taken from open-source projects; the original project and source file are noted above each example.
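
Before the project examples, here is a minimal, self-contained sketch of the basic pattern: ParseContext is a class-keyed map, and set(Class<T>, T) registers an object (a Parser, a parser configuration, a PasswordProvider, and so on) that parsers can look up while they run. The class name and the "example.pdf" path below are illustrative, not taken from any of the examples.

import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

public class ParseContextSetSketch {

    public static void main(String[] args) throws Exception {
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();

        // Register the parser itself so embedded documents are parsed recursively.
        context.set(Parser.class, parser);

        // "example.pdf" is an illustrative path.
        try (InputStream stream = new FileInputStream("example.pdf")) {
            BodyContentHandler handler = new BodyContentHandler(-1);
            Metadata metadata = new Metadata();
            parser.parse(stream, handler, metadata, context);
            System.out.println(handler.toString());
        }
    }
}
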
Example 1
Source File: NodeTika.java    From node-tika with MIT License
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) {
	final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();

	if (options == null) {

		// Disable OCR and return if no options are specified.
		disableOcr(ocrConfig);
		parseContext.set(TesseractOCRConfig.class, ocrConfig);

		return;
	}

	fillOcrOptions(ocrConfig, options);
	parseContext.set(TesseractOCRConfig.class, ocrConfig);

	final PDFParserConfig pdfParserConfig = new PDFParserConfig();
	fillPdfOptions(pdfParserConfig, options);
	parseContext.set(PDFParserConfig.class, pdfParserConfig);

	// Allow a password to be specified for encrypted files.
	fillPassword(parseContext, options);
}
 
Example 2
Source File: ContentExtractor.java    From jate with GNU Lesser General Public License v3.0
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException {
	WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength);
	try {
		ParseContext context = new ParseContext();
		context.set(Parser.class, txtParser);
		txtParser.parse(stream, new BodyContentHandler(handler), metadata, context);
	} catch (SAXException e) {
		if (!handler.isWriteLimitReached(e)) {
			// This should never happen with BodyContentHandler...
			throw new TikaException("Unexpected SAX processing failure", e);
		}
	} finally {
		stream.close();
	}
	return handler.toString();
}
 
Example 3
Source File: TikaCallable.java    From flink-crawler with Apache License 2.0
/**
 * Decide whether we need to set up our own HtmlMapper, in case the link extractor uses tags that aren't part of
 * the default set.
 * 
 * @return a ParseContext, with a custom HtmlMapper registered if any link tag is missing from the default mapping
 */
private ParseContext makeParseContext() {
    ParseContext result = new ParseContext();

    Set<String> validTags = _linkExtractor.getLinkTags();
    HtmlMapper defaultMapper = DefaultHtmlMapper.INSTANCE;
    for (String tag : validTags) {
        if (defaultMapper.mapSafeElement(tag) == null) {
            result.set(HtmlMapper.class,
                    new CustomHtmlMapper(validTags, _linkExtractor.getLinkAttributeTypes()));
            break;
        }
    }

    return result;
}
 
Example 4
Source File: PDFPreprocessorParserTest.java    From CogStack-Pipeline with Apache License 2.0
@Test
public void offersNoTypesIfNotFound() throws Exception {
    PDFPreprocessorParser parser = new PDFPreprocessorParser();
    DefaultParser defaultParser = new DefaultParser();
    MediaType pdf = MediaType.application("pdf");

    // With an invalid path, will offer no types
    ImageMagickConfig invalidConfig = new ImageMagickConfig();
    invalidConfig.setImageMagickPath("/made/up/path");

    ParseContext parseContext = new ParseContext();
    parseContext.set(ImageMagickConfig.class, invalidConfig);

    // No types offered
    assertEquals(0, parser.getSupportedTypes(parseContext).size());

    // And DefaultParser won't use us
    assertEquals(PDFParser.class, defaultParser.getParsers(parseContext).get(pdf).getClass());
}
 
Example 5
Source File: AlterPDFParserTest.java    From tika-server with Apache License 2.0
@Test
public void testDoubleSpacedText() throws Exception {
    PDFParser pdfParser = new AlterPDFParser();
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);

    InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/double_space_test.pdf");
    String text = getText(stream, pdfParser, context);
    stream.close();

    assertTrue(text.length() > 100);
}
 
Example 6
Source File: NodeTika.java    From node-tika with MIT License
private static void fillPassword(ParseContext parseContext, Map<String, Object> options) {
	final Object password = options.get("password");

	if (password == null) {
		return;
	}

	parseContext.set(PasswordProvider.class, new PasswordProvider() {

		@Override
		public String getPassword(Metadata metadata) {
			return password.toString();
		}
	});
}
 
Example 7
Source File: EmbeddedDocumentMemoryExtractor.java    From extract with MIT License
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest) throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    parser.parse(new FileInputStream(rootDocument.getPath().toFile()), handler, rootDocument.getMetadata(), context);

    return extractor.getDocument();
}
 
Example 8
Source File: TikaLambdaHandler.java    From tika-lambda with Apache License 2.0
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException {
  _logger.log("Extracting text with Tika");
  String extractedText = "";

  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance();
  TransformerHandler handler = factory.newTransformerHandler();
  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text");
  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
  StringWriter sw = new StringWriter();
  handler.setResult(new StreamResult(sw));
  AutoDetectParser parser = new AutoDetectParser();
  ParseContext parseContext = new ParseContext();
  parseContext.set(Parser.class, parser);

  Tika tika = new Tika();
  Metadata tikaMetadata = new Metadata();
  try {
    // for synthetic transactions
    if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) {
      throw new TikaException("Test Tika Exception");
    }
    parser.parse(objectData, handler, tikaMetadata, parseContext);
    extractedText = sw.toString();
  } catch( TikaException e) {
    _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage());
    return assembleExceptionResult(bucket, key, e);
  }
  _logger.log("Tika parsing success");
  return assembleExtractionResult(bucket, key, extractedText, tikaMetadata);
}
 
Example 9
Source File: TikaIO.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  InputStream stream = Channels.newInputStream(file.open());
  try (InputStream tikaStream = TikaInputStream.get(stream)) {
    Parser parser =
        tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);

    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Metadata tikaMetadata =
        spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = file.getMetadata().resourceId().toString();
    ParseResult res;
    ContentHandler tikaHandler = new ToTextContentHandler();
    try {
      parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
      res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
    }

    c.output(res);
  }
}
 
Example 10
Source File: ParseContextConfig.java    From lucene-solr with Apache License 2.0
@SuppressWarnings({"rawtypes", "unchecked"})
public ParseContext create() {
  final ParseContext result = new ParseContext();

  for (Map.Entry<Class<?>, Object> entry : entries.entrySet()){
    result.set((Class) entry.getKey(), entry.getValue());
  }

  return result;
}
 
Example 11
Source File: TikaPoweredMetadataExtracter.java    From alfresco-repository with GNU Lesser General Public License v3.0
/**
 * By default returns a new ParseContext
 * 
 * @param metadata
 * @param sourceMimeType
 * @return the parse context
 */
protected ParseContext buildParseContext(Metadata metadata, String sourceMimeType)
{
    ParseContext context = new ParseContext();
    DocumentSelector selector = getDocumentSelector(metadata, sourceMimeType);
    if (selector != null)
    {
        context.set(DocumentSelector.class, selector);
    }
    return context;
}
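
A side note on what this example registers: DocumentSelector (org.apache.tika.extractor.DocumentSelector) is a single-method callback that Tika consults when deciding whether an embedded document should be processed. A minimal illustrative implementation that skips embedded images (not taken from the Alfresco code) might look like this:

import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.metadata.Metadata;

// Illustrative selector: skip embedded images, accept everything else.
public class SkipImagesSelector implements DocumentSelector {

    @Override
    public boolean select(Metadata metadata) {
        String contentType = metadata.get(Metadata.CONTENT_TYPE);
        return contentType == null || !contentType.startsWith("image/");
    }
}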
 
Example 12
Source File: ArchiveContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0
@Override
protected ParseContext buildParseContext(Metadata metadata,
     String targetMimeType, TransformationOptions options) {
  ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
  
  boolean recurse = includeContents;
  if(options.getIncludeEmbedded() != null)
  {
     recurse = options.getIncludeEmbedded();
  }
  
  if(recurse)
  {
     // Use an auto detect parser to handle the contents
     if(tikaConfig == null)
     {
         tikaConfig = TikaConfig.getDefaultConfig();
     }
     context.set(Parser.class, new AutoDetectParser(tikaConfig));
  }
  else
  {
      // REPO-1066: an AutoDetectParser is the default in Tika after: https://issues.apache.org/jira/browse/TIKA-2096
      // so we need to specify an empty one if we don't want the recurse parsing to happen
      context.set(Parser.class, new EmptyParser());
  }
  return context;
}
 
Example 13
Source File: PdfBoxContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0
@Override
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
    ParseContext context = super.buildParseContext(metadata, targetMimeType, options);
    if (pdfParserConfig != null)
    {
        pdfParserConfig.setExtractBookmarksText(extractBookmarksText);
        context.set(PDFParserConfig.class, pdfParserConfig);
    }
    // TODO: Possibly extend TransformationOptions to allow for per-transform PDFParserConfig?
    return context;
}
 
Example 14
Source File: TikaPoweredContentTransformer.java    From alfresco-repository with GNU Lesser General Public License v3.0
/**
 * By default returns a ParseContext that does not recurse
 */
protected ParseContext buildParseContext(Metadata metadata, String targetMimeType, TransformationOptions options)
{
   ParseContext context = new ParseContext();
   DocumentSelector selector = getDocumentSelector(metadata, targetMimeType, options);
   if (selector != null)
   {
       context.set(DocumentSelector.class, selector);
   }
   return context;
}
 
Example 15
Source File: AlterPDFParserTest.java    From tika-server with Apache License 2.0
private String getTextFromDoc(String docPath,
                              AlterPDFParser.ParsePdfMode parseMode) throws Exception {
    AlterPDFParser pdfParser = new AlterPDFParser();
    pdfParser.defaultParseMode = parseMode;
    ParseContext context = new ParseContext();
    PDFParserConfig config = new PDFParserConfig();
    context.set(PDFParserConfig.class, config);

    InputStream stream = AlterPDFParserTest.class.getResourceAsStream(docPath);
    String text = getText(stream, pdfParser, context);
    stream.close();
    return text;
}
 
Example 16
Source File: AlterPDFParser.java    From tika-server with Apache License 2.0
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler,
                                  ParseContext context, Metadata metadata,
                                  PDFParserConfig config) throws
        ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {

    TesseractOCRConfig cfg = new TesseractOCRConfig();
    // Set a default timeout of 2 hours; the calling process should monitor
    // parsing and terminate it if the timeout is exceeded.
    cfg.setTimeout(60 * 60 * 2);
    context.set(TesseractOCRConfig.class, cfg);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
    boolean oldExtractInlineImages = config.getExtractInlineImages();
    boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly();

    // explicitly tells Tika to use OCR
    config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
    config.setExtractInlineImages(true);
    config.setExtractUniqueInlineImagesOnly(false);

    Class c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML");
    Method m = c.getDeclaredMethod("process",
            PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class,
            PDFParserConfig.class);
    m.setAccessible(true);
    m.invoke(null, document, handler, context, metadata, config);

    config.setOcrStrategy(oldOcrStrategy);
    config.setExtractInlineImages(oldExtractInlineImages);
    config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly);
}
 
Example 17
Source File: HTMLRenderingEngine.java    From alfresco-repository with GNU Lesser General Public License v3.0
/**
 * Asks Tika to translate the contents into HTML
 */
private void generateHTML(Parser p, RenderingContext context)
{
   ContentReader contentReader = context.makeContentReader();
   
   // Setup things to parse with
   StringWriter sw = new StringWriter();
   ContentHandler handler = buildContentHandler(sw, context);
   
   // Tell Tika what we're dealing with
   Metadata metadata = new Metadata();
   metadata.set(
         Metadata.CONTENT_TYPE, 
         contentReader.getMimetype()
   );
   metadata.set(
         Metadata.RESOURCE_NAME_KEY, 
         nodeService.getProperty( 
               context.getSourceNode(),
               ContentModel.PROP_NAME
         ).toString()
   );

   // Our parse context needs to extract images
   ParseContext parseContext = new ParseContext();
   parseContext.set(Parser.class, new TikaImageExtractingParser(context));
   
   // Parse
   try {
      p.parse(
            contentReader.getContentInputStream(),
            handler, metadata, parseContext
      );
   } catch(Exception e) {
      throw new RenditionServiceException("Tika HTML Conversion Failed", e);
   }
   
   // As a string
   String html = sw.toString();
   
   // If we're doing body-only, remove all the html namespaces
   //  that will otherwise clutter up the document
   boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
   if(bodyOnly) {
      html = html.replaceAll("<\\?xml.*?\\?>", "");
      html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"","<p");
      html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"","<h\\1");
      html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"","<div");
      html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"","<table");
      html = html.replaceAll("&#13;","");
   }
   
   // Save it
   ContentWriter contentWriter = context.makeContentWriter();
   contentWriter.setMimetype("text/html");
   contentWriter.putContent( html );
}
 
Example 18
Source File: TikaParser.java    From quarkus with Apache License 2.0
protected TikaContent parseStream(InputStream entityStream, String contentType, ContentHandler tikaHandler)
        throws TikaParseException {
    try {
        ParseContext context = new ParseContext();
        // AutoDetectParser must be set in the context to enable the parsing of the embedded content
        Parser contextParser = this.appendEmbeddedContent ? parser : ((RecursiveParserWrapper) parser).getWrappedParser();
        context.set(Parser.class, contextParser);

        org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
        if (contentType != null) {
            tikaMetadata.set(HttpHeaders.CONTENT_TYPE, contentType);
        }

        try (InputStream tikaStream = TikaInputStream.get(entityStream)) {
            parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
            if (this.appendEmbeddedContent) {
                // the embedded content if any has already been appended to the master content
                return new TikaContent(tikaHandler == null ? null : tikaHandler.toString().trim(), convert(tikaMetadata));
            } else {
                RecursiveParserWrapperHandler rHandler = (RecursiveParserWrapperHandler) tikaHandler;

                // The metadata list represents the master and embedded content (text and metadata)
                // The first metadata in the list represents the master (outer) content
                List<org.apache.tika.metadata.Metadata> allMetadata = rHandler.getMetadataList();
                String masterText = allMetadata.get(0).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);

                // Embedded (inner) content starts from the index 1.
                List<TikaContent> embeddedContent = new LinkedList<>();
                for (int i = 1; i < allMetadata.size(); i++) {
                    String embeddedText = allMetadata.get(i).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
                    // the embedded text can be null if the given document is an image
                    // and no text recognition parser is enabled
                    if (embeddedText != null) {
                        embeddedContent.add(new TikaContent(embeddedText.trim(), convert(allMetadata.get(i))));
                    }
                }
                return new TikaContent(masterText, convert(allMetadata.get(0)), embeddedContent);

            }
        }
    } catch (Exception e) {
        final String errorMessage = "Unable to parse the stream"
                + (contentType == null ? "" : " for content-type: " + contentType);
        throw new TikaParseException(errorMessage, e);
    }
}
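
The parser and tikaHandler used above are fields and parameters that this snippet does not show. Assuming they follow the standard Tika recursive-parsing setup (an assumption, not the actual Quarkus wiring), they could be created roughly like this:

import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;

class RecursiveSetupSketch {

    // Wrap an AutoDetectParser so each embedded document gets its own metadata entry.
    static Parser newRecursiveParser() {
        return new RecursiveParserWrapper(new AutoDetectParser());
    }

    // One plain-text handler per document, with no write limit (-1).
    static RecursiveParserWrapperHandler newHandler() {
        return new RecursiveParserWrapperHandler(
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
    }
}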
 
Example 19
Source File: NodeTika.java    From node-tika with MIT License
public static String extractText(String uri, Map<String, Object> options) throws Exception {
	final AutoDetectParser parser = createParser();
	final Metadata metadata = new Metadata();
	final ParseContext context = new ParseContext();

	String outputEncoding = null;
	String contentType = null;
	int maxLength = -1;

	if (options != null) {
		Object option;

		option = options.get("outputEncoding");
		if (option != null) {
			outputEncoding = option.toString();
		}

		option = options.get("contentType");
		if (option != null) {
			contentType = option.toString();
		}

		option = options.get("maxLength");
		if (option != null) {
			maxLength = (int)Float.parseFloat(option.toString());
		}
	}

	if (outputEncoding == null) {
		outputEncoding = "UTF-8";
	}

	fillMetadata(parser, metadata, contentType, uri);
	fillParseContext(context, options);

	final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
	final OutputStreamWriter writer = new OutputStreamWriter(outputStream, outputEncoding);
	final WriteOutContentHandler contentHandler = new WriteOutContentHandler(writer, maxLength);

	final TikaInputStream inputStream = createInputStream(uri, metadata);

	// Set up recursive parsing of archives.
	// See: http://wiki.apache.org/tika/RecursiveMetadata
	context.set(Parser.class, parser);
	context.set(EmbeddedDocumentExtractor.class, new ParsingEmbeddedDocumentExtractor(context));

	try {
		parser.parse(inputStream, new BodyContentHandler(contentHandler), metadata, context);
	} catch (Throwable e) {
		if (!contentHandler.isWriteLimitReached(e)) {
			throw e;
		} else {
			writer.close();
		}
	} finally {
		inputStream.close();
	}

	return outputStream.toString(outputEncoding);
}
 
Example 20
Source File: SimplePageParser.java    From flink-crawler with Apache License 2.0
/**
 * @param parserPolicy
 *            to customize operation of the parser
 * @param pageScorer
 *            to score importance of page (priority of its outlinks)
 * @param includeMarkup
 *            true if output should be raw HTML, versus extracted text <BR>
 * <BR>
 *            <B>Note:</B> There is no need to construct your own {@link SimpleLinkExtractor}
 *            simply to control the set of link tags and attributes it processes. Instead, use
 *            {@link ParserPolicy#setLinkTags} and {@link ParserPolicy#setLinkAttributeTypes},
 *            and then pass this policy to {@link SimplePageParser#SimpleParser(ParserPolicy)}.
 */
public SimplePageParser(BaseContentExtractor contentExtractor, BaseLinkExtractor linkExtractor,
        ParserPolicy parserPolicy, BasePageScorer pageScorer, boolean includeMarkup) {
    super(parserPolicy, pageScorer);

    _contentExtractor = contentExtractor;
    _linkExtractor = linkExtractor;

    if (includeMarkup) {
        _parseContext = new ParseContext();
        _parseContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
    }
}