org.apache.tika.parser.ParseContext Java Examples
The following examples show how to use
org.apache.tika.parser.ParseContext.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ContentExtractor.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
private String parseTXTToString(InputStream stream, Metadata metadata) throws IOException, TikaException { WriteOutContentHandler handler = new WriteOutContentHandler(maxStringLength); try { ParseContext context = new ParseContext(); context.set(Parser.class, txtParser); txtParser.parse(stream, new BodyContentHandler(handler), metadata, context); } catch (SAXException e) { if (!handler.isWriteLimitReached(e)) { // This should never happen with BodyContentHandler... throw new TikaException("Unexpected SAX processing failure", e); } } finally { stream.close(); } return handler.toString(); }
Example #2
Source File: AlterPDFParser.java From tika-server with Apache License 2.0 | 6 votes |
/**
 * Reflectively calls the {@code process} method declared on
 * {@code org.apache.tika.parser.pdf.PDF2XHTML}, with the OCR strategy of {@code config}
 * temporarily overridden.
 *
 * <p>The method is looked up by name, made accessible and invoked with a {@code null}
 * receiver. The previous OCR strategy is restored in a {@code finally} block so that a
 * failing invocation cannot leave the caller's {@code config} in the overridden state.
 *
 * @param document PDF document handed to {@code process}
 * @param handler  content handler handed to {@code process}
 * @param context  parse context handed to {@code process}
 * @param metadata metadata handed to {@code process}
 * @param config   parser configuration; its OCR strategy is changed only for the call
 * @param noOCR    {@code true} selects {@code NO_OCR}, {@code false} selects
 *                 {@code OCR_AND_TEXT_EXTRACTION}
 * @throws ClassNotFoundException    if the {@code PDF2XHTML} class cannot be loaded
 * @throws NoSuchMethodException     if no matching {@code process} method is declared
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callPDF2XHTMLProcess(PDDocument document, ContentHandler handler, ParseContext context,
                                  Metadata metadata, PDFParserConfig config, boolean noOCR)
        throws ClassNotFoundException, NoSuchMethodException, InvocationTargetException,
        IllegalAccessException {
    Class<?> pdf2xhtml = Class.forName("org.apache.tika.parser.pdf.PDF2XHTML");
    Method process = pdf2xhtml.getDeclaredMethod("process", PDDocument.class, ContentHandler.class,
            ParseContext.class, Metadata.class, PDFParserConfig.class);
    process.setAccessible(true);

    PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy();
    config.setOcrStrategy(noOCR
            ? PDFParserConfig.OCR_STRATEGY.NO_OCR
            : PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
    try {
        process.invoke(null, document, handler, context, metadata, config);
    } finally {
        // Always undo the temporary override, even when the invocation throws.
        config.setOcrStrategy(oldOcrStrategy);
    }
}
Example #3
Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
/**
 * Parses an in-memory {@link Image} by writing it to a temporary PNG file and delegating to
 * the stream-based {@code parse} overload.
 *
 * <p>The image is painted onto an RGB {@link BufferedImage} before encoding; without that
 * step the PNG would contain only the buffer's initial (blank) pixels. The file output
 * stream is closed before the file is read back, and the temporary resources are disposed
 * only after both streams have been closed.
 *
 * <p>NOTE(review): width, height and drawing all use a {@code null} observer, so this
 * presumably expects a fully loaded image - confirm against callers.
 *
 * @param image    image to run through the parser
 * @param handler  receives the parse output
 * @param metadata metadata passed to the delegate
 * @param context  parse context passed to the delegate
 * @throws IOException   if writing or reading the temporary file fails
 * @throws SAXException  if the delegate reports a SAX failure
 * @throws TikaException if the delegate fails or the temporary resources cannot be disposed
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    TemporaryResources tmp = new TemporaryResources();
    try {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);

        // Copy the source pixels into the buffer that is about to be encoded.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }

        File file = tmp.createTemporaryFile();
        try (FileOutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    } finally {
        // Dispose last: the streams above are already closed at this point.
        tmp.dispose();
    }
}
Example #4
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
@Test public void offersNoTypesIfNotFound() throws Exception { PDFPreprocessorParser parser = new PDFPreprocessorParser(); DefaultParser defaultParser = new DefaultParser(); MediaType pdf = MediaType.application("pdf"); // With an invalid path, will offer no types ImageMagickConfig invalidConfig = new ImageMagickConfig(); invalidConfig.setImageMagickPath("/made/up/path"); ParseContext parseContext = new ParseContext(); parseContext.set(ImageMagickConfig.class, invalidConfig); // No types offered assertEquals(0, parser.getSupportedTypes(parseContext).size()); // And DefaultParser won't use us assertEquals(PDFParser.class, defaultParser.getParsers(parseContext).get(pdf).getClass()); }
Example #5
Source File: PDFPreprocessorParserTest.java From CogStack-Pipeline with Apache License 2.0 | 6 votes |
@Test public void offersTypesIfFound() throws Exception { PDFPreprocessorParser parser = new PDFPreprocessorParser(); //DefaultParser defaultParser = new DefaultParser(); ParseContext parseContext = new ParseContext(); MediaType pdf = MediaType.application("pdf"); // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG. assumeTrue(canRun()); assertEquals(1, parser.getSupportedTypes(parseContext).size()); assertTrue(parser.getSupportedTypes(parseContext).contains(pdf)); // DefaultParser will not select the PDFPreprocessorParser, unless configured in tika config //assertEquals(PDFPreprocessorParser.class, defaultParser.getParsers(parseContext).get(pdf).getClass()); }
Example #6
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 6 votes |
/**
 * Builds the parse context, installing a {@code CustomHtmlMapper} only when it is needed.
 *
 * <p>A custom mapper is registered if at least one of the link extractor's tags is not a
 * safe element of {@code DefaultHtmlMapper} (i.e. {@code mapSafeElement} returns
 * {@code null} for it). Otherwise the context is returned empty.
 *
 * @return a new parse context, with an {@code HtmlMapper} entry only when required
 */
private ParseContext makeParseContext() {
    final ParseContext parseContext = new ParseContext();
    final Set<String> linkTags = _linkExtractor.getLinkTags();
    final HtmlMapper stockMapper = DefaultHtmlMapper.INSTANCE;

    final boolean hasUnmappedTag = linkTags.stream()
            .anyMatch(tag -> stockMapper.mapSafeElement(tag) == null);
    if (hasUnmappedTag) {
        parseContext.set(HtmlMapper.class,
                new CustomHtmlMapper(linkTags, _linkExtractor.getLinkAttributeTypes()));
    }
    return parseContext;
}
Example #7
Source File: NodeTika.java From node-tika with MIT License | 6 votes |
private static void fillParseContext(ParseContext parseContext, Map<String, Object> options) { final TesseractOCRConfig ocrConfig = new TesseractOCRConfig(); if (options == null) { // Disable OCR and return if no options are specified. disableOcr(ocrConfig); parseContext.set(TesseractOCRConfig.class, ocrConfig); return; } fillOcrOptions(ocrConfig, options); parseContext.set(TesseractOCRConfig.class, ocrConfig); final PDFParserConfig pdfParserConfig = new PDFParserConfig(); fillPdfOptions(pdfParserConfig, options); parseContext.set(PDFParserConfig.class, pdfParserConfig); // Allow a password to be specified for encrypted files. fillPassword(parseContext, options); }
Example #8
Source File: FTConnector.java From openprodoc with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Extracts text and metadata from a document stream with an {@code AutoDetectParser}.
 *
 * <p>On success the {@code FileMetadata} field holds one {@code key=value} line per
 * metadata name and the {@code FullText} field holds the extracted body text. Any exception
 * is reported through {@code PDException.GenPDException}.
 *
 * <p>NOTE(review): if {@code GenPDException} returns instead of throwing, the value of
 * {@code FullText} from before this call is returned - confirm.
 *
 * @param Bytes document content to parse
 * @return the current value of the {@code FullText} field
 * @throws PDException declared for failures reported via {@code GenPDException}
 */
protected String Convert(InputStream Bytes) throws PDException {
    try {
        ContentHandler textHandler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        Parser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(Bytes, textHandler, metadata, context);

        // Accumulate in a builder: repeated String += is quadratic in the metadata size.
        StringBuilder metadataLines = new StringBuilder();
        for (String key : metadata.names()) {
            metadataLines.append(key).append("=").append(metadata.get(key)).append("\n");
        }
        FileMetadata = metadataLines.toString();
        FullText = textHandler.toString();
    } catch (Exception ex) {
        PDException.GenPDException("Error_extracting_content_from_doc", ex.getLocalizedMessage());
    }
    return (FullText);
}
Example #9
Source File: TikaContentExtractor.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Parses the stream with an {@code AutoDetectParser}, stores the extracted text as the
 * document text of the JCas, and forwards every metadata entry to {@code addMetadata}.
 *
 * <p>SAX and Tika failures are logged as warnings; if the JCas has no document text at that
 * point, {@code CORRUPT_FILE_TEXT} is set instead.
 *
 * @param stream content to parse
 * @param source label used in the warning message
 * @param jCas   receives the text and metadata
 * @throws IOException if the superclass call or the parser fails with an I/O error
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        final BodyContentHandler bodyText = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata tikaMetadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        final AutoDetectParser detector = new AutoDetectParser();

        detector.parse(stream, bodyText, tikaMetadata, parseContext);
        jCas.setDocumentText(bodyText.toString());

        for (String key : tikaMetadata.names()) {
            final String value = tikaMetadata.get(key);
            addMetadata(jCas, key, value);
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
        final boolean noTextYet = Strings.isNullOrEmpty(jCas.getDocumentText());
        if (noTextYet) {
            jCas.setDocumentText(CORRUPT_FILE_TEXT);
        }
    }
}
Example #10
Source File: ImageConverter.java From openmeetings with Apache License 2.0 | 6 votes |
/**
 * Reads the image dimensions with Tika's {@code ImageParser} and stores them on the file item.
 *
 * <p>On success the width and height are taken from the {@code TIFF.IMAGE_WIDTH} and
 * {@code TIFF.IMAGE_LENGTH} metadata entries and the exit code is {@code ZERO}. Any
 * exception is logged and recorded on the result with exit code {@code -1}.
 *
 * @param f    file item that receives the width and height
 * @param img  image file to inspect
 * @param mime content type placed in the metadata before parsing
 * @return a result describing whether the dimensions could be determined
 */
private static ProcessResult initSize(BaseFileItem f, File img, String mime) {
    final ProcessResult outcome = new ProcessResult();
    outcome.setProcess("get image dimensions :: " + f.getId());
    final Parser imageParser = new ImageParser();

    try (InputStream in = new FileInputStream(img)) {
        final Metadata imageMeta = new Metadata();
        imageMeta.set(CONTENT_TYPE, mime);
        imageParser.parse(in, new DefaultHandler(), imageMeta, new ParseContext());

        final String width = imageMeta.get(TIFF.IMAGE_WIDTH);
        f.setWidth(Integer.valueOf(width));
        final String height = imageMeta.get(TIFF.IMAGE_LENGTH);
        f.setHeight(Integer.valueOf(height));

        outcome.setExitCode(ZERO);
    } catch (Exception e) {
        log.error("Error while getting dimensions", e);
        outcome.setError("Error while getting dimensions");
        outcome.setException(e.getMessage());
        outcome.setExitCode(-1);
    }
    return outcome;
}
Example #11
Source File: TikaExtractor.java From ache with Apache License 2.0 | 6 votes |
/**
 * Extracts text and metadata from a stream with the configured {@code parser}.
 *
 * <p>Text goes through a {@code BoilerpipeContentHandler} using
 * {@code KeepEverythingExtractor} into a {@code BodyContentHandler} limited to
 * {@code MAX_CHARACTERS}.
 *
 * @param stream      content to parse
 * @param fileName    passed to {@code createMetadata}
 * @param contentType passed to {@code createMetadata}
 * @return the extracted text plus a name-to-value metadata map, or {@code null} when
 *         parsing fails with an I/O, SAX or Tika exception (the failure is logged)
 */
public ParsedData parse(InputStream stream, String fileName, String contentType) {
    final BodyContentHandler bodyHandler = new BodyContentHandler(MAX_CHARACTERS);
    final BoilerpipeContentHandler boilerpipeHandler =
            new BoilerpipeContentHandler(bodyHandler, KeepEverythingExtractor.INSTANCE);
    final Metadata tikaMetadata = createMetadata(fileName, contentType);
    final ParseContext parseContext = new ParseContext();

    try {
        parser.parse(stream, boilerpipeHandler, tikaMetadata, parseContext);

        final Map<String, String> properties = new HashMap<>();
        for (String name : tikaMetadata.names()) {
            final String value = tikaMetadata.get(name);
            properties.put(name, value);
        }
        final String text = bodyHandler.toString();
        return new ParsedData(text, properties);
    } catch (IOException | SAXException | TikaException e) {
        logger.error("Failed to extract metadata using Tika.", e);
        return null;
    }
}
Example #12
Source File: AlterPDFParser.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Records basic PDF metadata, delegates to {@code callExtractMetadata}, then runs the
 * configured access checker against the collected metadata.
 *
 * @param metadata    receives the encryption flag, content type and extracted metadata
 * @param context     parse context forwarded to {@code callExtractMetadata}
 * @param localConfig supplies the {@code AccessChecker}
 * @param pdfDocument document being inspected
 * @throws NoSuchMethodException     declared for {@code callExtractMetadata}
 * @throws IllegalAccessException    declared for {@code callExtractMetadata}
 * @throws InvocationTargetException declared for {@code callExtractMetadata}
 * @throws AccessPermissionException if the access check rejects the document
 */
private void extractAndCheckMetadata(Metadata metadata, ParseContext context,
                                     PDFParserConfig localConfig, PDDocument pdfDocument)
        throws NoSuchMethodException, IllegalAccessException, InvocationTargetException,
        AccessPermissionException {
    final String encryptedFlag = Boolean.toString(pdfDocument.isEncrypted());
    metadata.set(PDF.IS_ENCRYPTED, encryptedFlag);

    final String contentType = MEDIA_TYPE.toString();
    metadata.set(Metadata.CONTENT_TYPE, contentType);

    callExtractMetadata(pdfDocument, metadata, context);

    localConfig.getAccessChecker().check(metadata);
}
Example #13
Source File: HtmlSchemaParser.java From data-prep with Apache License 2.0 | 5 votes |
/** * @see SchemaParser#parse(Request) */ @Override public Schema parse(Request request) { try { SimpleHeadersContentHandler headersContentHandler = new SimpleHeadersContentHandler(); InputStream inputStream = request.getContent(); HtmlParser htmlParser = new HtmlParser(); Metadata metadata = new Metadata(); htmlParser.parse(inputStream, headersContentHandler, metadata, new ParseContext()); List<ColumnMetadata> columns = new ArrayList<>(headersContentHandler.getHeaderValues().size()); for (String headerValue : headersContentHandler.getHeaderValues()) { columns.add(ColumnMetadata.Builder .column() // .type(Type.STRING) // ATM not doing any complicated type calculation .name(headerValue) // .id(columns.size()) // .build()); } Schema.SheetContent sheetContent = new Schema.SheetContent(); sheetContent.setColumnMetadatas(columns); return Schema.Builder .parserResult() // .sheetContents(Collections.singletonList(sheetContent)) // .draft(false) // .build(); } catch (Exception e) { LOGGER.debug("Exception during parsing html request :" + e.getMessage(), e); throw new TDPException(CommonErrorCodes.UNEXPECTED_EXCEPTION, e); } }
Example #14
Source File: TearlineContentExtractor.java From baleen with Apache License 2.0 | 5 votes |
/**
 * Parses the stream with an {@code AutoDetectParser} and sets the document text to the
 * content that precedes the first match of {@code tearlinePattern}, or to the whole content
 * when there is no match. The text is passed through {@code removeBoilerplate} and trimmed.
 * Every metadata entry is forwarded to {@code addMetadata}.
 *
 * <p>SAX and Tika failures are logged as warnings and otherwise ignored.
 *
 * @param stream content to parse
 * @param source label used in the warning message
 * @param jCas   receives the text and metadata
 * @throws IOException if the superclass call or the parser fails with an I/O error
 */
@Override
public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    super.doProcessStream(stream, source, jCas);

    try {
        final BodyContentHandler bodyText = new BodyContentHandler(Integer.MAX_VALUE);
        final Metadata tikaMetadata = new Metadata();
        final ParseContext parseContext = new ParseContext();
        final AutoDetectParser detector = new AutoDetectParser();
        detector.parse(stream, bodyText, tikaMetadata, parseContext);

        final String everything = bodyText.toString();
        final Matcher tearline = tearlinePattern.matcher(everything);
        // Keep only what comes before the tearline, when one is present.
        final String kept = tearline.find()
                ? everything.substring(0, tearline.start())
                : everything;
        jCas.setDocumentText(removeBoilerplate(kept).trim());

        for (String key : tikaMetadata.names()) {
            final String value = tikaMetadata.get(key);
            addMetadata(jCas, key, value);
        }
    } catch (SAXException | TikaException e) {
        getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
    }
}
Example #15
Source File: AlterPDFParser.java From tika-server with Apache License 2.0 | 5 votes |
private void callOCR2XHTMLProcess(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException { TesseractOCRConfig cfg = new TesseractOCRConfig(); // here I set default timeout of 2 hours // The calling process should check parsing process and terminate it by timeout cfg.setTimeout(60 * 60 * 2); context.set(TesseractOCRConfig.class, cfg); PDFParserConfig.OCR_STRATEGY oldOcrStrategy = config.getOcrStrategy(); boolean oldExtractInlineImages = config.getExtractInlineImages(); boolean oldExtractUniqueInlineImagesOnly = config.getExtractUniqueInlineImagesOnly(); // explicitly tells Tika to use OCR config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY); config.setExtractInlineImages(true); config.setExtractUniqueInlineImagesOnly(false); Class c = Class.forName("org.apache.tika.parser.pdf.OCR2XHTML"); Method m = c.getDeclaredMethod("process", PDDocument.class, ContentHandler.class, ParseContext.class, Metadata.class, PDFParserConfig.class); m.setAccessible(true); m.invoke(null, document, handler, context, metadata, config); config.setOcrStrategy(oldOcrStrategy); config.setExtractInlineImages(oldExtractInlineImages); config.setExtractUniqueInlineImagesOnly(oldExtractUniqueInlineImagesOnly); }
Example #16
Source File: AlterPDFParser.java From tika-server with Apache License 2.0 | 5 votes |
/**
 * Reflectively invokes the {@code handleXFAOnly} method declared on this object's direct
 * superclass, after making it accessible.
 *
 * @param pdDocument document forwarded to {@code handleXFAOnly}
 * @param handler    content handler forwarded to {@code handleXFAOnly}
 * @param metadata   metadata forwarded to {@code handleXFAOnly}
 * @param context    parse context forwarded to {@code handleXFAOnly}
 * @throws NoSuchMethodException     if the superclass declares no matching method
 * @throws InvocationTargetException if the invoked method itself throws
 * @throws IllegalAccessException    if the method cannot be accessed
 */
private void callHandleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata,
                               ParseContext context)
        throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    final Class<?> parentClass = getClass().getSuperclass();
    final Method xfaOnly = parentClass.getDeclaredMethod("handleXFAOnly",
            PDDocument.class, ContentHandler.class, Metadata.class, ParseContext.class);
    xfaOnly.setAccessible(true);
    xfaOnly.invoke(this, pdDocument, handler, metadata, context);
}
Example #17
Source File: ExcelHtmlParser.java From components with Apache License 2.0 | 5 votes |
/**
 * Parses HTML content and returns the values gathered by a
 * {@code SimpleValuesContentHandler}.
 *
 * <p>Parsing is best-effort: a failure is logged at debug level and whatever the handler
 * holds at that point is still returned.
 *
 * @param rawContent HTML content to parse
 * @param encoding   added to the metadata as {@code CONTENT_ENCODING}
 * @param limit      second constructor argument of the values handler
 * @return the values held by the handler after parsing
 */
public static List<List<String>> getRows(InputStream rawContent, String encoding, long limit) {
    final SimpleValuesContentHandler rowCollector = new SimpleValuesContentHandler(-1, limit);
    final HtmlParser parser = new HtmlParser();

    final Metadata htmlMetadata = new Metadata();
    htmlMetadata.add(Metadata.CONTENT_ENCODING, encoding);

    try {
        final ParseContext parseContext = new ParseContext();
        parser.parse(rawContent, rowCollector, htmlMetadata, parseContext);
    } catch (Exception e) {
        LOGGER.debug("Failed to parse the excel html format document.", e);
    }

    return rowCollector.getValues();
}
Example #18
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
private void cachedParse(final TikaInputStream tis, final ContentHandler handler, final Metadata metadata, final ParseContext context, final TesseractOCRConfig config, final boolean inline) throws IOException, SAXException, TikaException, InterruptedException { final String hash; try (final InputStream buffered = Files.newInputStream(tis.getPath())) { hash = DigestUtils.sha256Hex(buffered); } final Path cachePath = outputPath.resolve(hash); final Path cacheLock = outputPath.resolve(hash + ".lock"); // Acquire a lock both for reading and for writing. // If the lock can't be acquired, parse without caching. if (!acquireLock(config, cacheLock)) { fallbackParse(tis, handler, metadata, context, config, inline); return; } // You won't know for sure until you try.... try (final Reader reader = Files.newBufferedReader(cachePath, UTF_8)) { cacheHit(); readFromCache(reader, handler, metadata); } catch (final NoSuchFileException e) { final Path cacheTemp = outputPath.resolve(hash + ".tmp"); // Write to a temporary file and only move to the final path if parsing completes successfully. // This way we ensure that we don't cache partial results from Tesseract if there's an error. try (final Writer writer = Files.newBufferedWriter(cacheTemp, UTF_8, StandardOpenOption.CREATE)) { cacheMiss(); parseToCache(tis, handler, metadata, context, config, inline, writer); } Files.move(cacheTemp, cachePath, StandardCopyOption.ATOMIC_MOVE); } finally { Files.deleteIfExists(cacheLock); } }
Example #19
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
/**
 * Wraps a plain stream in a {@code TikaInputStream} and delegates to the
 * {@code TikaInputStream}-based {@code cachedParse} overload.
 *
 * <p>If the delegate is interrupted, the thread's interrupt flag is set again before the
 * interruption is reported as a {@link TikaException}, so callers further up the stack can
 * still observe it.
 *
 * @param in       input to parse; the wrapping stream is closed on return
 * @param handler  receives the parse output
 * @param metadata metadata passed to the delegate
 * @param context  parse context passed to the delegate
 * @param config   OCR configuration passed to the delegate
 * @param inline   forwarded to the delegate
 * @throws IOException   on I/O failures
 * @throws SAXException  if the handler reports a failure
 * @throws TikaException if parsing fails or the thread was interrupted
 */
private void cachedParse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                         final ParseContext context, TesseractOCRConfig config, final boolean inline)
        throws IOException, SAXException, TikaException {
    try (final TikaInputStream tis = TikaInputStream.get(in)) {
        cachedParse(tis, handler, metadata, context, config, inline);
    } catch (final InterruptedException e) {
        // Catching InterruptedException clears the flag; restore it before translating.
        Thread.currentThread().interrupt();
        throw new TikaException("Interrupted.", e);
    }
}
Example #20
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
/**
 * Parses with the superclass implementation while duplicating the output to a writer.
 *
 * <p>The caller's handler and a {@code WriteOutContentHandler} over {@code writer} are
 * combined in a {@code TeeContentHandler}. In inline mode the tee is wrapped in an
 * {@code XHTMLContentHandler} and {@code super.parseInline} is used; otherwise
 * {@code super.parse} is called with the tee directly.
 *
 * @param tis      input to parse
 * @param handler  caller's handler
 * @param metadata metadata for the parse
 * @param context  parse context
 * @param config   OCR configuration (used in inline mode)
 * @param inline   selects between {@code parseInline} and {@code parse}
 * @param writer   receives a copy of the output
 * @throws SAXException  if a handler reports a failure
 * @throws IOException   on I/O failures
 * @throws TikaException if parsing fails
 */
private void parseToCache(final TikaInputStream tis, final ContentHandler handler,
                          final Metadata metadata, final ParseContext context,
                          final TesseractOCRConfig config, final boolean inline,
                          final Writer writer)
        throws SAXException, IOException, TikaException {
    final ContentHandler cacheSink = new WriteOutContentHandler(writer);
    final ContentHandler both = new TeeContentHandler(handler, cacheSink);

    if (!inline) {
        super.parse(tis, both, metadata, context);
        return;
    }

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(both, metadata);
    super.parseInline(tis, xhtml, context, config);
}
Example #21
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
/**
 * Inline parse entry point. Uses the cache when {@code outputPath} is set, and the plain
 * superclass implementation otherwise.
 *
 * <p>On the cached path, a {@code null} {@code config} is replaced by the
 * {@code TesseractOCRConfig} found in {@code context}, falling back to
 * {@code DEFAULT_CONFIG}.
 *
 * @param in      input to parse
 * @param xhtml   handler receiving the output
 * @param context parse context
 * @param config  OCR configuration, may be {@code null}
 * @throws IOException   on I/O failures
 * @throws SAXException  if the handler reports a failure
 * @throws TikaException if parsing fails
 */
@Override
public void parseInline(final InputStream in, final XHTMLContentHandler xhtml,
                        final ParseContext context, final TesseractOCRConfig config)
        throws IOException, SAXException, TikaException {
    if (null == outputPath) {
        super.parseInline(in, xhtml, context, config);
        return;
    }

    final TesseractOCRConfig effectiveConfig;
    if (null != config) {
        effectiveConfig = config;
    } else {
        effectiveConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    }
    cachedParse(in, xhtml, new Metadata(), context, effectiveConfig, true);
}
Example #22
Source File: CachingTesseractOCRParser.java From extract with MIT License | 5 votes |
/**
 * Parse entry point. Uses the cache when {@code outputPath} is set, and the plain
 * superclass implementation otherwise.
 *
 * <p>On the cached path the OCR configuration is taken from {@code context}, falling back
 * to {@code DEFAULT_CONFIG}.
 *
 * @param in       input to parse
 * @param handler  receives the parse output
 * @param metadata metadata for the parse
 * @param context  parse context
 * @throws IOException   on I/O failures
 * @throws SAXException  if the handler reports a failure
 * @throws TikaException if parsing fails
 */
@Override
public void parse(final InputStream in, final ContentHandler handler, final Metadata metadata,
                  final ParseContext context)
        throws IOException, SAXException, TikaException {
    if (null == outputPath) {
        super.parse(in, handler, metadata, context);
        return;
    }

    final TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
    cachedParse(in, handler, metadata, context, ocrConfig, false);
}
Example #23
Source File: TikaLambdaHandler.java From tika-lambda with Apache License 2.0 | 5 votes |
private String doTikaStuff(String bucket, String key, InputStream objectData) throws IOException, TransformerConfigurationException, SAXException { _logger.log("Extracting text with Tika"); String extractedText = ""; SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "text"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); StringWriter sw = new StringWriter(); handler.setResult(new StreamResult(sw)); AutoDetectParser parser = new AutoDetectParser(); ParseContext parseContext = new ParseContext(); parseContext.set(Parser.class, parser); Tika tika = new Tika(); Metadata tikaMetadata = new Metadata(); try { // for synthetic transactions if( key.toLowerCase().endsWith("tika.exception.testing.pdf")) { throw new TikaException("Test Tika Exception"); } parser.parse(objectData, handler, tikaMetadata, parseContext); extractedText = sw.toString(); } catch( TikaException e) { _logger.log("TikaException thrown while parsing: " + e.getLocalizedMessage()); return assembleExceptionResult(bucket, key, e); } _logger.log("Tika parsing success"); return assembleExtractionResult(bucket, key, extractedText, tikaMetadata); }
Example #24
Source File: TikaIO.java From beam with Apache License 2.0 | 5 votes |
/**
 * Parses one readable file with Tika and outputs a {@code ParseResult} for it.
 *
 * <p>The parser is an {@code AutoDetectParser}, built from {@code tikaConfig} when one is
 * set. Metadata starts from the spec's input metadata when present, and the spec's content
 * type hint, if any, is written to it. Any exception raised by the parse yields a failure
 * result carrying the text and metadata collected so far; otherwise a success result is
 * output.
 *
 * @param c process context supplying the file and receiving the result
 * @throws Exception if opening the file or emitting the result fails
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
    ReadableFile readable = c.element();
    InputStream raw = Channels.newInputStream(readable.open());

    try (InputStream tikaStream = TikaInputStream.get(raw)) {
        Parser parser;
        if (tikaConfig != null) {
            parser = new AutoDetectParser(tikaConfig);
        } else {
            parser = new AutoDetectParser();
        }

        ParseContext parseContext = new ParseContext();
        parseContext.set(Parser.class, parser);

        Metadata tikaMetadata;
        if (spec.getInputMetadata() != null) {
            tikaMetadata = spec.getInputMetadata();
        } else {
            tikaMetadata = new Metadata();
        }
        if (spec.getContentTypeHint() != null) {
            tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
        }

        String location = readable.getMetadata().resourceId().toString();
        ContentHandler textHandler = new ToTextContentHandler();

        ParseResult outcome;
        try {
            parser.parse(tikaStream, textHandler, tikaMetadata, parseContext);
            outcome = ParseResult.success(location, textHandler.toString(), tikaMetadata);
        } catch (Exception e) {
            outcome = ParseResult.failure(location, textHandler.toString(), tikaMetadata, e);
        }

        c.output(outcome);
    }
}
Example #25
Source File: ParseContextConfig.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a new {@code ParseContext} holding every class-to-object pair in {@code entries}.
 *
 * @return a freshly populated parse context
 */
@SuppressWarnings({"rawtypes", "unchecked"})
public ParseContext create() {
    final ParseContext populated = new ParseContext();
    // The raw Class cast is needed because the map's key type is the wildcard Class<?>.
    entries.forEach((type, instance) -> populated.set((Class) type, instance));
    return populated;
}
Example #26
Source File: TikaCallable.java From flink-crawler with Apache License 2.0 | 5 votes |
/**
 * Creates a callable that holds everything needed for one Tika parse.
 *
 * @param parser           parser to run
 * @param contentExtractor content extractor to store
 * @param linkExtractor    link extractor to store
 * @param input            stream to parse
 * @param metadata         metadata for the parse
 * @param extractLanguage  language-extraction flag to store
 * @param parseContext     parse context for the parse
 */
public TikaCallable(Parser parser, BaseContentExtractor contentExtractor,
        BaseLinkExtractor linkExtractor, InputStream input, Metadata metadata,
        boolean extractLanguage, ParseContext parseContext) {
    // Parsing inputs.
    this._parser = parser;
    this._input = input;
    this._metadata = metadata;
    this._parseContext = parseContext;

    // Extraction collaborators and options.
    this._contentExtractor = contentExtractor;
    this._linkExtractor = linkExtractor;
    this._extractLanguage = extractLanguage;
}
Example #27
Source File: WidgetMacroLibraryTests.java From scipio-erp with Apache License 2.0 | 5 votes |
public void testFopMacroLibrary() throws Exception { String screentextUrl = screenUrl.concat("Fop"); HttpClient http = initHttpClient(); http.setUrl(screentextUrl.concat(authentificationQuery)); //FIXME need to check if the stream is an application-pdf that don't contains ftl stack trace InputStream screenInputStream = (InputStream) http.postStream(); assertNotNull("Response failed from ofbiz", screenInputStream); assertEquals("Response contentType isn't good : " + http.getResponseContentType(), "application/pdf;charset=UTF-8", http.getResponseContentType()); String screenOutString = ""; try { BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); Metadata metadata = new Metadata(); new PDFParser().parse(screenInputStream, handler, metadata, new ParseContext()); screenOutString = handler.toString(); } finally { screenInputStream.close(); } //Test if a ftl macro error is present assertFalse("Fop Screen contains Macro on error : see " + screentextUrl + " for more detail", screenOutString.contains("FreeMarker template error:")); }
Example #28
Source File: TesseractOCRParser.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Override public Set<MediaType> getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG); if (hasTesseract(config)) return SUPPORTED_TYPES; // Otherwise don't advertise anything, so the other image parsers // can be selected instead return Collections.emptySet(); }
Example #29
Source File: PDFPreprocessorParser.java From CogStack-Pipeline with Apache License 2.0 | 5 votes |
@Override public Set<MediaType> getSupportedTypes(ParseContext context) { // If ImageMagick is installed, offer our supported image types ImageMagickConfig imconfig = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG); if (hasImageMagick(imconfig)) { return SUPPORTED_TYPES; } // Otherwise don't advertise anything, so the other parsers // can be selected instead return Collections.emptySet(); }
Example #30
Source File: EmbeddedDocumentMemoryExtractor.java From extract with MIT License | 5 votes |
/**
 * Re-parses the root document to retrieve one embedded document identified by its digest.
 *
 * <p>A {@code DigestEmbeddedDocumentExtractor} is registered on a fresh parse context and
 * the root document's file is parsed; the result is whatever that extractor reports via
 * {@code getDocument()}. The file stream is opened in a try-with-resources block so it is
 * closed whether or not parsing succeeds.
 *
 * @param rootDocument           document whose file is parsed
 * @param embeddedDocumentDigest digest passed to the embedded-document extractor
 * @return the document reported by the extractor after parsing
 * @throws SAXException  if the content handler reports a failure
 * @throws TikaException if parsing fails
 * @throws IOException   if the file cannot be opened, read or closed
 */
public TikaDocumentSource extract(final TikaDocument rootDocument, final String embeddedDocumentDigest)
        throws SAXException, TikaException, IOException {
    ParseContext context = new ParseContext();
    ContentHandler handler = new BodyContentHandler(-1);
    context.set(Parser.class, parser);

    DigestEmbeddedDocumentExtractor extractor = new DigestEmbeddedDocumentExtractor(
            rootDocument, embeddedDocumentDigest, context, digester, algorithm);
    context.set(org.apache.tika.extractor.EmbeddedDocumentExtractor.class, extractor);

    try (FileInputStream rootStream = new FileInputStream(rootDocument.getPath().toFile())) {
        parser.parse(rootStream, handler, rootDocument.getMetadata(), context);
    }

    return extractor.getDocument();
}