Java Code Examples for org.apache.pdfbox.pdmodel.PDResources#getXObjectNames()

The following examples show how to use org.apache.pdfbox.pdmodel.PDResources#getXObjectNames(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: PdfContentImagePreprocessor.java    From tika-server with Apache License 2.0 7 votes vote down vote up
/**
 * Walks the XObjects of {@code resources}, descending into form XObjects, and replaces
 * every image whose color model reports an alpha channel with the object returned by
 * {@code makeImageObjectCopy}. Sets {@code imagesWereChanged} whenever a replacement
 * is stored.
 *
 * @param resources resource dictionary to scan; {@code null} is treated as empty
 * @throws IOException if an XObject or its image data cannot be read
 */
private void processImagesFromResources(PDResources resources) throws IOException {
    if (resources == null) {
        // A form XObject may have no resource dictionary; nothing to scan then.
        return;
    }

    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);

        if (xObject instanceof PDFormXObject) {
            processImagesFromResources(((PDFormXObject) xObject).getResources());
        } else if (xObject instanceof PDImageXObject) {
            PDImageXObject img = (PDImageXObject) xObject;
            if (!img.getImage().getColorModel().hasAlpha()) {
                // Skip only this image; the remaining XObjects must still be examined.
                continue;
            }

            PDImageXObject cpy = makeImageObjectCopy(img);
            resources.put(xObjectName, cpy);
            imagesWereChanged = true;
        }
    }
}
 
Example 2
Source File: PdfFontExtractor.java    From FontVerter with GNU Lesser General Public License v3.0 6 votes vote down vote up
/**
 * Passes every font listed in {@code resources} to {@code extractStrategy}, then
 * repeats the process for each form XObject that carries its own resources.
 *
 * @param resources resource dictionary whose fonts and form XObjects are visited
 * @throws IOException if a font or XObject cannot be loaded
 */
private void extractFontResources(PDResources resources) throws IOException {
    for (COSName fontName : resources.getFontNames()) {
        extractStrategy.extract(resources.getFont(fontName));
    }

    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject candidate = resources.getXObject(xObjectName);
        if (!(candidate instanceof PDFormXObject)) {
            continue;
        }

        PDResources nested = ((PDFormXObject) candidate).getResources();
        if (nested != null) {
            extractFontResources(nested);
        }
    }
}
 
Example 3
Source File: PdfContentTypeChecker.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Counts the image XObjects reachable from {@code resources}, including those nested
 * in form XObjects, by incrementing {@code imagesCount} once per image.
 *
 * @param resources resource dictionary to scan; {@code null} is treated as empty
 * @throws IOException if an XObject cannot be loaded
 */
private void getImagesFromResources(PDResources resources) throws IOException {
    if (resources == null) {
        // A form XObject may have no resource dictionary; nothing to count then.
        return;
    }

    for (COSName xObjectName : resources.getXObjectNames()) {
        PDXObject xObject = resources.getXObject(xObjectName);

        if (xObject instanceof PDFormXObject) {
            getImagesFromResources(((PDFormXObject) xObject).getResources());
        } else if (xObject instanceof PDImageXObject) {
            imagesCount++;
        }
    }
}
 
Example 4
Source File: PdfExtractImagesBatchController.java    From MyBox with Apache License 2.0 5 votes vote down vote up
/**
 * Writes each image XObject of the current page to its own target file.
 * Stops early when the task is cancelled or, in preview mode, after the first image.
 * Any exception is logged and ends the page.
 *
 * @return the value of the running image index when processing stopped
 */
@Override
public int handleCurrentPage() {
    int index = 0;
    try {
        PDPage page = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pageResources = page.getResources();
        Iterable<COSName> xObjectNames = pageResources.getXObjectNames();
        if (xObjectNames == null) {
            return index;
        }

        // Cancellation is checked after hasNext() and before next(), once per element.
        for (Iterator<COSName> names = xObjectNames.iterator();
                names.hasNext() && !task.isCancelled();) {
            COSName xObjectName = names.next();
            if (pageResources.isImageXObject(xObjectName)) {
                PDImageXObject image = (PDImageXObject) pageResources.getXObject(xObjectName);
                String sourcePrefix = FileTools.getFilePrefix(currentParameters.currentSourceFile.getName());
                String namePrefix = sourcePrefix + "_page" + currentParameters.currentPage + "_index" + index;
                String suffix = image.getSuffix();
                File target = makeTargetFile(namePrefix, "." + suffix, currentParameters.currentTargetPath);
                ImageFileWriters.writeImageFile(image.getImage(), suffix, target.getAbsolutePath());
                targetFileGenerated(target);
                if (isPreview) {
                    // Preview needs a single image only; the index is left untouched.
                    break;
                }
                index++;
            }
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return index;
}
 
Example 5
Source File: PdfOcrBatchController.java    From MyBox with Apache License 2.0 5 votes vote down vote up
/**
 * Passes each image XObject of the current page to {@code handleImage} and remembers
 * the last image that was handled successfully in {@code lastImage}.
 * Stops early when the task is cancelled or, in preview mode, after the first
 * successfully handled image. Any exception is logged and ends the page.
 *
 * @return the number of successfully handled images counted before processing stopped
 */
protected int extractPage() {
    int index = 0;
    try {
        PDPage page = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pageResources = page.getResources();
        Iterable<COSName> xObjectNames = pageResources.getXObjectNames();
        Iterator<COSName> cursor = (xObjectNames == null) ? null : xObjectNames.iterator();

        while (cursor != null && cursor.hasNext() && !task.isCancelled()) {
            COSName xObjectName = cursor.next();
            if (!pageResources.isImageXObject(xObjectName)) {
                continue;
            }

            PDImageXObject imageObject = (PDImageXObject) pageResources.getXObject(xObjectName);
            BufferedImage image = imageObject.getImage();
            if (!handleImage(image)) {
                continue;
            }

            lastImage = image;
            if (isPreview) {
                // Preview needs a single image only; the index is left untouched.
                break;
            }
            index++;
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return index;
}
 
Example 6
Source File: ExtractText.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * Removes the {@code ToUnicode} entry from every font dictionary in the given
 * resources and, recursively, in the resources of all form XObjects they reference.
 *
 * @param pdResources resources to process; {@code null} is treated as empty
 * @throws IOException if an XObject cannot be loaded
 */
void removeToUnicodeMaps(PDResources pdResources) throws IOException
{
    if (pdResources == null)
    {
        // A form XObject may have no resource dictionary; nothing to strip then.
        return;
    }

    COSDictionary resources = pdResources.getCOSObject();

    // NOTE(review): asDictionary is defined elsewhere; presumably it yields the Font
    // sub-dictionary or null when there is none - confirm against the helper.
    COSDictionary fonts = asDictionary(resources, COSName.FONT);
    if (fonts != null)
    {
        for (COSBase object : fonts.getValues())
        {
            // Unwrap indirect references until the underlying object is reached.
            while (object instanceof COSObject)
                object = ((COSObject)object).getObject();
            if (object instanceof COSDictionary)
            {
                COSDictionary font = (COSDictionary)object;
                font.removeItem(COSName.TO_UNICODE);
            }
        }
    }

    for (COSName name : pdResources.getXObjectNames())
    {
        PDXObject xobject = pdResources.getXObject(name);
        if (xobject instanceof PDFormXObject)
        {
            PDResources xobjectPdResources = ((PDFormXObject)xobject).getResources();
            removeToUnicodeMaps(xobjectPdResources);
        }
    }
}
 
Example 7
Source File: ExtractImages.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B9izTHWJQ7xlT2ZoQkJfbGRYcFE">
 * 10948.pdf
 * </a>
 * <p>
 * The only special thing about the two images returned for the sample PDF is that
 * one image is merely a mask used for the other image, and the other image is the
 * actual image used on the PDF page. If one only wants the images immediately used
 * in the page content, one also has to scan the page content.
 * </p>
 * <p>
 * The test writes every image XObject found directly in a page's resources to
 * {@code RESULT_FOLDER}; pages without resources are skipped.
 * </p>
 */
@Test
public void testExtractPageImageResources10948() throws IOException
{
    // The document is opened in the try header so that it is closed as well as the stream.
    try (   InputStream resource = getClass().getResourceAsStream("10948.pdf");
            PDDocument document = Loader.loadPDF(resource))
    {
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            // Check the page resources here, not the input stream.
            if (resources != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        File file = new File(RESULT_FOLDER, String.format("10948-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}
 
Example 8
Source File: ExtractImages.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
 * 10948.pdf
 * </a>, renamed "10948-new.pdf" here to prevent a collision
 * <p>
 * Here the code extracts no image at all because the images are not immediate page
 * resources but wrapped in form xobjects.
 * </p>
 * <p>
 * The test writes every image XObject found directly in a page's resources to
 * {@code RESULT_FOLDER}; pages without resources are skipped.
 * </p>
 */
@Test
public void testExtractPageImageResources10948New() throws IOException
{
    // The document is opened in the try header so that it is closed as well as the stream.
    try (   InputStream resource = getClass().getResourceAsStream("10948-new.pdf");
            PDDocument document = Loader.loadPDF(resource))
    {
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            // Check the page resources here, not the input stream.
            if (resources != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        File file = new File(RESULT_FOLDER, String.format("10948-new-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}
 
Example 9
Source File: PDFBoxTree.java    From Pdf2Dom with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Registers the supported fonts of {@code resources} in {@code table} and then does
 * the same for every form XObject whose resources differ from {@code resources}.
 * TrueType fonts, Type 0 fonts with a CID font type 2 descendant, and Type 1C fonts
 * are added; every other font is reported with a warning and skipped.
 *
 * @param resources resource dictionary whose fonts and form XObjects are visited
 * @param table     font table receiving the supported fonts
 * @throws IOException if a font or XObject cannot be loaded
 */
private void processFontResources(PDResources resources, FontTable table) throws IOException
{
    String fontNotSupportedMessage = "Font: {} skipped because type '{}' is not supported.";

    for (COSName fontKey : resources.getFontNames())
    {
        PDFont font = resources.getFont(fontKey);

        if (font instanceof PDTrueTypeFont)
        {
            table.addEntry(font);
            log.debug("Font: " + font.getName() + " TTF");
            continue;
        }

        boolean supported;
        if (font instanceof PDType0Font)
            supported = ((PDType0Font) font).getDescendantFont() instanceof PDCIDFontType2;
        else
            supported = font instanceof PDType1CFont;

        if (supported)
            table.addEntry(font);
        else
            log.warn(fontNotSupportedMessage, font.getName(), font.getClass().getSimpleName());
    }

    for (COSName xObjectName : resources.getXObjectNames())
    {
        PDXObject candidate = resources.getXObject(xObjectName);
        if (!(candidate instanceof PDFormXObject))
            continue;

        PDResources nested = ((PDFormXObject) candidate).getResources();
        if (nested == null || nested == resources)
            continue;
        // Do not recurse into a form that shares this very resource dictionary.
        if (nested.getCOSObject() == resources.getCOSObject())
            continue;

        processFontResources(nested, table);
    }

}
 
Example 10
Source File: PdfCompressImagesBatchController.java    From MyBox with Apache License 2.0 4 votes vote down vote up
/**
 * Re-encodes the image XObjects of the current source page and stores the new objects
 * back into the page resources under the same names. Images are converted with
 * {@code CCITTFactory} after binarization when the format is Tiff, or with
 * {@code JPEGFactory} when it is Jpeg; other formats leave the image untouched.
 * Afterwards the resources are either set on the matching target page
 * (when {@code copyAllCheck} is selected) or the source page is appended to the
 * target document.
 * Stops early when the task is cancelled or, in preview mode, after the first image.
 * Any exception is logged and ends the page.
 *
 * @return the number of images that were replaced
 */
@Override
public int handleCurrentPage() {
    int count = 0;
    try {
        PDPage sourcePage = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = sourcePage.getResources();
        Iterable<COSName> xObjectNames = pdResources.getXObjectNames();
        if (xObjectNames == null) {
            return 0;
        }
        for (COSName cosName : xObjectNames) {
            if (task.isCancelled()) {
                break;
            }
            if (!pdResources.isImageXObject(cosName)) {
                continue;
            }
            PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
            BufferedImage sourceImage = pdxObject.getImage();
            PDImageXObject newObject = null;
            if (format == PdfImageFormat.Tiff) {
                ImageBinary imageBinary = new ImageBinary(sourceImage, threshold);
                imageBinary.setIsDithering(ditherCheck.isSelected());
                BufferedImage newImage = imageBinary.operate();
                newImage = ImageBinary.byteBinary(newImage);
                newObject = CCITTFactory.createFromImage(doc, newImage);
            } else if (format == PdfImageFormat.Jpeg) {
                newObject = JPEGFactory.createFromImage(doc, sourceImage, jpegQuality / 100f);
            }
            if (newObject != null) {
                // Replaces the entry under the existing name, so the key set is unchanged.
                pdResources.put(cosName, newObject);
                count++;
            }
            if (isPreview) {
                break;
            }
        }
        if (copyAllCheck.isSelected()) {
            targetDoc.getPage(currentParameters.currentPage - 1).setResources(pdResources);
        } else {
            targetDoc.addPage(sourcePage);
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return count;
}