Java Code Examples for org.apache.pdfbox.pdmodel.PDPage#getResources()

The following examples show how to use org.apache.pdfbox.pdmodel.PDPage#getResources() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PDFRenderer.java    From gcs with Mozilla Public License 2.0 6 votes vote down vote up
private boolean hasBlendMode(PDPage page)
{
    // check the current resources for blend modes
    PDResources resources = page.getResources();
    if (resources == null)
    {
        return false;
    }
    for (COSName name : resources.getExtGStateNames())
    {
        PDExtendedGraphicsState extGState = resources.getExtGState(name);
        if (extGState == null)
        {
            // can happen if key exists but no value 
            // see PDFBOX-3950-23EGDHXSBBYQLKYOKGZUOVYVNE675PRD.pdf
            continue;
        }
        BlendMode blendMode = extGState.getBlendMode();
        if (blendMode != BlendMode.NORMAL)
        {
            return true;
        }
    }
    return false;
}
 
Example 2
Source File: ExtractText.java    From testarea-pdfbox2 with Apache License 2.0 6 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf")    )
    {
        PDDocument document = Loader.loadPDF(resource);

        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}
 
Example 3
Source File: PdfExtractImagesBatchController.java    From MyBox with Apache License 2.0 5 votes vote down vote up
@Override
public int handleCurrentPage() {
    int index = 0;
    try {
        PDPage page = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = page.getResources();
        Iterable<COSName> iterable = pdResources.getXObjectNames();
        if (iterable != null) {
            Iterator<COSName> pageIterator = iterable.iterator();
            while (pageIterator.hasNext()) {
                if (task.isCancelled()) {
                    break;
                }
                COSName cosName = pageIterator.next();
                if (!pdResources.isImageXObject(cosName)) {
                    continue;
                }
                PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
                String namePrefix = FileTools.getFilePrefix(currentParameters.currentSourceFile.getName())
                        + "_page" + currentParameters.currentPage + "_index" + index;
                String suffix = pdxObject.getSuffix();
                File tFile = makeTargetFile(namePrefix, "." + suffix, currentParameters.currentTargetPath);
                ImageFileWriters.writeImageFile(pdxObject.getImage(), suffix, tFile.getAbsolutePath());
                targetFileGenerated(tFile);
                if (isPreview) {
                    break;
                }
                index++;
            }
        }

    } catch (Exception e) {
        logger.error(e.toString());
    }
    return index;
}
 
Example 4
Source File: PdfOcrBatchController.java    From MyBox with Apache License 2.0 5 votes vote down vote up
protected int extractPage() {
    int index = 0;
    try {
        PDPage page = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = page.getResources();
        Iterable<COSName> iterable = pdResources.getXObjectNames();
        if (iterable != null) {
            Iterator<COSName> pageIterator = iterable.iterator();
            while (pageIterator.hasNext()) {
                if (task.isCancelled()) {
                    break;
                }
                COSName cosName = pageIterator.next();
                if (!pdResources.isImageXObject(cosName)) {
                    continue;
                }
                PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
                BufferedImage bufferedImage = pdxObject.getImage();
                if (handleImage(bufferedImage)) {
                    lastImage = bufferedImage;
                    if (isPreview) {
                        break;
                    }
                    index++;
                }
            }
        }

    } catch (Exception e) {
        logger.error(e.toString());
    }
    return index;
}
 
Example 5
Source File: Overlay.java    From gcs with Mozilla Public License 2.0 5 votes vote down vote up
private COSName createOverlayXObject(PDPage page, LayoutPage layoutPage)
{
    PDFormXObject xobjForm = new PDFormXObject(layoutPage.overlayContentStream);
    xobjForm.setResources(new PDResources(layoutPage.overlayResources));
    xobjForm.setFormType(1);
    xobjForm.setBBox(layoutPage.overlayMediaBox.createRetranslatedRectangle());
    AffineTransform at = new AffineTransform();
    switch (layoutPage.overlayRotation)
    {
        case 90:
            at.translate(0, layoutPage.overlayMediaBox.getWidth());
            at.rotate(Math.toRadians(-90));
            break;
        case 180:
            at.translate(layoutPage.overlayMediaBox.getWidth(), layoutPage.overlayMediaBox.getHeight());
            at.rotate(Math.toRadians(-180));
            break;
        case 270:
            at.translate(layoutPage.overlayMediaBox.getHeight(), 0);
            at.rotate(Math.toRadians(-270));
            break;
        default:
            break;
    }
    xobjForm.setMatrix(at);
    PDResources resources = page.getResources();
    return resources.add(xobjForm, "OL");
}
 
Example 6
Source File: ExtractImages.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B9izTHWJQ7xlT2ZoQkJfbGRYcFE">
 * 10948.pdf
 * </a>
 * <p>
 * The only special thing about the two images returned for the sample PDF is that
 * one image is merely a mask used for the other image, and the other image is the
 * actual image used on the PDF page. If one only wants the images immediately used
 * in the page content, one also has to scan the page content.
 * </p>
 */
@Test
public void testExtractPageImageResources10948() throws IOException
{
    try (   InputStream resource = getClass().getResourceAsStream("10948.pdf"))
    {
        PDDocument document = Loader.loadPDF(resource);
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            if (resource != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        File file = new File(RESULT_FOLDER, String.format("10948-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}
 
Example 7
Source File: ExtractImages.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
 * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
 * 10948.pdf
 * </a>, renamed "10948-new.pdf" here to prevent a collision
 * <p>
 * Here the code extracts no image at all because the images are not immediate page
 * resources but wrapped in form xobjects.
 * </p>
 */
@Test
public void testExtractPageImageResources10948New() throws IOException
{
    try (   InputStream resource = getClass().getResourceAsStream("10948-new.pdf"))
    {
        PDDocument document = Loader.loadPDF(resource);
        int page = 1;
        for (PDPage pdPage : document.getPages())
        {
            PDResources resources = pdPage.getResources();
            if (resource != null)
            {
                int index = 0;
                for (COSName cosName : resources.getXObjectNames())
                {
                    PDXObject xobject = resources.getXObject(cosName);
                    if (xobject instanceof PDImageXObject)
                    {
                        PDImageXObject image = (PDImageXObject)xobject;
                        File file = new File(RESULT_FOLDER, String.format("10948-new-%s-%s.%s", page, index, image.getSuffix()));
                        ImageIO.write(image.getImage(), image.getSuffix(), file);
                        index++;
                    }
                }
            }
            page++;
        }
    }
}
 
Example 8
Source File: PdfCompressImagesBatchController.java    From MyBox with Apache License 2.0 4 votes vote down vote up
@Override
public int handleCurrentPage() {
    int count = 0;
    try {
        PDPage sourcePage = doc.getPage(currentParameters.currentPage - 1);  // 0-based
        PDResources pdResources = sourcePage.getResources();
        pdResources.getXObjectNames();
        Iterable<COSName> iterable = pdResources.getXObjectNames();
        if (iterable == null) {
            return 0;
        }
        Iterator<COSName> pageIterator = iterable.iterator();
        while (pageIterator.hasNext()) {
            if (task.isCancelled()) {
                break;
            }
            COSName cosName = pageIterator.next();
            if (!pdResources.isImageXObject(cosName)) {
                continue;
            }
            PDImageXObject pdxObject = (PDImageXObject) pdResources.getXObject(cosName);
            BufferedImage sourceImage = pdxObject.getImage();
            PDImageXObject newObject = null;
            if (format == PdfImageFormat.Tiff) {
                ImageBinary imageBinary = new ImageBinary(sourceImage, threshold);
                imageBinary.setIsDithering(ditherCheck.isSelected());
                BufferedImage newImage = imageBinary.operate();
                newImage = ImageBinary.byteBinary(newImage);
                newObject = CCITTFactory.createFromImage(doc, newImage);

            } else if (format == PdfImageFormat.Jpeg) {
                newObject = JPEGFactory.createFromImage(doc, sourceImage, jpegQuality / 100f);
            }
            if (newObject != null) {
                pdResources.put(cosName, newObject);
                count++;
            }
            if (isPreview) {
                break;
            }
        }
        if (copyAllCheck.isSelected()) {
            targetDoc.getPage(currentParameters.currentPage - 1).setResources(pdResources);
        } else {
            targetDoc.addPage(sourcePage);
        }
    } catch (Exception e) {
        logger.error(e.toString());
    }
    return count;

}
 
Example 9
Source File: LayerUtility.java    From gcs with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Imports a page from some PDF file as a Form XObject so it can be placed on another page
 * in the target document.
 * <p>
 * You may want to call {@link #wrapInSaveRestore(PDPage) wrapInSaveRestore(PDPage)} before invoking the Form XObject to
 * make sure that the graphics state is reset.
 * 
 * @param sourceDoc the source PDF document that contains the page to be copied
 * @param page the page in the source PDF document to be copied
 * @return a Form XObject containing the original page's content
 * @throws IOException if an I/O error occurs
 */
public PDFormXObject importPageAsForm(PDDocument sourceDoc, PDPage page) throws IOException
{
    importOcProperties(sourceDoc);

    PDStream newStream = new PDStream(targetDoc, page.getContents(), COSName.FLATE_DECODE);
    PDFormXObject form = new PDFormXObject(newStream);

    //Copy resources
    PDResources pageRes = page.getResources();
    PDResources formRes = new PDResources();
    cloner.cloneMerge(pageRes, formRes);
    form.setResources(formRes);

    //Transfer some values from page to form
    transferDict(page.getCOSObject(), form.getCOSObject(), PAGE_TO_FORM_FILTER, true);

    Matrix matrix = form.getMatrix();
    AffineTransform at = matrix.createAffineTransform();
    PDRectangle mediaBox = page.getMediaBox();
    PDRectangle cropBox = page.getCropBox();
    PDRectangle viewBox = (cropBox != null ? cropBox : mediaBox);

    //Handle the /Rotation entry on the page dict
    int rotation = page.getRotation();

    //Transform to FOP's user space
    //at.scale(1 / viewBox.getWidth(), 1 / viewBox.getHeight());
    at.translate(mediaBox.getLowerLeftX() - viewBox.getLowerLeftX(),
            mediaBox.getLowerLeftY() - viewBox.getLowerLeftY());
    switch (rotation)
    {
    case 90:
        at.scale(viewBox.getWidth() / viewBox.getHeight(), viewBox.getHeight() / viewBox.getWidth());
        at.translate(0, viewBox.getWidth());
        at.rotate(-Math.PI / 2.0);
        break;
    case 180:
        at.translate(viewBox.getWidth(), viewBox.getHeight());
        at.rotate(-Math.PI);
        break;
    case 270:
        at.scale(viewBox.getWidth() / viewBox.getHeight(), viewBox.getHeight() / viewBox.getWidth());
        at.translate(viewBox.getHeight(), 0);
        at.rotate(-Math.PI * 1.5);
        break;
    default:
        //no additional transformations necessary
    }
    //Compensate for Crop Boxes not starting at 0,0
    at.translate(-viewBox.getLowerLeftX(), -viewBox.getLowerLeftY());
    if (!at.isIdentity())
    {
        form.setMatrix(at);
    }

    BoundingBox bbox = new BoundingBox();
    bbox.setLowerLeftX(viewBox.getLowerLeftX());
    bbox.setLowerLeftY(viewBox.getLowerLeftY());
    bbox.setUpperRightX(viewBox.getUpperRightX());
    bbox.setUpperRightY(viewBox.getUpperRightY());
    form.setBBox(new PDRectangle(bbox));

    return form;
}
 
Example 10
Source File: RenderType3Character.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
     * <a href="http://stackoverflow.com/questions/42032729/render-type3-font-character-as-image-using-pdfbox">
     * Render Type3 font character as image using PDFBox
     * </a>
     * <br/>
     * <a href="https://drive.google.com/file/d/0B0f6X4SAMh2KRDJTbm4tb3E1a1U/view">
     * 4700198773.pdf
     * </a>
     * from
     * <a href="http://stackoverflow.com/questions/37754112/extract-text-with-custom-font-result-non-readble">
     * extract text with custom font result non readble
     * </a>
     * <p>
     * This test shows how one can render individual Type 3 font glyphs as bitmaps.
     * Unfortunately PDFBox out-of-the-box does not provide a class to render contents
     * of arbitrary XObjects, merely for rendering pages; thus, we simply create a page
     * with the glyph in question and render that page.   
     * </p>
     * <p>
     * As the OP did not provide a sample PDF, we simply use one from another
     * stackoverflow question. There obviously might remain issues with the
     * OP's files.
     * </p>
     */
    @Test
    public void testRender4700198773() throws IOException, NoSuchMethodException, SecurityException, IllegalAccessException, IllegalArgumentException, InvocationTargetException
    {
        Method PDPageContentStreamWrite = PDPageContentStream.class.getSuperclass().getDeclaredMethod("write", String.class);
        PDPageContentStreamWrite.setAccessible(true);

        try (   InputStream resource = getClass().getResourceAsStream("4700198773.pdf"))
        {
            PDDocument document = Loader.loadPDF(resource);

            PDPage page = document.getPage(0);
            PDResources pageResources = page.getResources();
            COSName f1Name = COSName.getPDFName("F1");
            PDType3Font fontF1 = (PDType3Font) pageResources.getFont(f1Name);
            Map<String, Integer> f1NameToCode = fontF1.getEncoding().getNameToCodeMap();

            COSDictionary charProcsDictionary = fontF1.getCharProcs();
            for (COSName key : charProcsDictionary.keySet())
            {
                COSStream stream = (COSStream) charProcsDictionary.getDictionaryObject(key);
                PDType3CharProc charProc = new PDType3CharProc(fontF1, stream);
                PDRectangle bbox = charProc.getGlyphBBox();
                if (bbox == null)
                    bbox = charProc.getBBox();
                Integer code = f1NameToCode.get(key.getName());

                if (code != null)
                {
                    PDDocument charDocument = new PDDocument();
                    PDPage charPage = new PDPage(bbox);
                    charDocument.addPage(charPage);
                    charPage.setResources(pageResources);
                    PDPageContentStream charContentStream = new PDPageContentStream(charDocument, charPage);
                    charContentStream.beginText();
                    charContentStream.setFont(fontF1, bbox.getHeight());
//                    charContentStream.write(String.format("<%2X> Tj\n", code).getBytes());
                    PDPageContentStreamWrite.invoke(charContentStream, String.format("<%2X> Tj\n", code));
                    charContentStream.endText();
                    charContentStream.close();

                    File result = new File(RESULT_FOLDER, String.format("4700198773-%s-%s.png", key.getName(), code));
                    PDFRenderer renderer = new PDFRenderer(charDocument);
                    BufferedImage image = renderer.renderImageWithDPI(0, 96);
                    ImageIO.write(image, "PNG", result);
                    charDocument.save(new File(RESULT_FOLDER, String.format("4700198773-%s-%s.pdf", key.getName(), code)));
                    charDocument.close();
                }
            }
        }
    }
 
Example 11
Source File: RenderType3Character.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/42032729/render-type3-font-character-as-image-using-pdfbox">
 * Render Type3 font character as image using PDFBox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B0f6X4SAMh2KRDJTbm4tb3E1a1U/view">
 * 4700198773.pdf
 * </a>
 * from
 * <a href="http://stackoverflow.com/questions/37754112/extract-text-with-custom-font-result-non-readble">
 * extract text with custom font result non readble
 * </a>
 * <p>
 * This test shows how one can render individual Type 3 font glyphs as bitmaps.
 * Unfortunately PDFBox out-of-the-box does not provide a class to render contents
 * of arbitrary XObjects, merely for rendering pages; thus, we simply create a page
 * with the glyph in question and render that page.   
 * </p>
 * <p>
 * As the OP did not provide a sample PDF, we simply use one from another
 * stackoverflow question. There obviously might remain issues with the
 * OP's files.
 * </p>
 */
@Test
public void testRenderSdnList() throws IOException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException
{
    Method PDPageContentStreamWrite = PDPageContentStream.class.getSuperclass().getDeclaredMethod("write", String.class);
    PDPageContentStreamWrite.setAccessible(true);

    try (   InputStream resource = getClass().getResourceAsStream("sdnlist.pdf"))
    {
        PDDocument document = Loader.loadPDF(resource);

        PDPage page = document.getPage(1);
        PDResources pageResources = page.getResources();
        COSName f1Name = COSName.getPDFName("R144");
        PDType3Font fontF1 = (PDType3Font) pageResources.getFont(f1Name);
        Map<String, Integer> f1NameToCode = fontF1.getEncoding().getNameToCodeMap();

        COSDictionary charProcsDictionary = fontF1.getCharProcs();
        for (COSName key : charProcsDictionary.keySet())
        {
            COSStream stream = (COSStream) charProcsDictionary.getDictionaryObject(key);
            PDType3CharProc charProc = new PDType3CharProc(fontF1, stream);
            PDRectangle bbox = charProc.getGlyphBBox();
            if (bbox == null)
                bbox = charProc.getBBox();
            Integer code = f1NameToCode.get(key.getName());

            if (code != null)
            {
                PDDocument charDocument = new PDDocument();
                PDPage charPage = new PDPage(bbox);
                charDocument.addPage(charPage);
                charPage.setResources(pageResources);
                PDPageContentStream charContentStream = new PDPageContentStream(charDocument, charPage);
                charContentStream.beginText();
                charContentStream.setFont(fontF1, bbox.getHeight());
                //charContentStream.getOutputStream().write(String.format("<%2X> Tj\n", code).getBytes());
                PDPageContentStreamWrite.invoke(charContentStream, String.format("<%2X> Tj\n", code));
                charContentStream.endText();
                charContentStream.close();

                File result = new File(RESULT_FOLDER, String.format("sdnlist-%s-%s.png", key.getName(), code));
                PDFRenderer renderer = new PDFRenderer(charDocument);
                BufferedImage image = renderer.renderImageWithDPI(0, 96);
                ImageIO.write(image, "PNG", result);
                charDocument.save(new File(RESULT_FOLDER, String.format("sdnlist-%s-%s.pdf", key.getName(), code)));
                charDocument.close();
            }
        }
    }
}