Java Code Examples for org.apache.pdfbox.pdmodel.PDPageTree#get()

The following examples show how to use org.apache.pdfbox.pdmodel.PDPageTree#get() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PdfTools.java    From MyBox with Apache License 2.0 6 votes vote down vote up
public static List<PDImageXObject> getImageListFromPDF(PDDocument document,
        Integer startPage) throws Exception {
    List<PDImageXObject> imageList = new ArrayList<>();
    if (null != document) {
        PDPageTree pages = document.getPages();
        startPage = startPage == null ? 0 : startPage;
        int len = pages.getCount();
        if (startPage < len) {
            for (int i = startPage; i < len; ++i) {
                PDPage page = pages.get(i);
                Iterable<COSName> objectNames = page.getResources().getXObjectNames();
                for (COSName imageObjectName : objectNames) {
                    if (page.getResources().isImageXObject(imageObjectName)) {
                        imageList.add((PDImageXObject) page.getResources().getXObject(imageObjectName));
                    }
                }
            }
        }
    }
    return imageList;
}
 
Example 2
Source File: PdfContentImagePreprocessor.java    From tika-server with Apache License 2.0 5 votes vote down vote up
private void removeImagesAlphaChannelUnsafe() {
    try {
        PDPageTree allPages = document.getDocumentCatalog().getPages();
        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            processImagesFromResources(page.getResources());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example 3
Source File: PdfContentTypeChecker.java    From tika-server with Apache License 2.0 5 votes vote down vote up
private void calculateObjectsInDocument(PDDocument document) throws IOException {
    this.pdfTextStripper = new PDFTextStripper();

    try {
        PDPageTree allPages = document.getDocumentCatalog().getPages();
        this.pageCount = allPages.getCount();
        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            readObjectsOnPage(page);
            calculateTextLengthOnPage(document, i + 1);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example 4
Source File: RemoveStrikeoutComment.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/45812696/pdfbox-delete-comment-maintain-strikethrough">
 * PDFBox delete comment maintain strikethrough
 * </a>
 * <br/>
 * <a href="https://expirebox.com/files/3d955e6df4ca5874c38dbf92fc43b5af.pdf">
 * only_fields.pdf
 * </a>
 * <a href="https://file.io/DTvqhC">
 * (alternative download)
 * </a>
 * <p>
 * Due to a bug in the <code>COSArrayList</code> usage for page annotations,
 * the indirect reference to the annotation in question is not removed from
 * the actual page annotations array.
 * </p>
 */
@Test
public void testRemoveLikeStephan() throws IOException {
    try (InputStream resource = getClass().getResourceAsStream("only_fields.pdf")) {
        PDDocument document = Loader.loadPDF(resource);
        List<PDAnnotation> annotations = new ArrayList<>();
        PDPageTree allPages = document.getDocumentCatalog().getPages();

        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            annotations = page.getAnnotations();

            List<PDAnnotation> annotationToRemove = new ArrayList<PDAnnotation>();

            if (annotations.size() < 1)
                continue;
            else {
                for (PDAnnotation annotation : annotations) {

                    if (annotation.getContents() != null
                            && annotation.getContents().equals("Sample Strikethrough")) {
                        annotationToRemove.add(annotation);
                    }
                }
                annotations.removeAll(annotationToRemove);
            }
        }

        document.save(new File(RESULT_FOLDER, "only_fields-removeLikeStephan.pdf"));
    }
}
 
Example 5
Source File: RemoveStrikeoutComment.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/45812696/pdfbox-delete-comment-maintain-strikethrough">
 * PDFBox delete comment maintain strikethrough
 * </a>
 * <br/>
 * <a href="https://expirebox.com/files/3d955e6df4ca5874c38dbf92fc43b5af.pdf">
 * only_fields.pdf
 * </a>
 * <a href="https://file.io/DTvqhC">
 * (alternative download)
 * </a>
 * <p>
 * The OP only wanted the comment removed, not the strike-through. Thus, we must
 * not remove the annotation but merely the comment building attributes.
 * </p>
 */
@Test
public void testRemoveLikeStephanImproved() throws IOException {
    final COSName POPUP = COSName.getPDFName("Popup");
    try (InputStream resource = getClass().getResourceAsStream("only_fields.pdf")) {
        PDDocument document = Loader.loadPDF(resource);
        List<PDAnnotation> annotations = new ArrayList<>();
        PDPageTree allPages = document.getDocumentCatalog().getPages();

        List<COSObjectable> objectsToRemove = new ArrayList<>();

        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            annotations = page.getAnnotations();

            for (PDAnnotation annotation : annotations) {
                if ("StrikeOut".equals(annotation.getSubtype()))
                {
                    COSDictionary annotationDict = annotation.getCOSObject();
                    COSBase popup = annotationDict.getItem(POPUP);
                    annotationDict.removeItem(POPUP);
                    annotationDict.removeItem(COSName.CONTENTS); // plain text comment
                    annotationDict.removeItem(COSName.RC);       // rich text comment
                    annotationDict.removeItem(COSName.T);        // author

                    if (popup != null)
                        objectsToRemove.add(popup);
                }
            }

            annotations.removeAll(objectsToRemove);
        }

        document.save(new File(RESULT_FOLDER, "only_fields-removeImproved.pdf"));
    }
}