org.apache.pdfbox.pdmodel.PDPageTree Java Examples

The following examples show how to use org.apache.pdfbox.pdmodel.PDPageTree. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PdfTools.java    From MyBox with Apache License 2.0 6 votes vote down vote up
/**
 * Collects the image XObjects referenced by the page resources of a document.
 *
 * @param document  the document to scan; {@code null} yields an empty list
 * @param startPage 0-based index of the first page to scan; {@code null} means
 *                  start at the first page. An index at or beyond the page count
 *                  yields an empty list.
 * @return the image XObjects found, in page order; never {@code null}
 * @throws Exception if a page or one of its XObjects cannot be read
 */
public static List<PDImageXObject> getImageListFromPDF(PDDocument document,
        Integer startPage) throws Exception {
    List<PDImageXObject> imageList = new ArrayList<>();
    if (document == null) {
        return imageList;
    }
    PDPageTree pages = document.getPages();
    // Work on a local copy instead of reassigning the (boxed) parameter.
    int firstPage = startPage == null ? 0 : startPage;
    int len = pages.getCount();
    for (int i = firstPage; i < len; ++i) {
        // Look the resources up once per page; a page without a resource
        // dictionary yields null and simply has no images to contribute.
        org.apache.pdfbox.pdmodel.PDResources resources = pages.get(i).getResources();
        if (resources == null) {
            continue;
        }
        for (COSName imageObjectName : resources.getXObjectNames()) {
            if (resources.isImageXObject(imageObjectName)) {
                imageList.add((PDImageXObject) resources.getXObject(imageObjectName));
            }
        }
    }
    return imageList;
}
 
Example #2
Source File: PDPageDestination.java    From gcs with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Determines the page number of this destination, whether the first array entry holds a
 * plain number or a reference to a page dictionary.
 *
 * @since Apache PDFBox 1.0.0
 * @see org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
 * @return the page number, or -1 if the array is empty or its first entry is neither a number
 * nor a dictionary. A number stored in the array is returned as-is (0-based, for remote
 * destinations); a number computed from a page reference is 1-based (for local destinations).
 * @deprecated This method has inconsistent behavior (see returns), use {@link #retrievePageNumber()} instead.
 */
@Deprecated
public int findPageNumber()
{
    if (array.size() <= 0)
    {
        return -1;
    }

    COSBase first = array.getObject(0);
    if (first instanceof COSNumber)
    {
        return ((COSNumber) first).intValue();
    }
    if (!(first instanceof COSDictionary))
    {
        return -1;
    }

    // climb the parent chain until a node without a parent is reached: the root pages node
    COSDictionary pageDict = (COSDictionary) first;
    COSDictionary root = pageDict;
    COSBase ancestor = root.getDictionaryObject(COSName.PARENT, COSName.P);
    while (ancestor != null)
    {
        root = (COSDictionary) ancestor;
        ancestor = root.getDictionaryObject(COSName.PARENT, COSName.P);
    }

    PDPageTree pages = new PDPageTree(root);
    return pages.indexOf(new PDPage(pageDict)) + 1;
}
 
Example #3
Source File: PdfScreenshotUtils.java    From dss with GNU Lesser General Public License v2.1 6 votes vote down vote up
/**
 * Asserts that two documents have the same number of pages and that every pair of
 * rendered pages reaches at least the given similarity.
 *
 * @param document1     the sample document
 * @param document2     the document to check against the sample
 * @param minSimilarity the lowest similarity value accepted for any page
 * @throws IOException if a page cannot be rendered
 */
public static void checkPdfSimilarity(PDDocument document1, PDDocument document2, float minSimilarity) throws IOException {
	final PDPageTree expectedPages = document1.getPages();
	final PDPageTree actualPages = document2.getPages();

	assertEquals(actualPages.getCount(), expectedPages.getCount());

	final PDFRenderer expectedRenderer = new PDFRenderer(document1);
	final PDFRenderer actualRenderer = new PDFRenderer(document2);

	int pageIndex = 0;
	while (pageIndex < actualPages.getCount()) {
		final BufferedImage expectedImage = expectedRenderer.renderImageWithDPI(pageIndex, DPI);
		final BufferedImage actualImage = actualRenderer.renderImageWithDPI(pageIndex, DPI);

		// Debugging aid: dump both renderings to disk.
		// ImageIO.write(expectedImage, "png", new File("target\\sampleImage.png"));
		// ImageIO.write(actualImage, "png", new File("target\\checkImage.png"));

		final float similarity = checkImageSimilarity(expectedImage, actualImage, CHECK_RESOLUTION);
		final String failureMessage = "The image similarity " + similarity + " is lower the allowed limit " + minSimilarity;
		assertTrue(similarity >= minSimilarity, failureMessage);
		pageIndex++;
	}
}
 
Example #4
Source File: PdfContentImagePreprocessor.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Passes the resources of every page in the document to
 * {@code processImagesFromResources}. Any exception raised on the way is
 * printed to standard error and otherwise ignored.
 */
private void removeImagesAlphaChannelUnsafe() {
    try {
        final PDPageTree pageTree = document.getDocumentCatalog().getPages();
        int pageIndex = 0;
        while (pageIndex < pageTree.getCount()) {
            final PDPage current = pageTree.get(pageIndex);
            processImagesFromResources(current.getResources());
            pageIndex++;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #5
Source File: PdfContentTypeChecker.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a fresh text stripper, records the page count and then, for every
 * page, reads its objects and calculates its text length. Any exception raised
 * while walking the pages is printed to standard error and otherwise ignored.
 *
 * @param document the document to inspect
 * @throws IOException if the text stripper cannot be created
 */
private void calculateObjectsInDocument(PDDocument document) throws IOException {
    this.pdfTextStripper = new PDFTextStripper();

    try {
        final PDPageTree pageTree = document.getDocumentCatalog().getPages();
        this.pageCount = pageTree.getCount();
        int pageIndex = 0;
        while (pageIndex < pageTree.getCount()) {
            readObjectsOnPage(pageTree.get(pageIndex));
            // the text-length helper is handed a 1-based page number
            final int pageNumber = pageIndex + 1;
            calculateTextLengthOnPage(document, pageNumber);
            pageIndex++;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
 
Example #6
Source File: DashboardUtil.java    From Insights with Apache License 2.0 5 votes vote down vote up
/**
 * Appends the report title and, when present, the variables selected in
 * Grafana by the user to every page of the document.
 * <p>
 * Failures are logged and not propagated; the document is returned either way.
 *
 * @param doc       the document whose pages are written to
 * @param title     text appended after the fixed "OneDevOps Insights" prefix
 * @param variables the user-selected variables; {@code null} or empty means
 *                  no variables section is written
 * @return the same {@code doc} instance
 * @throws IOException declared for callers; errors inside are caught and logged
 */
private PDDocument footer(PDDocument doc, String title, String variables) throws IOException {
	// Null must be tested before calling a method on the value.
	final boolean hasVariables = variables != null && !variables.equals("");
	try{
		PDPageTree pages = doc.getPages();
		for(PDPage p : pages){
			// try-with-resources closes the stream even when writing fails midway
			try (PDPageContentStream contentStream = new PDPageContentStream(doc, p, AppendMode.APPEND, false)) {
				contentStream.beginText();
				contentStream.newLineAtOffset(220, 780);
				contentStream.setFont(PDType1Font.HELVETICA, 11);
				contentStream.showText("OneDevOps Insights – "+title);
				contentStream.endText();
				if(hasVariables){
					contentStream.beginText();
					contentStream.newLineAtOffset(2, 17);
					contentStream.setFont(PDType1Font.HELVETICA, 9);
					contentStream.showText("This Report is generated based on the user selected values as below.");
					contentStream.endText();
					contentStream.beginText();
					contentStream.newLineAtOffset(2, 5);
					contentStream.setFont(PDType1Font.HELVETICA, 7);
					contentStream.showText(variables);
					contentStream.endText();
				}
			}
		}
	}catch(Exception e){
		Log.error("Error, Failed in Footer.. ", e.getMessage());
	}
	return doc;
}
 
Example #7
Source File: DetermineWidgetPage.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * Searches the pages in order for an annotation whose COS dictionary equals
 * that of the given widget.
 *
 * @param document the document whose pages are searched
 * @param widget   the widget annotation to locate
 * @return the 0-based index of the first page holding the widget, or -1 if none does
 * @throws IOException if the annotations of a page cannot be read
 */
int determineSafe(PDDocument document, PDAnnotationWidget widget) throws IOException
{
    final COSDictionary target = widget.getCOSObject();
    final PDPageTree pageTree = document.getPages();
    int pageIndex = 0;
    while (pageIndex < pageTree.getCount())
    {
        for (PDAnnotation candidate : pageTree.get(pageIndex).getAnnotations())
        {
            if (candidate.getCOSObject().equals(target))
            {
                return pageIndex;
            }
        }
        pageIndex++;
    }
    return -1;
}
 
Example #8
Source File: RemoveStrikeoutComment.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/45812696/pdfbox-delete-comment-maintain-strikethrough">
 * PDFBox delete comment maintain strikethrough
 * </a>
 * <br/>
 * <a href="https://expirebox.com/files/3d955e6df4ca5874c38dbf92fc43b5af.pdf">
 * only_fields.pdf
 * </a>
 * <a href="https://file.io/DTvqhC">
 * (alternative download)
 * </a>
 * <p>
 * Due to a bug in the <code>COSArrayList</code> usage for page annotations,
 * the indirect reference to the annotation in question is not removed from
 * the actual page annotations array.
 * </p>
 * <p>
 * The removal deliberately goes through <code>removeAll</code> on the list
 * returned by the page, because that is the call whose behavior this test
 * demonstrates.
 * </p>
 */
@Test
public void testRemoveLikeStephan() throws IOException {
    // Both the input stream and the document are closed when the block exits.
    try (InputStream resource = getClass().getResourceAsStream("only_fields.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        PDPageTree allPages = document.getDocumentCatalog().getPages();

        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            List<PDAnnotation> annotations = page.getAnnotations();
            if (annotations.size() < 1) {
                continue;
            }

            // Collect first, remove afterwards, so the list is not modified while iterating.
            List<PDAnnotation> annotationToRemove = new ArrayList<PDAnnotation>();
            for (PDAnnotation annotation : annotations) {
                if (annotation.getContents() != null
                        && annotation.getContents().equals("Sample Strikethrough")) {
                    annotationToRemove.add(annotation);
                }
            }
            annotations.removeAll(annotationToRemove);
        }

        document.save(new File(RESULT_FOLDER, "only_fields-removeLikeStephan.pdf"));
    }
}
 
Example #9
Source File: RemoveStrikeoutComment.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/45812696/pdfbox-delete-comment-maintain-strikethrough">
 * PDFBox delete comment maintain strikethrough
 * </a>
 * <br/>
 * <a href="https://expirebox.com/files/3d955e6df4ca5874c38dbf92fc43b5af.pdf">
 * only_fields.pdf
 * </a>
 * <a href="https://file.io/DTvqhC">
 * (alternative download)
 * </a>
 * <p>
 * The OP only wanted the comment removed, not the strike-through. Thus, we must
 * not remove the annotation but merely the comment building attributes.
 * </p>
 */
@Test
public void testRemoveLikeStephanImproved() throws IOException {
    final COSName POPUP = COSName.getPDFName("Popup");
    // Both the input stream and the document are closed when the block exits.
    try (InputStream resource = getClass().getResourceAsStream("only_fields.pdf");
            PDDocument document = Loader.loadPDF(resource)) {
        PDPageTree allPages = document.getDocumentCatalog().getPages();

        // Popup objects detached from strike-out annotations; accumulated across pages.
        List<COSObjectable> objectsToRemove = new ArrayList<>();

        for (int i = 0; i < allPages.getCount(); i++) {
            PDPage page = allPages.get(i);
            List<PDAnnotation> annotations = page.getAnnotations();

            for (PDAnnotation annotation : annotations) {
                if ("StrikeOut".equals(annotation.getSubtype()))
                {
                    COSDictionary annotationDict = annotation.getCOSObject();
                    // Read the popup entry before removing it from the dictionary.
                    COSBase popup = annotationDict.getItem(POPUP);
                    annotationDict.removeItem(POPUP);
                    annotationDict.removeItem(COSName.CONTENTS); // plain text comment
                    annotationDict.removeItem(COSName.RC);       // rich text comment
                    annotationDict.removeItem(COSName.T);        // author

                    if (popup != null)
                    {
                        objectsToRemove.add(popup);
                    }
                }
            }

            annotations.removeAll(objectsToRemove);
        }

        document.save(new File(RESULT_FOLDER, "only_fields-removeImproved.pdf"));
    }
}
 
Example #10
Source File: ShrinkPDF.java    From shrink-pdf with MIT License 5 votes vote down vote up
/**
 * Shrink a PDF.
 * <p>
 * Takes no arguments: the source is read from the {@code input} field, and the
 * {@code compQual} and {@code tiff} fields select the image writer settings.
 * A negative {@code compQual} is replaced by {@code compQualDefault} first.
 * Every page's resources are then handed to {@code scanResources}.
 *
 * @return The parsed {@code PDDocument} after its page resources were scanned
 * @throws FileNotFoundException
 * @throws IOException
 */
private PDDocument shrinkMe()
        throws FileNotFoundException, IOException {
    if (compQual < 0) {
        compQual = compQualDefault;
    }

    final PDFParser parser = new PDFParser(new RandomAccessBufferedFileInputStream(input));
    parser.parse();
    final PDDocument doc = parser.getPDDocument();
    final PDPageTree pages = doc.getPages();

    // The tiff flag picks writers by the "png" suffix; otherwise JPEG writers are used.
    final Iterator<ImageWriter> candidates = tiff
            ? ImageIO.getImageWritersBySuffix("png")
            : ImageIO.getImageWritersByFormatName("jpeg");
    final ImageWriter imgWriter = candidates.next();
    final ImageWriteParam iwp = imgWriter.getDefaultWriteParam();
    if (!tiff) {
        // Only the JPEG path sets an explicit compression quality.
        iwp.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
        iwp.setCompressionQuality(compQual);
    }

    for (PDPage page : pages) {
        scanResources(page.getResources(), doc, imgWriter, iwp);
    }
    return doc;
}
 
Example #11
Source File: PDDocumentCatalogBleach.java    From DocBleach with MIT License 4 votes vote down vote up
/**
 * Logs a trace message and hands every page of the tree to {@code sanitizePage}.
 *
 * @param pages the page tree to walk
 * @throws IOException if sanitizing a page fails
 */
private void sanitizePageActions(PDPageTree pages) throws IOException {
  LOGGER.trace("Checking Pages Actions");

  for (final PDPage currentPage : pages) {
    sanitizePage(currentPage);
  }
}
 
Example #12
Source File: PDFTextStripper.java    From gcs with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Resolves the start and end bookmarks to page numbers, then processes every page
 * of the tree that has contents.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages(PDPageTree pages) throws IOException
{
    // resolve the start bookmark; -1 = undefined
    PDPage firstMarked = null;
    if (startBookmark != null)
    {
        firstMarked = startBookmark.findDestinationPage(document);
    }
    startBookmarkPageNumber = firstMarked == null ? -1 : pages.indexOf(firstMarked) + 1;

    // resolve the end bookmark; -1 = undefined
    PDPage lastMarked = null;
    if (endBookmark != null)
    {
        lastMarked = endBookmark.findDestinationPage(document);
    }
    endBookmarkPageNumber = lastMarked == null ? -1 : pages.indexOf(lastMarked) + 1;

    boolean bothUnresolved = startBookmarkPageNumber == -1 && startBookmark != null
            && endBookmarkPageNumber == -1 && endBookmark != null;
    if (bothUnresolved && startBookmark.getCOSObject() == endBookmark.getCOSObject())
    {
        // special case: start and end bookmark are the same object but point
        // to nothing, so no text is to be extracted.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }

    for (PDPage current : pages)
    {
        currentPageNo++;
        if (!page_hasNoContents(current))
        {
            processPage(current);
        }
    }
}

// Tells whether the given page carries no content stream.
private static boolean page_hasNoContents(PDPage candidate)
{
    return !candidate.hasContents();
}
 
Example #13
Source File: DashboardUtil.java    From Insights with Apache License 2.0 2 votes vote down vote up
/**
 * Returns the document's page count minus one, which is the 0-based index of
 * the last page (and -1 when the document has no pages).
 * 
 * @param document the document whose pages are counted
 * @return {pageNum} page count minus one
 */
private static int getPages(PDDocument document) {
	final int pageCount = document.getPages().getCount();
	return pageCount - 1;
}