org.apache.pdfbox.pdmodel.PDDocument#getNumberOfPages

Source File: PdfBoxConverter.java From workable-converter with GNU General Public License v3.0

8 votes

/**
 * Renders every page of the PDF at {@code require.getWaitingFilePath()} to a 300 DPI RGB image
 * and writes each one as {@code <name-without-suffix>_<pageIndex>.png} below
 * {@code require.getDestConvertedPath()}.
 *
 * @param require conversion request supplying the source file path and the destination folder
 * @return {@code true} when all pages were written
 * @throws ConvertFailedException if loading, rendering or writing fails for any reason
 */
@Override
public boolean byFileFolder(ConvertRequire require) throws ConvertFailedException {
    logger.info("pdfbox convert by file folder begin(src file must be a pdf file) :");
    try {
        File srcFile = new File(require.getWaitingFilePath());
        // try-with-resources closes the document even when rendering or writing a page fails
        try (PDDocument document = PDDocument.load(srcFile)) {
            PDFRenderer renderer = new PDFRenderer(document);
            int pageCount = document.getNumberOfPages();
            for (int page = 0; page < pageCount; page++) {
                BufferedImage image = renderer.renderImageWithDPI(page, 300, ImageType.RGB);
                String savename = require.getDestConvertedPath() + "/" + SuffixTool.deleteSuffix(srcFile.getName()) + "_" + page + ".png";
                ImageIOUtil.writeImage(image, savename, 300);
            }
        }
    } catch (Exception e) {
        throw new ConvertFailedException(e.getMessage());
    }
    logger.info("pdf box convert by filepath success");
    return true;
}

Source File: PdfBoxConverter.java From workable-converter with GNU General Public License v3.0

6 votes

/**
 * Converts a base64 encoded PDF into PNG pictures, one per page.
 * <p>
 * The base64 payload is first stored as a temporary {@code .pdf} file below
 * {@code params.getTmpPath()}. Each page is rendered at 300 DPI, written next to it as
 * {@code <uuid>_<pageIndex>.png}, read back as base64 and collected into
 * {@code require.setDestBase64s(...)}.
 * </p>
 *
 * @param require conversion request carrying the source base64 and receiving the results
 * @return {@code true} when all pages were converted
 * @throws ConvertFailedException if storing, loading, rendering or encoding fails
 */
@Override
public boolean byBase64 (ConvertRequire require) throws ConvertFailedException {
    logger.info("pdfbox convert by base64 begin(src file must be a pdf file):");
    try {
        String srcFileTmpName = StrRandomTool.getUuid(true) + ".pdf";
        Base64FileTool.saveBase64File(require.getSrcBase64(), params.getTmpPath() + "/" + srcFileTmpName);
        File srcFile = new File(params.getTmpPath() + "/" + srcFileTmpName);
        // NOTE(review): the temporary pdf/png files are not removed here — confirm something else cleans the tmp path
        // try-with-resources closes the document even when a page fails to render or encode
        try (PDDocument document = PDDocument.load(srcFile)) {
            PDFRenderer renderer = new PDFRenderer(document);
            List<String> results = new LinkedList<>();
            int pageCount = document.getNumberOfPages();
            for (int page = 0; page < pageCount; page++) {
                BufferedImage image = renderer.renderImageWithDPI(page, 300, ImageType.RGB);
                String savePath = params.getTmpPath() + "/" + SuffixTool.deleteSuffix(srcFile.getName()) + "_" + page + ".png";
                ImageIOUtil.writeImage(image, savePath, 300);
                results.add(Base64FileTool.filePathToBase64(savePath));
            }
            require.setDestBase64s(results);
        }
    } catch (Exception e) {
        throw new ConvertFailedException(e.getMessage());
    }
    logger.info("pdf box convert by base64 success");
    return true;
}

Source File: PdfBoxUtilities.java From tess4j with Apache License 2.0

6 votes

/**
 * Counts the pages of a PDF file.
 *
 * @param inputPdfFile the PDF file to inspect
 * @return the page count, or {@code -1} when the file could not be loaded
 */
public static int getPdfPageCount(File inputPdfFile) {
    int pageCount;
    PDDocument pdf = null;
    try {
        pdf = PDDocument.load(inputPdfFile);
        pageCount = pdf.getNumberOfPages();
    } catch (IOException ioe) {
        logger.error("Error counting PDF pages => " + ioe);
        pageCount = -1;
    } finally {
        if (pdf != null) {
            try {
                pdf.close();
            } catch (Exception ignored) {
                // best effort: a failure while closing must not hide the result
            }
        }
    }
    return pageCount;
}

Source File: PdfComparator.java From pdfcompare with Apache License 2.0

6 votes

/**
 * Adds the pages that exist in only one of the two documents to the compare result.
 * <p>
 * Every page from index {@code minPageCount} onwards is rendered, marked with {@code color}
 * (the first {@code width * MARKER_WIDTH} buffer elements plus the first {@code MARKER_WIDTH}
 * elements of every row — assuming one buffer element per pixel in row-major order) and
 * registered together with a blank counterpart.
 * </p>
 *
 * @param document     document that has the surplus pages
 * @param pdfRenderer  renderer for {@code document}
 * @param minPageCount index of the first surplus page
 * @param color        value written into the marker elements
 * @param expected     {@code true} if the rendered page goes into the first image slot of
 *                     {@code addPage} and the blank one into the second, {@code false} for the reverse
 * @throws IOException if rendering or registering a page fails
 */
private void addExtraPages(final PDDocument document, final PDFRenderer pdfRenderer, final int minPageCount,
        final int color, final boolean expected) throws IOException {
    int pageIndex = minPageCount;
    while (pageIndex < document.getNumberOfPages()) {
        final ImageWithDimension rendered = renderPageAsImage(document, pdfRenderer, pageIndex, environment);
        final DataBuffer pixels = rendered.bufferedImage.getRaster().getDataBuffer();
        final int width = rendered.bufferedImage.getWidth();
        final int height = rendered.bufferedImage.getHeight();

        // marker band at the start of the buffer
        for (int offset = 0; offset < width * MARKER_WIDTH; offset++) {
            pixels.setElem(offset, color);
        }
        // marker band at the start of every row
        for (int row = 0; row < height; row++) {
            final int rowStart = row * width;
            for (int column = 0; column < MARKER_WIDTH; column++) {
                pixels.setElem(rowStart + column, color);
            }
        }

        final PageDiffCalculator diffCalculator = new PageDiffCalculator(new PageArea(pageIndex + 1));
        if (expected) {
            compareResult.addPage(diffCalculator, pageIndex, rendered, blank(rendered), rendered);
        } else {
            compareResult.addPage(diffCalculator, pageIndex, blank(rendered), rendered, rendered);
        }
        pageIndex++;
    }
}

Source File: PdfComparator.java From pdfcompare with Apache License 2.0

6 votes

/**
 * Compares two documents page by page.
 * <p>
 * Pages present in both documents are handed to {@code drawImage}; once the latch has been
 * awaited and the executors are shut down, the surplus pages of the longer document (if any)
 * are added through {@code addExtraPages}.
 * </p>
 *
 * @param expectedDocument the reference document
 * @param actualDocument   the document under test
 * @throws IOException if rendering a page fails
 */
private void compare(final PDDocument expectedDocument, final PDDocument actualDocument) throws IOException {
    expectedDocument.setResourceCache(new ResourceCacheWithLimitedImages(environment));
    final PDFRenderer expectedRenderer = new PDFRenderer(expectedDocument);

    actualDocument.setResourceCache(new ResourceCacheWithLimitedImages(environment));
    final PDFRenderer actualRenderer = new PDFRenderer(actualDocument);

    final int expectedPages = expectedDocument.getNumberOfPages();
    final int actualPages = actualDocument.getNumberOfPages();
    final int minPageCount = Math.min(expectedPages, actualPages);

    // one latch count per page that exists in both documents
    final CountDownLatch latch = new CountDownLatch(minPageCount);
    int pageIndex = 0;
    while (pageIndex < minPageCount) {
        drawImage(latch, pageIndex, expectedDocument, actualDocument, expectedRenderer, actualRenderer);
        pageIndex++;
    }
    Utilities.await(latch, "FullCompare", environment);
    Utilities.shutdownAndAwaitTermination(drawExecutor, "Draw");
    Utilities.shutdownAndAwaitTermination(parrallelDrawExecutor, "Parallel Draw");
    Utilities.shutdownAndAwaitTermination(diffExecutor, "Diff");

    // at most one of the documents can be longer than the common prefix
    if (expectedPages > minPageCount) {
        addExtraPages(expectedDocument, expectedRenderer, minPageCount, environment.getActualColor().getRGB(), true);
    } else if (actualPages > minPageCount) {
        addExtraPages(actualDocument, actualRenderer, minPageCount, environment.getExpectedColor().getRGB(), false);
    }
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

6 votes

/**
 * <a href="https://stackoverflow.com/questions/45895768/pdfbox-2-0-7-extracttext-not-working-but-1-8-13-does-and-pdfreader-as-well">
 * PDFBox 2.0.7 ExtractText not working but 1.8.13 does and PDFReader as well
 * </a>
 * <br/>
 * <a href="https://wetransfer.com/downloads/214674449c23713ee481c5a8f529418320170827201941/b2bea6">
 * test-2.pdf
 * </a>
 * <p>
 * Due to the broken <b>ToUnicode</b> maps the output of immediate text
 * extraction from this document is unsatisfying, cf. {@link #testTest2()}.
 * It can be improved by removing these <b>ToUnicode</b> maps as this test
 * shows.
 * </p>
 * <p>
 * The extracted text is printed to standard output and written to
 * <code>test-2_NoToUnicode.txt</code> in the result folder.
 * </p>
 */
@Test
public void testNoToUnicodeTest2() throws IOException
{
    // the document is a resource too: it is closed before the underlying stream
    try (   InputStream resource = getClass().getResourceAsStream("test-2.pdf");
            PDDocument document = Loader.loadPDF(resource)    )
    {
        for (int pageNr = 0; pageNr < document.getNumberOfPages(); pageNr++)
        {
            PDPage page = document.getPage(pageNr);
            PDResources resources = page.getResources();
            removeToUnicodeMaps(resources);
        }

        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        System.out.printf("\n*\n* test-2.pdf without ToUnicode\n*\n%s\n", text);
        Files.write(new File(RESULT_FOLDER, "test-2_NoToUnicode.txt").toPath(), Collections.singleton(text));
    }
}

Source File: ImageExtractor.java From inception with Apache License 2.0

6 votes

/**
 * Extracts the images of all pages of a PDF file and stores them as PNG files.
 * <p>
 * Output files are named {@code <baseName>_<n>.png}, where {@code baseName} is the input file
 * name without its extension and {@code n} is a counter starting at 1 that runs across all pages.
 * </p>
 *
 * @param inFile the PDF file to read
 * @param dpi    resolution handed to the {@code RegionExtractor}
 * @param outDir directory the PNG files are written to
 * @throws IOException if the PDF cannot be loaded or an image cannot be extracted or written
 */
static void processFile(File inFile, int dpi, String outDir) throws IOException
{
    // a file name without any '.' is used as-is instead of failing in substring(0, -1)
    String fileName = inFile.getName();
    int extensionStart = fileName.lastIndexOf('.');
    String baseName = extensionStart < 0 ? fileName : fileName.substring(0, extensionStart);

    // try-with-resources closes the document no matter where the extraction fails
    try (PDDocument doc = PDDocument.load(inFile)) {
        RegionExtractor regionExt = new RegionExtractor(doc, dpi);
        int count = 1;
        for (int pageIndex = 0; pageIndex < doc.getNumberOfPages(); pageIndex++) {
            for (ImageOperator op : ImageExtractor.extract(doc.getPage(pageIndex))) {
                RenderedImage image = regionExt.extract(pageIndex, op.x, op.y, op.w, op.h);
                String outFileName = baseName + "_" + count + ".png";
                ImageIO.write(image, "png", new File(outDir, outFileName));
                System.out.println(outFileName + " is saved.");
                count++;
            }
        }
    }
}

Source File: ConvertTest.java From blog-codes with Apache License 2.0

6 votes

/**
 * Renders every page of a fixed PDF file to a 300 DPI RGB PNG file and prints a progress
 * line every 100 pages.
 *
 * @param args unused
 * @throws InvalidPasswordException if the PDF is password protected
 * @throws IOException if the PDF cannot be read or an image cannot be written
 */
public static void main(String[] args) throws InvalidPasswordException, IOException {
		// try-with-resources releases the document even when a page fails to render
		try (PDDocument document = PDDocument.load(new File("/home/lili/data/testen.pdf"))) {
			PDFRenderer pdfRenderer = new PDFRenderer(document);
			int pageCount = document.getNumberOfPages();
			for (int page = 0; page < pageCount; ++page) {
				if (page > 0 && page % 100 == 0) {
					System.out.println("page: "+page);
				}
				BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);

				// suffix in filename will be used as the file format
				ImageIOUtil.writeImage(bim, "/home/lili/data/testen-" + (page + 1) + ".png", 300);
			}
		}
	}

Source File: WaterMarkConverter.java From workable-converter with GNU General Public License v3.0

6 votes

/**
 * Adds the watermark described by {@code require.getWaterMarkRequire()} as a background
 * overlay to the PDF read from {@code require.getSrcStream()} and saves the result to
 * {@code require.getDestStream()}.
 *
 * @param require conversion request with source stream, destination stream and watermark settings
 * @return {@code true} when the watermarked PDF was saved
 * @throws ConvertFailedException if loading, overlaying or saving fails with an {@link IOException}
 */
@Override
public boolean byStream(ConvertRequire require) throws ConvertFailedException {
    // both the document and the overlay hold open resources; close them on every path
    try (PDDocument pdfFile = PDDocument.load(require.getSrcStream());
         Overlay overlay = new Overlay()) {
        HashMap<Integer, String> overlayGuide = new HashMap<>();

        String tmpName = this.getTmpName(require.getWaterMarkRequire());
        //0 means add watermark in all page
        if (require.getWaterMarkRequire().getWaterMarkPage() == 0) {
            int pageCount = pdfFile.getNumberOfPages();
            // overlay guide keys are page numbers starting at 1
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                overlayGuide.put(pageNumber, tmpName);
            }
        } else {
            overlayGuide.put(require.getWaterMarkRequire().getWaterMarkPage(), tmpName);
        }
        overlay.setInputPDF(pdfFile);
        overlay.setOverlayPosition(Overlay.Position.BACKGROUND);
        overlay.overlay(overlayGuide);
        // save before the overlay documents are closed at the end of this block
        pdfFile.save(require.getDestStream());
    } catch (IOException e) {
        throw new ConvertFailedException(e.getMessage());
    }
    return true;
}

Source File: PDFPresentation.java From Quelea with GNU General Public License v3.0

5 votes

/**
 * Make the slides that go in this PDF, this is what takes time and should
 * only be done once.
 * <p>
 * One {@code PdfSlide} is created per page, numbered from 1. The document is closed
 * before this method returns, also when creating a slide fails.
 * </p>
 *
 * @return all the slides.
 * @throws IOException if the PDF cannot be loaded, the temporary directory cannot be
 * created, a slide cannot be built or the document cannot be closed
 */
private PdfSlide[] makeSlides() throws IOException {
    File pdf = new File(file);
    // try-with-resources: the document no longer leaks when a later step throws
    // NOTE(review): the renderer handed to each PdfSlide is backed by this document, which is
    // closed on return — confirm PdfSlide renders eagerly in its constructor
    try (PDDocument document = PDDocument.load(pdf.getAbsoluteFile())) {
        // NOTE(review): this temporary directory is not used anywhere in this method — confirm it is still needed
        Path f = Files.createTempDirectory(null);
        f.toFile().deleteOnExit();
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        int totalPages = document.getNumberOfPages();
        List<PdfSlide> ret = new ArrayList<>(totalPages);
        for (int i = 0; i < totalPages; i++) {
            ret.add(new PdfSlide(i + 1, pdfRenderer));
        }
        return ret.toArray(new PdfSlide[ret.size()]);
    }
}

Source File: SurvivorSongbookParser.java From Quelea with GNU General Public License v3.0

5 votes

/**
 * Get all the songs in the PDF document.
 * <p>
 * Pages without text are skipped. The text of consecutive pages is collected until a page
 * is reached that neither contains "(1 of" nor is followed by a page containing "(2 of";
 * the collected parts are then turned into one song via {@code processSong}.
 * </p>
 *
 * @param location the PDF file to parse
 * @param statusPanel not used by this implementation
 * @return a list of all the songs, never null.
 * @throws IOException if something went wrong.
 */
@Override
public List<SongDisplayable> getSongs(File location, StatusPanel statusPanel) throws IOException {
    List<SongDisplayable> pdfSongs = new ArrayList<>();
    // try-with-resources closes the document even if text extraction throws
    try (PDDocument document = PDDocument.load(location)) {
        PDFTextStripper stripper = new PDFTextStripper();
        List<String> songParts = new ArrayList<>();
        int pageCount = document.getNumberOfPages();
        for (int i = 0; i < pageCount; i++) {
            String pageText = getPageText(document, stripper, i);
            if (pageText.trim().isEmpty()) {
                continue;
            }
            songParts.add(pageText);
            boolean twoPart = pageText.contains("(1 of");
            if (i < pageCount - 1) { //This section in case the original (1 of x) is missed out
                String nextPageText = getPageText(document, stripper, i + 1);
                if (nextPageText.contains("(2 of")) {
                    twoPart = true;
                }
            }
            if (!twoPart) {
                SongDisplayable song = processSong(songParts.toArray(new String[songParts.size()]));
                if (song != null) {
                    pdfSongs.add(song);
                }
                songParts.clear();
            }
        }
    }
    return pdfSongs;
}

Source File: PDFCreator.java From Knowage-Server with GNU Affero General Public License v3.0

5 votes

/**
 * Writes a "Page X of Y" footer onto the pages of a document.
 * <p>
 * The first and/or the last page are skipped when the given {@code PageNumbering} excludes
 * them. For every numbered page a white filled rectangle is drawn first and the footer text
 * is written on top of it; pages rotated by 90 or 270 degrees use swapped width/height.
 * </p>
 *
 * @param doc           document whose pages receive the footer
 * @param font          font used for the footer text and its measurements
 * @param fontSize      font size of the footer text
 * @param pageNumbering tells whether the first and the last page are numbered
 * @throws IOException if measuring the text or writing to a page content stream fails
 */
private static void writePageNumbering(PDDocument doc, PDFont font, float fontSize, PageNumbering pageNumbering) throws IOException {
	final int totalPages = doc.getNumberOfPages();
	final int endIndex = pageNumbering.isLastIncluded() ? totalPages : totalPages - 1;
	final int firstIndex = pageNumbering.isFirstIncluded() ? 0 : 1;

	for (int index = firstIndex; index < endIndex; index++) {
		final String footer = "Page " + (index + 1) + " of " + totalPages;
		final PDPage page = doc.getPage(index);
		final PDRectangle mediaBox = page.getMediaBox();
		final float textWidth = font.getStringWidth(footer) * fontSize / 1000f;
		final float textHeight = font.getFontDescriptor().getFontBoundingBox().getHeight() * fontSize / 1000f;

		final int rotation = page.getRotation();
		final boolean rotate = rotation == 90 || rotation == 270;

		final float startX;
		final float startY;
		if (rotate) {
			// rotated page: effective width/height are the swapped media box dimensions
			final float pageWidth = mediaBox.getHeight();
			final float pageHeight = mediaBox.getWidth();
			startX = pageHeight / 2f;
			startY = pageWidth - textWidth;
		} else {
			final float pageWidth = mediaBox.getWidth();
			startX = (pageWidth - textWidth - textHeight) / 2f;
			startY = textHeight;
		}

		// append to the existing content of the page instead of replacing it
		try (PDPageContentStream stream = new PDPageContentStream(doc, page, AppendMode.APPEND, true, true)) {
			// white fill colour for the rectangle behind the footer
			stream.setNonStrokingColor(255, 255, 255);
			final java.awt.Rectangle background = new java.awt.Rectangle((int) startX, (int) startY - 3, (int) textWidth + 2, (int) textHeight);
			drawRect(stream, Color.WHITE, background, true);
			writeText(stream, new Color(4, 44, 86), font, fontSize, rotate, startX, startY, footer);
		}
	}
}

Source File: WaterMarkConverter.java From workable-converter with GNU General Public License v3.0

5 votes

/**
 * Adds the watermark described by {@code require.getWaterMarkRequire()} as a background
 * overlay to a base64 encoded PDF and stores the watermarked PDF, again base64 encoded,
 * via {@code require.setDestBase64(...)}.
 *
 * @param require conversion request with the source base64 and the watermark settings
 * @return {@code true} when the watermarked PDF was produced
 * @throws ConvertFailedException if storing, loading, overlaying or saving fails with an {@link IOException}
 */
@Override
public boolean byBase64(ConvertRequire require) throws ConvertFailedException {
    try {
        String tmpFileName = this.saveTmpFileByBase64(require.getSrcBase64(), "pdf");
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        // both the document and the overlay hold open resources; close them on every path
        try (PDDocument pdfFile = PDDocument.load(new File(tmpFileName));
             Overlay overlay = new Overlay()) {
            HashMap<Integer, String> overlayGuide = new HashMap<>();
            String tmpName = this.getTmpName(require.getWaterMarkRequire());
            //0 means add watermark in all page
            if (require.getWaterMarkRequire().getWaterMarkPage() == 0) {
                int pageCount = pdfFile.getNumberOfPages();
                // overlay guide keys are page numbers starting at 1
                for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                    overlayGuide.put(pageNumber, tmpName);
                }
            } else {
                overlayGuide.put(require.getWaterMarkRequire().getWaterMarkPage(), tmpName);
            }
            overlay.setInputPDF(pdfFile);
            overlay.setOverlayPosition(Overlay.Position.BACKGROUND);
            overlay.overlay(overlayGuide);
            // save before the overlay documents are closed at the end of this block
            pdfFile.save(byteArrayOutputStream);
            require.setDestBase64(Base64FileTool.ByteArrayToBase64(byteArrayOutputStream.toByteArray()));
        }
    } catch (IOException e) {
        throw new ConvertFailedException(e.getMessage());
    }
    return true;
}

Source File: PDF2ImageExample.java From tutorials with MIT License

5 votes

/**
 * Renders every page of a PDF file to a 300 DPI RGB image and writes it to
 * {@code src/output/pdf-<pageNumber>.<extension>}, with page numbers starting at 1.
 *
 * @param filename  path of the PDF file to render
 * @param extension file extension of the images to write
 * @throws IOException if the PDF cannot be loaded, rendered or an image cannot be written
 */
private static void generateImageFromPDF(String filename, String extension) throws IOException {
	// try-with-resources closes the document even when rendering or writing a page fails
	try (PDDocument document = PDDocument.load(new File(filename))) {
		PDFRenderer pdfRenderer = new PDFRenderer(document);
		int pageCount = document.getNumberOfPages();
		for (int page = 0; page < pageCount; ++page) {
			BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
			ImageIOUtil.writeImage(bim, String.format("src/output/pdf-%d.%s", page + 1, extension), 300);
		}
	}
}

Source File: WaterMarkConverter.java From workable-converter with GNU General Public License v3.0

5 votes

/**
 * Adds the watermark described by {@code require.getWaterMarkRequire()} as a background
 * overlay to the PDF at {@code require.getWaitingFilePath()} and saves the result to
 * {@code require.getResultFilePath()}.
 *
 * @param require conversion request with source path, result path and watermark settings
 * @return {@code true} when the watermarked PDF was saved
 * @throws ConvertFailedException if loading, overlaying or saving fails with an {@link IOException}
 */
@Override
public boolean byFilePath(ConvertRequire require) throws ConvertFailedException {
    // both the document and the overlay hold open resources; close them on every path
    try (PDDocument pdfFile = PDDocument.load(new File(require.getWaitingFilePath()));
         Overlay overlay = new Overlay()) {
        HashMap<Integer, String> overlayGuide = new HashMap<>();

        String tmpName = this.getTmpName(require.getWaterMarkRequire());

        //0 means add watermark in all page
        if (require.getWaterMarkRequire().getWaterMarkPage() == 0) {
            int pageCount = pdfFile.getNumberOfPages();
            // overlay guide keys are page numbers starting at 1
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                overlayGuide.put(pageNumber, tmpName);
            }
        } else {
            overlayGuide.put(require.getWaterMarkRequire().getWaterMarkPage(), tmpName);
        }
        overlay.setInputPDF(pdfFile);
        overlay.setOverlayPosition(Overlay.Position.BACKGROUND);
        overlay.overlay(overlayGuide);
        // save before the overlay documents are closed at the end of this block
        pdfFile.save(require.getResultFilePath());
    } catch ( IOException e) {
        throw new ConvertFailedException(e.getMessage());
    }

    return true;
}

Source File: ConvertToImages.java From blog-codes with Apache License 2.0

4 votes

/**
 * Renders every page of a fixed PDF file to a 300 DPI RGB PNG file named
 * {@code <pageNumber>.png} (page numbers starting at 1) inside a fixed output directory,
 * which is created first if necessary.
 *
 * @param args unused
 * @throws InvalidPasswordException if the PDF is password protected
 * @throws IOException if the PDF cannot be read or an image cannot be written
 */
public static void main(String[] args) throws InvalidPasswordException, IOException {
	System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider"); 

	String path="/home/lili/下载/books/汪曾祺全集1.pdf";
	String dir="/home/lili/data/wang/book1";
	new File(dir).mkdirs();
	// try-with-resources closes the document even when rendering or writing a page fails
	try (PDDocument document = PDDocument.load(new File(path))) {
		PDFRenderer pdfRenderer = new PDFRenderer(document);
		int pageCount = document.getNumberOfPages();
		for (int page = 0; page < pageCount; ++page) {
			BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
			ImageIOUtil.writeImage(bim, dir + "/" + (page + 1) + ".png", 300);
		}
	}
}

Source File: TxtCreator.java From pdf-converter with Apache License 2.0

4 votes

/**
 * Extracts the text of a PDF page by page and writes it to a text file.
 * <p>
 * A temporary scratch file is used while loading the PDF and is deleted afterwards.
 * An {@link IOException} is logged as a warning and not propagated.
 * </p>
 *
 * @param pdf    the PDF file to read
 * @param output the text file to write
 */
public void process(File pdf, File output){
    File tmpfile = null;
    try {
        tmpfile = File.createTempFile(String.format("txttmp-%s", UUID.randomUUID().toString()), null);
        // every resource gets its own finally, so a failure while opening or closing one of
        // them can no longer leave the others open
        RandomAccessFile raf = new RandomAccessFile(tmpfile, "rw");
        try {
            PDDocument pdDoc = PDDocument.loadNonSeq(pdf, raf);
            try {
                FileWriter writer = new FileWriter(output);
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    int numberOfPages = pdDoc.getNumberOfPages();

                    // the stripper's start/end pages are set to 1-based page numbers
                    for (int pageNumber = 1; pageNumber <= numberOfPages; pageNumber++) {
                        stripper.setStartPage(pageNumber);
                        stripper.setEndPage(pageNumber);
                        writer.write(stripper.getText(pdDoc));
                        writer.flush();
                    }
                } finally {
                    writer.close();
                }
            } finally {
                pdDoc.close();
            }
        } finally {
            raf.close();
        }
    } catch (IOException ioe) {
        log.warn(String.format("Failed to create txt for file: %s", pdf.getName()), ioe);
    } finally {
        // delete the scratch file last, after everything using it has been closed
        if (tmpfile != null) {
            tmpfile.delete();
        }
    }
}

Source File: PdfBoxUtilities.java From tess4j with Apache License 2.0

4 votes

/**
 * Converts PDF to PNG format.
 * <p>
 * Every page is rendered at 300 DPI into a file {@code workingimageNNNN.png} inside a new
 * temporary directory. A failure while loading or rendering is logged, not thrown; whatever
 * images were written up to that point are still returned.
 * </p>
 *
 * @param inputPdfFile input file
 * @return an array of PNG images sorted by file name; empty if no image could be produced
 * @throws java.io.IOException if the temporary directory cannot be created
 */
public static File[] convertPdf2Png(File inputPdfFile) throws IOException {
    Path path = Files.createTempDirectory("tessimages");
    File imageDir = path.toFile();

    PDDocument document = null;
    try {
        document = PDDocument.load(inputPdfFile);
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        for (int page = 0; page < document.getNumberOfPages(); ++page) {
            BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);

            // suffix in filename will be used as the file format
            String filename = String.format("workingimage%04d.png", page + 1);
            ImageIOUtil.writeImage(bim, new File(imageDir, filename).getAbsolutePath(), 300);
        }
    } catch (IOException ioe) {
        logger.error("Error extracting PDF Document => " + ioe);
    } finally {
        // close the document first so that the directory clean-up cannot prevent it
        if (document != null) {
            try {
                document.close();
            } catch (Exception ignored) {
                // best effort: a failure while closing must not hide the result
            }
        }

        // list() returns null if the directory cannot be read
        String[] produced = imageDir.list();
        if (produced == null || produced.length == 0) {
            imageDir.delete();
        }
    }

    // find working files
    File[] workingFiles = imageDir.listFiles(new FilenameFilter() {

        @Override
        public boolean accept(File dir, String name) {
            return name.toLowerCase().matches("workingimage\\d{4}\\.png$");
        }
    });

    // listFiles() returns null when the (empty) directory was removed above
    if (workingFiles == null) {
        return new File[0];
    }

    Arrays.sort(workingFiles, new Comparator<File>() {
        @Override
        public int compare(File f1, File f2) {
            return f1.getName().compareTo(f2.getName());
        }
    });

    return workingFiles;
}

Source File: PageExtractor.java From gcs with Mozilla Public License 2.0

4 votes

/**
 * Creates a page extractor for the given document. The end page is initialised
 * with the number of pages of that document.
 *
 * @param sourceDocument The document to split.
 */
public PageExtractor(PDDocument sourceDocument)
{
    this.sourceDocument = sourceDocument;
    this.endPage = sourceDocument.getNumberOfPages();
}

Source File: ExtractText.java From testarea-pdfbox2 with Apache License 2.0

4 votes

/**
 * Extracts the text inside a rectangular region of one page of a PDF file.
 *
 * @param pdfLocation path of the PDF file; must end with ".pdf", otherwise nothing is extracted
 * @param pageNumber  page selector; the page at index {@code pageNumber + 1} is read
 * @param x           x coordinate of the region
 * @param y           y coordinate of the region
 * @param width       width of the region
 * @param height      height of the region
 * @return the text found in the region, or an empty string if the location does not end
 *         with ".pdf" or an {@link IOException} occurred while loading or extracting
 * @throws IOException if closing the document fails
 * @see #testUiPathTutorial()
 * @author Venkatachalam Neelakantan
 */
public String getTextUsingPositionsUsingPdf(String pdfLocation, int pageNumber, double x, double y, double width,
        double height) throws IOException {
    String extractedText = "";
    PDDocument document = null;
    PDPage page = null;
    try {
        if (pdfLocation.endsWith(".pdf")) {
            document = Loader.loadPDF(new File(pdfLocation));
            int getDocumentPageCount = document.getNumberOfPages();
            System.out.println(getDocumentPageCount);

            // NOTE(review): getPage takes an index starting at 0 (see the original comment),
            // so "pageNumber + 1" looks off — confirm the intended meaning of pageNumber
            // with the callers before changing it
            if (getDocumentPageCount > 0) {
                page = document.getPage(pageNumber + 1);
            } else if (getDocumentPageCount == 0) {
                page = document.getPage(0);
            }
            // To create a rectangle by passing the x axis, y axis, width and height
            Rectangle2D rect = new Rectangle2D.Double(x, y, width, height);
            String regionName = "region1";

            // Strip the text of the named region from the page with PDFTextStripperByArea
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            stripper.addRegion(regionName, rect);
            stripper.extractRegions(page);
            System.out.println("Region is " + stripper.getTextForRegion("region1"));
            extractedText = stripper.getTextForRegion("region1");
        } else {
            System.out.println("No data return");
        }
    } catch (IOException e) {
        System.out.println("The file  not found" + "");
    } finally {
        // the document is still null when the location was rejected or loading failed
        if (document != null) {
            document.close();
        }
    }
    // Return the extracted text and this can be used for assertion
    return extractedText;
}

Java Code Examples for org.apache.pdfbox.pdmodel.PDDocument#getNumberOfPages()