Java Code Examples for org.apache.pdfbox.pdmodel.PDDocument#close()

The following examples show how to use org.apache.pdfbox.pdmodel.PDDocument#close(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: TitleBlockWriterTest.java    From eplmp with Eclipse Public License 1.0 6 votes vote down vote up
/**
 * Checks that the title block written for a part iteration can be loaded as a PDF
 * and that its extracted text is non-empty and contains the user login, the part
 * number and the part revision description.
 *
 * @throws Exception if the title block cannot be created, loaded or read
 */
@Test
public void createTitleBlockForPartIterationTest() throws Exception {
    PartTitleBlockData partTitleBlockData = new PartTitleBlockData(partIteration, new Locale("en"));
    byte[] titleBlock = new TitleBlockWriter(partTitleBlockData).createTitleBlock();

    String text;
    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument loadedDocument = PDDocument.load(titleBlock)) {
        Assert.assertNotNull(loadedDocument);
        text = new PDFTextStripper().getText(loadedDocument);
    }

    Assert.assertFalse(text.isEmpty());
    Assert.assertTrue(text.contains(user.getLogin()));
    Assert.assertTrue(text.contains(partIteration.getNumber()));
    Assert.assertTrue(text.contains(partIteration.getPartRevision().getDescription()));
}
 
Example 2
Source File: ColorsProcessor.java    From asciidoctorj with Apache License 2.0 6 votes vote down vote up
/**
 * Parses a document, extracting the colors for the words specified in
 * the constructor. Every page that has a content stream is handed to
 * {@code processStream}; pages without one are skipped.
 *
 * @param filename PDF document path
 * @throws IOException if the document cannot be loaded, processed or closed
 */
public void parse(String filename) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(filename, false);
        final List pages = document.getDocumentCatalog().getAllPages();
        for (Object entry : pages) {
            final PDPage page = (PDPage) entry;
            if (page.getContents() == null) {
                // Nothing to process on a page without a content stream.
                continue;
            }
            processStream(page, page.getResources(), page.getContents().getStream());
        }
    } finally {
        // The document stays null when loading failed, so guard the close.
        if (document != null) {
            document.close();
        }
    }
}
 
Example 3
Source File: ImageProcessor.java    From asciidoctorj-pdf with Apache License 2.0 6 votes vote down vote up
/**
 * Parses a document extracting the images.
 * <p>
 * Records the index of the page being processed in {@code currentPage} and hands
 * each page that has a content stream to {@code processStream}. Pages without a
 * content stream are skipped instead of failing with a NullPointerException.
 * </p>
 *
 * @param filename PDF document path
 * @throws IOException if the document cannot be loaded, processed or closed
 */
public void parse(String filename) throws IOException {
    PDDocument document = null;
    try {
        document = PDDocument.load(filename, false);
        List allPages = document.getDocumentCatalog().getAllPages();
        for (int i = 0; i < allPages.size(); i++) {
            PDPage page = (PDPage) allPages.get(i);
            currentPage = i;
            // A page may have no content stream at all; there is nothing to process then.
            if (page.getContents() != null) {
                processStream(page, page.findResources(), page.getContents().getStream());
            }
        }
    } finally {
        // The document stays null when loading failed, so guard the close.
        if (document != null) {
            document.close();
        }
    }
}
 
Example 4
Source File: PdfBoxUtilities.java    From tess4j with Apache License 2.0 6 votes vote down vote up
/**
 * Gets PDF Page Count.
 *
 * @param inputPdfFile input file
 * @return number of pages, or {@code -1} if the document could not be loaded
 */
public static int getPdfPageCount(File inputPdfFile) {
    PDDocument document = null;
    try {
        document = PDDocument.load(inputPdfFile);
        return document.getNumberOfPages();
    } catch (IOException ioe) {
        logger.error("Error counting PDF pages => " + ioe);
        return -1;
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                // Closing is best effort and must not change the result,
                // but the failure is reported instead of being dropped silently.
                logger.error("Error closing PDF document => " + e);
            }
        }
    }
}
 
Example 5
Source File: FlattenAndMerge.java    From testarea-pdfbox2 with Apache License 2.0 6 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/47140209/pdfbox-files-are-sharing-common-cosstream-after-flatten">
 * PDFBox files are sharing common COSStream after flatten
 * </a>
 * <br/>
 * <a href="https://studentloans.gov/myDirectLoan/downloadForm.action?searchType=library&shortName=general&localeCode=en-us">
 * GeneralForbearance.pdf
 * </a>
 * <p>
 * Indeed, flattening, merging, and early closing of source documents
 * do not mingle well.
 * </p>
 * <p>
 * The test loads the same form twice, flattens the second copy, appends it to
 * the first one, closes the second copy and only then saves the first one.
 * The order of these statements is the point of the test, so it must be kept.
 * </p>
 */
@Test
public void testMergeGovernmentForms() throws IOException {
    // Two independent streams of the same resource: one per loaded document.
    try (   InputStream resource1 = getClass().getResourceAsStream("GeneralForbearance.pdf");
            InputStream resource2 = getClass().getResourceAsStream("GeneralForbearance.pdf")) {
        PDDocument destination = Loader.loadPDF(resource1);

        PDDocument source = Loader.loadPDF(resource2);
        source.getDocumentCatalog().getAcroForm().flatten(); //comment out just this line and the destination.save will pass

        PDFMergerUtility appender = new PDFMergerUtility();

        appender.appendDocument(destination, source);

        // The source is deliberately closed BEFORE the destination is saved.
        source.close(); //comment out just this line and the destination.save will pass

        destination.save(new File(RESULT_FOLDER, "PrintMergeIssue.pdf"));
        destination.close();
    }
}
 
Example 6
Source File: ImageExtractor.java    From inception with Apache License 2.0 6 votes vote down vote up
/**
 * Renders the region of every image operator found on the pages of a PDF file and
 * writes each region as a PNG file named {@code <baseName>_<n>.png}, where
 * {@code baseName} is the input file name without its extension and {@code n}
 * counts up from 1 across all pages.
 *
 * @param inFile the PDF file to read
 * @param dpi    resolution handed to the {@code RegionExtractor}
 * @param outDir directory the PNG files are written to
 * @throws IOException if the document cannot be loaded or an image cannot be written
 */
static void processFile(File inFile, int dpi, String outDir) throws IOException
{
    // A file name without a '.' is used unchanged; substring(0, -1) would throw.
    String fileName = inFile.getName();
    int dot = fileName.lastIndexOf('.');
    String baseName = dot < 0 ? fileName : fileName.substring(0, dot);

    // try-with-resources closes the document on every exit path.
    try (PDDocument doc = PDDocument.load(inFile)) {
        RegionExtractor regionExt = new RegionExtractor(doc, dpi);
        int count = 1;
        for (int pageIndex = 0; pageIndex < doc.getNumberOfPages(); pageIndex++) {
            for (ImageOperator op : ImageExtractor.extract(doc.getPage(pageIndex))) {
                RenderedImage image = regionExt.extract(pageIndex, op.x, op.y, op.w, op.h);
                String outFileName = baseName + "_" + count + ".png";
                ImageIO.write(image, "png", new File(outDir, outFileName));
                System.out.println(outFileName + " is saved.");
                count++;
            }
        }
    }
}
 
Example 7
Source File: FillInForm.java    From testarea-pdfbox2 with Apache License 2.0 6 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/52059931/pdfbox-setvalue-for-multiple-pdtextfield">
 * PDFBox setValue for multiple PDTextField
 * </a>
 * <br/>
 * <a href="https://ufile.io/z8jzj">
 * testform.pdf
 * </a>
 * <p>
 * Cannot reproduce the issue.
 * </p>
 * <p>
 * Sets the values of the text fields "Check1", "Check2" and "HelloWorld" and
 * saves the result to the result folder.
 * </p>
 */
@Test
public void testFillLikeJuvi() throws IOException {
    // The document is a resource of the try statement so it is closed even if filling or saving fails.
    try (   InputStream originalStream = getClass().getResourceAsStream("testform.pdf");
            PDDocument document = Loader.loadPDF(originalStream) ) {
        PDDocumentCatalog docCatalog = document.getDocumentCatalog();
        PDAcroForm acroForm = docCatalog.getAcroForm();

        PDTextField field = (PDTextField) acroForm.getField("Check1");
        field.setValue("1111");

        PDTextField field2 = (PDTextField) acroForm.getField("Check2");
        field2.setValue("2222");

        PDTextField field3 = (PDTextField) acroForm.getField("HelloWorld");
        field3.setValue("HelloWorld");

        document.save(new File(RESULT_FOLDER, "testform-filled.pdf"));
    }
}
 
Example 8
Source File: PdfUtils.java    From job with MIT License 5 votes vote down vote up
/**
 * Extracts the text of a PDF read from the given stream.
 * <p>
 * The text is collected in a character writer, so it never passes through the
 * platform default charset. The input stream is closed by this method, as is
 * the loaded document, even when loading or extraction fails.
 * </p>
 *
 * @param input stream providing the PDF; closed before this method returns
 * @return the text written by the text stripper
 * @throws Exception if the document cannot be loaded, read or closed
 */
public static String parsePdf2Text(InputStream input) throws Exception {
  // Resources are closed in reverse order: the document first, then the stream.
  try (InputStream in = input;
       PDDocument doc = PDDocument.load(in)) {
    java.io.StringWriter writer = new java.io.StringWriter();
    new PDFTextStripper().writeText(doc, writer);
    return writer.toString();
  }
}
 
Example 9
Source File: FillInForm.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/39720305/ufffd-is-not-available-in-this-fonts-encoding-winansiencoding">
 * U+FFFD is not available in this font's encoding: WinAnsiEncoding
 * </a>
 * <p>
 * The issue cannot be reproduced.
 * </p>
 * <p>
 * Sets the value of the field named "Title", marks it read-only, flattens the
 * form fields and saves the result to the result folder.
 * </p>
 */
@Test
public void testFillLikeStDdt() throws IOException
{
    // The document is a resource of the try statement so it is closed even if filling or saving fails.
    try (   InputStream originalStream = getClass().getResourceAsStream("FillFormField.pdf");
            PDDocument pdfDocument = Loader.loadPDF(originalStream) )
    {
        PDAcroForm acroForm = pdfDocument.getDocumentCatalog().getAcroForm();

        if (acroForm != null)
        {
            List<PDField> fields = acroForm.getFields();
            for (PDField field : fields) {
                // Constant-first equals tolerates a field without a partial name.
                if ("Title" /*"devices"*/.equals(field.getPartialName())) {
                    field.setValue("Ger�t");
                    field.setReadOnly(true);
                }
            }
            acroForm.flatten(fields, true);
        }

        pdfDocument.save(new File(RESULT_FOLDER, "FillFormFieldStDdt.pdf"));
    }
}
 
Example 10
Source File: PdfTools.java    From MyBox with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a PDF via {@code createPDF} and closes the returned document, if any.
 *
 * @param file   passed through to {@code createPDF}
 * @param author passed through to {@code createPDF}
 * @return {@code true} when no exception occurred (also when {@code createPDF}
 *         returned {@code null}), {@code false} otherwise
 */
public static boolean createPdfFile(File file, String author) {
    boolean succeeded;
    try {
        final PDDocument created = createPDF(file, author);
        if (created != null) {
            created.close();
        }
        succeeded = true;
    } catch (Exception failure) {
        logger.error(failure.toString());
        succeeded = false;
    }
    return succeeded;
}
 
Example 11
Source File: AddImage.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/50988007/clip-an-image-with-pdfbox">
 * Clip an image with PDFBOX
 * </a>
 * <p>
 * This test demonstrates how to clip an image and frame the clipping area.
 * </p>
 */
@SuppressWarnings("deprecation")
@Test
public void testImageAddClipped() throws IOException {
    // The document is a resource of the try statement so it is closed even if drawing or saving fails.
    try (   InputStream imageResource = getClass().getResourceAsStream("Willi-1.jpg");
            PDDocument doc = new PDDocument()   )
    {
        PDImageXObject pdImage = PDImageXObject.createFromByteArray(doc, ByteStreams.toByteArray(imageResource), "Willi");

        int w = pdImage.getWidth();
        int h = pdImage.getHeight();

        PDPage page = new PDPage();
        doc.addPage(page);
        PDRectangle cropBox = page.getCropBox();

        // The content stream is closed at the end of this inner block, before the document is saved.
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, page)) {
            contentStream.setStrokingColor(25, 200, 25);
            contentStream.setLineWidth(4);
            contentStream.moveTo(cropBox.getLowerLeftX(), cropBox.getLowerLeftY() + h/2);
            contentStream.lineTo(cropBox.getLowerLeftX() + w/3, cropBox.getLowerLeftY() + 2*h/3);
            contentStream.lineTo(cropBox.getLowerLeftX() + w, cropBox.getLowerLeftY() + h/2);
            contentStream.lineTo(cropBox.getLowerLeftX() + w/3, cropBox.getLowerLeftY() + h/3);
            contentStream.closePath();
            // The raw "W " operator is written instead of calling contentStream.clip(),
            // and the same path is then stroked to frame the clipping area.
            contentStream.appendRawCommands("W ");
            contentStream.stroke();

            contentStream.drawImage(pdImage, cropBox.getLowerLeftX(), cropBox.getLowerLeftY(), w, h);
        }

        doc.save(new File(RESULT_FOLDER, "image-clipped.pdf"));
    }
}
 
Example 12
Source File: FillInForm.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/56938135/pdfbox-inconsistent-pdtextfield-autosize-behavior-after-setvalue">
 * PDFBox Inconsistent PDTextField Autosize Behavior after setValue
 * </a>
 * <br/>
 * <a href="http://www.filedropper.com/0postfontload">
 * 0.pdf
 * </a>
 * <p>
 * By resetting the MultiLine flags, too, one eventually gets rid
 * of the problem of the lower part of the field value being cut
 * off in the Care Providers Address fields. This actually should
 * be considered an issue of PDFBox, though, not of the source PDF
 * here.
 * </p>
 * @see #testFill0LikeXenyal()
 * @see #testFill0DropOldAppearance()
 * @see #testFill0DropOldAppearanceNoCombNoMax()
 */
@Test
public void testFill0DropOldAppearanceNoCombNoMaxNoMultiLine() throws IOException {
    // Bit masks of the field flags (/Ff) cleared below.
    final int FLAG_MULTILINE = 1 << 12;
    final int FLAG_COMB = 1 << 24;

    // The document is a resource of the try statement so it is closed even if filling or saving fails.
    try (   InputStream originalStream = getClass().getResourceAsStream("0.pdf");
            InputStream fontStream = getClass().getResourceAsStream("Lato-Regular.ttf");
            PDDocument doc = Loader.loadPDF(originalStream))
    {
        PDAcroForm acroForm = doc.getDocumentCatalog().getAcroForm();

        PDType0Font font = PDType0Font.load(doc, fontStream, false);
        String fontName = acroForm.getDefaultResources().add(font).getName();

        for (PDField field : acroForm.getFieldTree()) {
            if (field instanceof PDTextField) {
                PDTextField textField = (PDTextField) field;
                // Drop the maximum length and the Comb/MultiLine flags, switch to the
                // loaded font at size 0, remove the old normal appearance, then set the value.
                textField.getCOSObject().removeItem(COSName.MAX_LEN);
                textField.getCOSObject().setFlag(COSName.FF, FLAG_COMB | FLAG_MULTILINE, false);
                textField.setDefaultAppearance(String.format("/%s 0 Tf 0 g", fontName));
                textField.getWidgets().forEach(w -> w.getAppearance().setNormalAppearance((PDAppearanceEntry)null));
                textField.setValue("Test");
            }
        }

        doc.save(new File(RESULT_FOLDER, "0-filledDropOldAppearanceNoCombNoMaxNoMultiLine.pdf"));
    }
}
 
Example 13
Source File: TestPdfFontExtractor.java    From FontVerter with GNU Lesser General Public License v3.0 5 votes vote down vote up
/**
 * Extracts the fonts of {@code pdf/brno30.pdf} into the temporary folder and
 * expects exactly three files there, each with the extension {@code ttf}.
 */
@Test
public void givenPdfWith2Fonts_extractFontsToDir_thenDirectoryHasThreeTtfFiles() throws IOException {
    // try-with-resources closes the document even when an assertion fails.
    try (PDDocument doc = PDDocument.load(TestUtils.readTestFile("pdf/brno30.pdf"))) {
        PdfFontExtractor extractor = new PdfFontExtractor();

        File extractDir = folder.getRoot();
        extractor.extractFontsToDir(doc, extractDir);
        File[] fontFiles = extractDir.listFiles();

        // listFiles() returns null when the directory cannot be listed.
        Assert.assertNotNull(fontFiles);
        Assert.assertEquals(3, fontFiles.length);
        for (File fileOn : fontFiles) {
            Assert.assertEquals("ttf", FilenameUtils.getExtension(fileOn.getPath()));
        }
    }
}
 
Example 14
Source File: AddImage.java    From testarea-pdfbox2 with Apache License 2.0 5 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/49958604/draw-image-at-mid-position-using-pdfbox-java">
 * Draw image at mid position using pdfbox Java
 * </a>
 * <p>
 * This is a fixed version of the OP's original code, cf.
 * {@link #testImageAppendLikeShanky()}. It does not mirror the image.
 * </p>
 * <p>
 * The image is drawn at its own pixel size, centered in the crop box of the
 * first page.
 * </p>
 */
@Test
public void testImageAppendNoMirror() throws IOException {
    // The document is a resource of the try statement so it is closed even if drawing or saving fails.
    try (   InputStream resource = getClass().getResourceAsStream("/mkl/testarea/pdfbox2/sign/test.pdf");
            InputStream imageResource = getClass().getResourceAsStream("Willi-1.jpg");
            PDDocument doc = Loader.loadPDF(resource)   )
    {
        PDImageXObject pdImage = PDImageXObject.createFromByteArray(doc, ByteStreams.toByteArray(imageResource), "Willi");

        int w = pdImage.getWidth();
        int h = pdImage.getHeight();

        PDPage page = doc.getPage(0);

        float x_pos = page.getCropBox().getWidth();
        float y_pos = page.getCropBox().getHeight();

        // Center the image in the crop box, taking the crop box origin into account.
        float x_adjusted = ( x_pos - w ) / 2 + page.getCropBox().getLowerLeftX();
        float y_adjusted = ( y_pos - h ) / 2 + page.getCropBox().getLowerLeftY();

        // The content stream is closed at the end of this inner block, before the document is saved.
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, page, PDPageContentStream.AppendMode.APPEND, true)) {
            contentStream.drawImage(pdImage, x_adjusted, y_adjusted, w, h);
        }

        doc.save(new File(RESULT_FOLDER, "test-with-image-no-mirror.pdf"));
    }
}
 
Example 15
Source File: PdfExtractionResource.java    From quarkus-pdf-extract with Apache License 2.0 5 votes vote down vote up
/**
 * Loads a PDF file and returns the text produced by a
 * {@code PDFLayoutTextStripper} for it. The document is closed before returning.
 *
 * @param pdfFile the PDF file to read
 * @return the text extracted from the document
 * @throws IOException if the document cannot be loaded, read or closed
 */
static String getText(File pdfFile) throws IOException {
    // If loading throws, there is no document to close yet.
    final PDDocument document = PDDocument.load(pdfFile);
    try {
        final PDFLayoutTextStripper stripper = new PDFLayoutTextStripper();
        return stripper.getText(document);
    } finally {
        document.close();
    }
}
 
Example 16
Source File: ReadXfaForm.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the XFA resource of a PDF form and returns its bytes.
 * <p>
 * The loaded document is closed on every exit path, including when reading
 * the XFA bytes fails.
 * </p>
 *
 * @param file stream providing the PDF; may be {@code null}
 * @return the XFA bytes, or {@code null} if the stream is {@code null}, the
 *         document has no catalog, no AcroForm or no XFA resource, or an
 *         {@link IOException} occurred
 */
public static byte[] getParsableXFAForm(InputStream file)
{
    if (file == null)
        return null;

    // try-with-resources replaces the repeated doc.close() calls and also
    // closes the document when an IOException is thrown after loading.
    try (PDDocument doc = Loader.loadPDF(file))
    {
        if (doc == null)
            return null;
        doc.setAllSecurityToBeRemoved(true);

        PDDocumentCatalog catalog = doc.getDocumentCatalog();
        if (catalog == null)
            return null;

        PDAcroForm acroForm = catalog.getAcroForm();
        if (acroForm == null)
            return null;

        PDXFAResource xfa = acroForm.getXFA();
        if (xfa == null)
            return null;

        return xfa.getBytes();
    } catch (IOException e)
    {
        // handle IOException
        // happens when the file is corrupt.
        e.printStackTrace();
        System.out.println("XFAUtils-getParsableXFAForm-IOException");
        return null;
    }
}
 
Example 17
Source File: RectanglesOverText.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
 * <a href="https://stackoverflow.com/questions/46080131/text-coordinates-when-stripping-from-pdfbox">
 * Text coordinates when stripping from PDFBox
 * </a>
 * <br/>
 * <a href="https://download-a.akamaihd.net/files/media_mwb/b7/mwb_I_201711.pdf">
 * mwb_I_201711.pdf
 * </a>
 * <p>
 * This test applies the OP's code to his example PDF file and indeed, there is an offset!
 * This is due to the <code>LegacyPDFStreamEngine</code> method <code>showGlyph</code>
 * which manipulates the text rendering matrix to make the lower left corner of the
 * crop box the origin. In the current version of this test, that offset is corrected,
 * see below.
 * </p>
 */
@Test
public void testCoverTextByRectanglesMwbI201711() throws IOException {
    // The document is a resource of the try statement so it is closed even if stripping, drawing or saving fails.
    try (   InputStream resource = getClass().getResourceAsStream("mwb_I_201711.pdf");
            PDDocument doc = Loader.loadPDF(resource)  ) {
        myStripper stripper = new myStripper();

        stripper.setStartPage(1); // fix it to first page just to test it
        stripper.setEndPage(1);
        stripper.getText(doc);

        TextLine line = stripper.lines.get(1); // the line i want to paint on

        // -1 marks "no position seen yet" for both extremes.
        float minx = -1;
        float maxx = -1;

        for (TextPosition pos: line.textPositions)
        {
            if (pos == null)
                continue;

            float translateX = pos.getTextMatrix().getTranslateX();
            if (minx == -1 || translateX < minx) {
                minx = translateX;
            }
            if (maxx == -1 || translateX > maxx) {
                maxx = translateX;
            }
        }

        TextPosition firstPosition = line.textPositions.get(0);
        TextPosition lastPosition = line.textPositions.get(line.textPositions.size() - 1);

        // corrected x and y
        PDRectangle cropBox = doc.getPage(0).getCropBox();

        float x = minx + cropBox.getLowerLeftX();
        float y = firstPosition.getTextMatrix().getTranslateY() + cropBox.getLowerLeftY();
        float w = (maxx - minx) + lastPosition.getWidth();
        float h = lastPosition.getHeightDir();

        // The content stream is closed at the end of this inner block, before the document is saved.
        try (PDPageContentStream contentStream = new PDPageContentStream(doc, doc.getPage(0), PDPageContentStream.AppendMode.APPEND, false, true)) {
            contentStream.setNonStrokingColor(Color.RED);
            contentStream.addRect(x, y, w, h);
            contentStream.fill();
        }

        File fileout = new File(RESULT_FOLDER, "mwb_I_201711-withRectangles.pdf");
        doc.save(fileout);
    }
}
 
Example 18
Source File: TestEmptySignatureField.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/37601092/pdfbox-identify-specific-pages-and-functionalities-recommendations">
 * PDFBox identify specific pages and functionalities recommendations
 * </a>
 *
 * <p>
 * This test shows how to add an empty signature field with a custom appearance
 * to an existing PDF.
 * </p>
 */
@Test
public void testAddEmptySignatureField() throws IOException
{
    // The document is a resource of the try statement so it is closed even if building or saving fails.
    try (   InputStream sourceStream = getClass().getResourceAsStream("test.pdf");
            OutputStream output = new FileOutputStream(new File(RESULT_FOLDER, "test-with-empty-sig-field.pdf"));
            PDDocument document = Loader.loadPDF(sourceStream))
    {
        PDFont font = PDType1Font.HELVETICA;
        PDResources resources = new PDResources();
        resources.put(COSName.getPDFName("Helv"), font);

        // Install a fresh AcroForm with the font resources on the document catalog.
        PDAcroForm acroForm = new PDAcroForm(document);
        acroForm.setDefaultResources(resources);
        document.getDocumentCatalog().setAcroForm(acroForm);

        PDRectangle rect = new PDRectangle(50, 750, 200, 50);

        PDAppearanceDictionary appearanceDictionary = new PDAppearanceDictionary();
        PDAppearanceStream appearanceStream = new PDAppearanceStream(document);
        appearanceStream.setBBox(rect.createRetranslatedRectangle());
        appearanceStream.setResources(resources);
        appearanceDictionary.setNormalAppearance(appearanceStream);

        // Draw the custom appearance: a filled box, a cross, a line and the text "Customer".
        // The content stream is closed at the end of this inner block, before the document is saved.
        try (PDPageContentStream contentStream = new PDPageContentStream(document, appearanceStream)) {
            contentStream.setStrokingColor(Color.BLACK);
            contentStream.setNonStrokingColor(Color.LIGHT_GRAY);
            contentStream.setLineWidth(2);
            contentStream.addRect(0, 0, rect.getWidth(), rect.getHeight());
            contentStream.fill();
            contentStream.moveTo(1 * rect.getHeight() / 4, 1 * rect.getHeight() / 4);
            contentStream.lineTo(2 * rect.getHeight() / 4, 3 * rect.getHeight() / 4);
            contentStream.moveTo(1 * rect.getHeight() / 4, 3 * rect.getHeight() / 4);
            contentStream.lineTo(2 * rect.getHeight() / 4, 1 * rect.getHeight() / 4);
            contentStream.moveTo(3 * rect.getHeight() / 4, 1 * rect.getHeight() / 4);
            contentStream.lineTo(rect.getWidth() - rect.getHeight() / 4, 1 * rect.getHeight() / 4);
            contentStream.stroke();
            contentStream.setNonStrokingColor(Color.DARK_GRAY);
            contentStream.beginText();
            contentStream.setFont(font, rect.getHeight() / 5);
            contentStream.newLineAtOffset(3 * rect.getHeight() / 4, -font.getBoundingBox().getLowerLeftY() * rect.getHeight() / 5000);
            contentStream.showText("Customer");
            contentStream.endText();
        }

        PDSignatureField signatureField = new PDSignatureField(acroForm);
        signatureField.setPartialName("SignatureField");
        PDPage page = document.getPage(0);

        // Attach the appearance to the field's widget and place the widget on the first page.
        PDAnnotationWidget widget = signatureField.getWidgets().get(0);
        widget.setAppearance(appearanceDictionary);
        widget.setRectangle(rect);
        widget.setPage(page);

        page.getAnnotations().add(widget);
        acroForm.getFields().add(signatureField);

        document.save(output);
    }
}
 
Example 19
Source File: ExtractText.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts the text inside a rectangular region of one page of a PDF file.
 * <p>
 * If {@code pdfLocation} does not end with ".pdf", or loading or extraction
 * fails with an {@link IOException}, a message is printed and an empty string
 * is returned.
 * </p>
 *
 * @param pdfLocation path of the PDF file
 * @param pageNumber  page selector; the page at index {@code pageNumber + 1} is read
 * @param x           x coordinate of the region
 * @param y           y coordinate of the region
 * @param width       width of the region
 * @param height      height of the region
 * @return the text of the region, or an empty string if nothing was extracted
 * @throws IOException if closing the document fails
 * @see #testUiPathTutorial()
 * @author Venkatachalam Neelakantan
 */
public String getTextUsingPositionsUsingPdf(String pdfLocation, int pageNumber, double x, double y, double width,
        double height) throws IOException {
    String extractedText = "";
    // PDDocument Creates an empty PDF document. You need to add at least
    // one page for the document to be valid.
    // Using load method we can load a PDF document
    PDDocument document = null;
    PDPage page = null;
    try {
        if (pdfLocation.endsWith(".pdf")) {
            document = Loader.loadPDF(new File(pdfLocation));
            int getDocumentPageCount = document.getNumberOfPages();
            System.out.println(getDocumentPageCount);

            // Get specific page. The parameter is the page index, which starts with 0.
            // NOTE(review): pageNumber + 1 looks off for a zero-based index, and
            // getPage(0) on a document without pages cannot succeed - confirm
            // against the caller before changing either.
            if (getDocumentPageCount > 0) {
                page = document.getPage(pageNumber + 1);
            } else if (getDocumentPageCount == 0) {
                page = document.getPage(0);
            }
            // To create a rectangle by passing the x axis, y axis, width and height
            Rectangle2D rect = new Rectangle2D.Double(x, y, width, height);
            String regionName = "region1";

            // Strip the text from PDF using PDFTextStripper Area with the
            // help of Rectangle and named need to given for the rectangle
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            stripper.addRegion(regionName, rect);
            stripper.extractRegions(page);
            System.out.println("Region is " + stripper.getTextForRegion("region1"));
            extractedText = stripper.getTextForRegion("region1");
        } else {
            System.out.println("No data return");
        }
    } catch (IOException e) {
        System.out.println("The file  not found" + "");
    } finally {
        // The document is still null when the path was rejected or loading failed;
        // closing unconditionally would throw a NullPointerException here.
        if (document != null) {
            document.close();
        }
    }
    // Return the extracted text and this can be used for assertion
    return extractedText;
}
 
Example 20
Source File: PdfBoxUtilities.java    From tess4j with Apache License 2.0 4 votes vote down vote up
/**
 * Converts PDF to PNG format.
 * <p>
 * Each page is rendered at 300 DPI into a new temporary directory as
 * {@code workingimageNNNN.png}. If nothing was written, the directory is
 * deleted and an empty array is returned.
 * </p>
 *
 * @param inputPdfFile input file
 * @return an array of PNG images sorted by file name; empty if none were produced
 * @throws java.io.IOException if the temporary directory cannot be created
 */
public static File[] convertPdf2Png(File inputPdfFile) throws IOException {
    Path path = Files.createTempDirectory("tessimages");
    File imageDir = path.toFile();

    PDDocument document = null;
    try {
        document = PDDocument.load(inputPdfFile);
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        for (int page = 0; page < document.getNumberOfPages(); ++page) {
            BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);

            // suffix in filename will be used as the file format
            String filename = String.format("workingimage%04d.png", page + 1);
            ImageIOUtil.writeImage(bim, new File(imageDir, filename).getAbsolutePath(), 300);
        }
    } catch (IOException ioe) {
        logger.error("Error extracting PDF Document => " + ioe);
    } finally {
        // File.list() returns null when the directory cannot be listed.
        String[] produced = imageDir.list();
        if (produced == null || produced.length == 0) {
            imageDir.delete();
        }

        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                // Closing is best effort and must not change the result,
                // but the failure is reported instead of being dropped silently.
                logger.error("Error closing PDF Document => " + e);
            }
        }
    }

    // find working files
    File[] workingFiles = imageDir.listFiles(new FilenameFilter() {

        @Override
        public boolean accept(File dir, String name) {
            return name.toLowerCase().matches("workingimage\\d{4}\\.png$");
        }
    });

    // listFiles() returns null once the empty directory has been deleted above;
    // sorting null would throw a NullPointerException.
    if (workingFiles == null) {
        return new File[0];
    }

    Arrays.sort(workingFiles, new Comparator<File>() {
        @Override
        public int compare(File f1, File f2) {
            return f1.getName().compareTo(f2.getName());
        }
    });

    return workingFiles;
}