com.itextpdf.text.pdf.parser.PdfReaderContentParser Java Examples

The following examples show how to use com.itextpdf.text.pdf.parser.PdfReaderContentParser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PDF2WordExample.java    From tutorials with MIT License 6 votes vote down vote up
/**
 * Copies the plain text of every page of a PDF into a Word document.
 * <p>
 * Each PDF page becomes one paragraph holding the page text, followed by a
 * page break. The result is always written to {@code src/output/pdf.docx}.
 * </p>
 *
 * @param filename name of the PDF, handed unchanged to the {@code PdfReader} constructor
 * @throws IOException if reading the PDF or writing the Word file fails
 */
private static void generateDocFromPDF(String filename) throws IOException {
	// try-with-resources / finally make sure nothing stays open when extraction or writing fails.
	try (XWPFDocument doc = new XWPFDocument()) {
		PdfReader reader = new PdfReader(filename);
		try {
			PdfReaderContentParser parser = new PdfReaderContentParser(reader);

			for (int i = 1; i <= reader.getNumberOfPages(); i++) {
				TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
				String text = strategy.getResultantText();
				XWPFParagraph p = doc.createParagraph();
				XWPFRun run = p.createRun();
				run.setText(text);
				run.addBreak(BreakType.PAGE);
			}
		} finally {
			// All text has been copied into the document, so the reader is no longer needed.
			reader.close();
		}

		try (FileOutputStream out = new FileOutputStream("src/output/pdf.docx")) {
			doc.write(out);
		}
	}
}
 
Example #2
Source File: DividerAwareTextExtraction.java    From testarea-itext5 with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Extracts the text sections of a page range, stores every section in its own
 * file and returns all sections concatenated.
 *
 * @param reader the PDF to extract from
 * @param format a {@link String#format(String, Object...)} pattern receiving the page
 *               number and the section index; the result is used as the output file path
 * @param from   first page to process
 * @param to     last page to process (inclusive)
 * @return every section prefixed by a {@code --} line, with two line breaks after each page
 * @throws IOException if parsing a page or writing a section file fails
 */
String extractAndStore(PdfReader reader, String format, int from, int to) throws IOException
{
    StringBuilder collected = new StringBuilder();

    for (int page = from; page <= to; page++)
    {
        PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
        DividerAwareTextExtrationStrategy strategy = contentParser.processContent(page, new DividerAwareTextExtrationStrategy(810, 30, 20, 575));

        List<Section> sections = strategy.getSections();
        for (int index = 0; index < sections.size(); index++)
        {
            String sectionText = strategy.getResultantText(sections.get(index));
            String sectionFile = String.format(format, page, index);
            Files.write(Paths.get(sectionFile), sectionText.getBytes("UTF8"));

            collected.append("--\n");
            collected.append(sectionText);
            collected.append('\n');
        }
        collected.append("\n\n");
    }

    return collected.toString();
}
 
Example #3
Source File: PdfDenseMergeTool.java    From testarea-itext5 with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Places the text area of one source page below the content already written
 * to the current target page, starting a new target page when it does not fit.
 *
 * @param reader the reader providing the page
 * @param parser the content parser used to determine the text margins of the page
 * @param page   the number of the page to import
 * @throws IllegalArgumentException if the text area is taller than the space between
 *                                  the top and bottom margins of a target page
 * @throws IOException if parsing the page content fails
 */
void merge(PdfReader reader, PdfReaderContentParser parser, int page) throws IOException
{
    TextMarginFinder margins = parser.processContent(page, new TextMarginFinder());
    Rectangle sourcePageSize = reader.getPageSize(page);
    float requiredHeight = margins.getHeight();
    float availableHeight = pageSize.getHeight() - topMargin - bottomMargin;
    if (requiredHeight > availableHeight)
    {
        throw new IllegalArgumentException(String.format("Page %s content too large; height: %s, limit: %s.", page, requiredHeight, availableHeight));
    }

    boolean exceedsRemainingSpace = requiredHeight > yPosition - pageSize.getBottom(bottomMargin);
    if (exceedsRemainingSpace)
    {
        newPage();
    }
    else if (!writer.isPageEmpty())
    {
        // Keep a gap between this content and what is already on the page.
        requiredHeight += gap;
    }
    yPosition -= requiredHeight;

    PdfImportedPage template = writer.getImportedPage(reader, page);
    writer.getDirectContent().addTemplate(template, 0, yPosition - (margins.getLly() - sourcePageSize.getBottom()));
}
 
Example #4
Source File: DividerAndColorAwareTextExtraction.java    From testarea-itext5 with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Extracts the text sections of a page range with a divider and color aware
 * strategy, stores every section in its own file and returns all sections
 * concatenated.
 *
 * @param reader      the PDF to extract from
 * @param format      a {@link String#format(String, Object...)} pattern receiving the page
 *                    number and the section index; the result is used as the output file path
 * @param from        first page to process
 * @param to          last page to process (inclusive)
 * @param headerColor color handed to the extraction strategy
 * @return every section prefixed by a {@code --} line, with two line breaks after each page
 * @throws IOException if parsing a page or writing a section file fails
 */
String extractAndStore(PdfReader reader, String format, int from, int to, BaseColor headerColor) throws IOException {
    StringBuilder result = new StringBuilder();

    int page = from;
    while (page <= to) {
        PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
        DividerAwareTextExtrationStrategy strategy = contentParser.processContent(page, new DividerAndColorAwareTextExtractionStrategy(810, 30, 20, 575, headerColor));

        int sectionIndex = 0;
        for (Section section : strategy.getSections()) {
            String sectionText = strategy.getResultantText(section);
            String sectionFile = String.format(format, page, sectionIndex);
            Files.write(Paths.get(sectionFile), sectionText.getBytes("UTF8"));

            result.append("--\n");
            result.append(sectionText);
            result.append('\n');
            sectionIndex++;
        }
        result.append("\n\n");
        page++;
    }

    return result.toString();
}
 
Example #5
Source File: ExtractDrawnCheckboxes.java    From testarea-itext5 with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/40549977/reading-legacy-word-forms-checkboxes-converted-to-pdf">
 * Reading legacy Word forms checkboxes converted to PDF
 * </a>
 * <br>
 * <a href="https://www.dropbox.com/s/4z7ky3yy2yaj53i/Doc1.pdf?dl=0">
 * Doc1.pdf
 * </a>
 * <p>
 * Runs the {@code CheckBoxExtractionStrategy} over every page of the OP's
 * sample PDF and prints the position and state of each drawn "checkbox" found.
 * </p>
 */
@Test
public void testExtractDoc1() throws IOException
{
    try (InputStream pdfStream = getClass().getResourceAsStream("Doc1.pdf"))
    {
        PdfReader pdfReader = new PdfReader(pdfStream);

        int page = 1;
        while (page <= pdfReader.getNumberOfPages())
        {
            System.out.printf("\nPage %s\n====\n", page);

            CheckBoxExtractionStrategy strategy = new CheckBoxExtractionStrategy();
            new PdfReaderContentParser(pdfReader).processContent(page, strategy);

            for (Box box : strategy.getBoxes())
            {
                Vector basePoint = box.getDiagonal().getStartPoint();
                String state;
                if (box.isChecked())
                {
                    state = "checked";
                }
                else
                {
                    state = "unchecked";
                }
                System.out.printf("at %s, %s - %s\n", basePoint.get(Vector.I1), basePoint.get(Vector.I2), state);
            }
            page++;
        }
    }
}
 
Example #6
Source File: ExtractCertifiedSchoolList.java    From testarea-itext5 with GNU Affero General Public License v3.0 6 votes vote down vote up
@Test
public void testCertifiedSchoolList_9_16_2015() throws IOException
{
    // Feeds every page of the sample PDF to one shared extraction strategy,
    // which writes to the two UTF-8 text files in RESULT_FOLDER.
    File dataFile = new File(RESULT_FOLDER, "data.txt");
    File nonDataFile = new File(RESULT_FOLDER, "non-data.txt");

    try (Writer data = new OutputStreamWriter(new FileOutputStream(dataFile), "UTF-8");
         Writer nonData = new OutputStreamWriter(new FileOutputStream(nonDataFile), "UTF-8");
         InputStream resource = getClass().getResourceAsStream("certified-school-list-9-16-2015.pdf"))
    {
        CertifiedSchoolListExtractionStrategy strategy = new CertifiedSchoolListExtractionStrategy(data, nonData);
        PdfReader reader = new PdfReader(resource);
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);

        // To debug a single page, process only that page (e.g. 28) instead of looping.
        int page = 1;
        while (page <= reader.getNumberOfPages())
        {
            parser.processContent(page, strategy);
            page++;
        }
        strategy.close();
    }
}
 
Example #7
Source File: PdfVeryDenseMergeTool.java    From testarea-itext5 with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Merges every page of the given reader, in page order, using one shared
 * content parser.
 *
 * @param reader the PDF whose pages are merged
 * @throws IOException if merging a page fails
 */
void merge(PdfReader reader) throws IOException
{
    PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
    int pageNumber = 1;
    while (pageNumber <= reader.getNumberOfPages())
    {
        merge(reader, contentParser, pageNumber);
        pageNumber++;
    }
}
 
Example #8
Source File: FindFreeSpace.java    From testarea-itext5 with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Collects the free areas of a page using a {@code FreeSpaceFinderExt}
 * restricted to the crop box of the page.
 *
 * @param reader    the PDF to inspect
 * @param minWidth  minimum width handed to the finder
 * @param minHeight minimum height handed to the finder
 * @param page      the number of the page to inspect
 * @return the {@code freeSpaces} collection of the finder after processing the page
 * @throws IOException if parsing the page content fails
 */
public Collection<Rectangle2D> findExt(PdfReader reader, float minWidth, float minHeight, int page) throws IOException
{
    Rectangle cropBox = reader.getCropBox(page);
    float x = cropBox.getLeft();
    float y = cropBox.getBottom();
    float width = cropBox.getWidth();
    float height = cropBox.getHeight();

    FreeSpaceFinder finder = new FreeSpaceFinderExt(new Rectangle2D.Float(x, y, width, height), minWidth, minHeight);
    new PdfReaderContentParser(reader).processContent(page, finder);
    return finder.freeSpaces;
}
 
Example #9
Source File: FindFreeSpace.java    From testarea-itext5 with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Collects the free areas of a page using a {@code FreeSpaceFinder}
 * restricted to the crop box of the page.
 *
 * @param reader    the PDF to inspect
 * @param minWidth  minimum width handed to the finder
 * @param minHeight minimum height handed to the finder
 * @param page      the number of the page to inspect
 * @return the {@code freeSpaces} collection of the finder after processing the page
 * @throws IOException if parsing the page content fails
 */
public Collection<Rectangle2D> find(PdfReader reader, float minWidth, float minHeight, int page) throws IOException
{
    Rectangle pageCropBox = reader.getCropBox(page);
    Rectangle2D searchArea = new Rectangle2D.Float(
            pageCropBox.getLeft(),
            pageCropBox.getBottom(),
            pageCropBox.getWidth(),
            pageCropBox.getHeight());

    FreeSpaceFinder spaceFinder = new FreeSpaceFinder(searchArea, minWidth, minHeight);
    PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
    contentParser.processContent(page, spaceFinder);

    return spaceFinder.freeSpaces;
}
 
Example #10
Source File: TextLocationExtraction.java    From testarea-itext5 with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Searches every page of a PDF with a {@code SearchTextLocationExtractionStrategy}
 * and outlines each reported location with a yellow rectangle in the stamped copy.
 *
 * @param input   the PDF to search
 * @param output  receives the stamped PDF
 * @param pattern the pattern handed to the search strategy
 * @throws DocumentException if stamping fails
 * @throws IOException if reading, parsing or writing fails
 */
void mark(InputStream input, OutputStream output, Pattern pattern) throws DocumentException, IOException
{
    PdfReader reader = new PdfReader(input);
    PdfStamper stamper = new PdfStamper(reader, output);
    try
    {
        PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
        for (int pageNr = 1; pageNr <= reader.getNumberOfPages(); pageNr++)
        {
            SearchTextLocationExtractionStrategy strategy = new SearchTextLocationExtractionStrategy(pattern);
            // The extracted text itself is discarded; only the strategy's locations are used below.
            contentParser.processContent(pageNr, strategy, Collections.emptyMap()).getResultantText();
            Collection<TextRectangle> locations = strategy.getLocations(null);

            if (!locations.isEmpty())
            {
                PdfContentByte canvas = stamper.getOverContent(pageNr);
                canvas.setRGBColorStroke(255, 255, 0);
                for (TextRectangle hit : locations)
                {
                    canvas.rectangle(hit.getMinX(), hit.getMinY(), hit.getWidth(), hit.getHeight());
                }
                canvas.stroke();
            }
        }
        stamper.close();
    }
    finally
    {
        reader.close();
    }
}
 
Example #11
Source File: PageVerticalAnalysis.java    From testarea-itext5 with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Runs a {@code PageVerticalAnalyzer} over every page of a PDF and reports its
 * {@code verticalFlips} values pairwise, both on standard output and in a file.
 *
 * @param pdf    the PDF to analyze
 * @param target the file receiving the report
 * @throws IOException if reading the PDF, parsing a page or writing the report fails
 */
void analyzeVertically(InputStream pdf, File target) throws IOException
{
    final PdfReader reader = new PdfReader(pdf);

    try
    {
        PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
        StringBuilder report = new StringBuilder();
        int page = 1;
        while (page <= reader.getNumberOfPages())
        {
            PageVerticalAnalyzer analyzer = contentParser.processContent(page, new PageVerticalAnalyzer());
            report.append("Page ");
            report.append(page);
            report.append('\n');

            if (analyzer.verticalFlips.size() == 0)
            {
                report.append("No content\n\n");
            }
            else
            {
                // Flips are reported two at a time; an unpaired trailing value is skipped.
                int pairStart = 0;
                while (pairStart < analyzer.verticalFlips.size() - 1)
                {
                    report.append(String.format("%3.3f - %3.3f\n", analyzer.verticalFlips.get(pairStart), analyzer.verticalFlips.get(pairStart + 1)));
                    pairStart += 2;
                }
                report.append('\n');
            }
            page++;
        }

        String sections = report.toString();
        System.out.print(sections);
        Files.write(target.toPath(), sections.getBytes());
    }
    finally
    {
        reader.close();
    }
}
 
Example #12
Source File: PdfDenseMergeTool.java    From testarea-itext5 with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Feeds all pages of the given reader, first to last, to the page-wise merge
 * while reusing a single content parser.
 *
 * @param reader the PDF whose pages are merged
 * @throws IOException if merging a page fails
 */
void merge(PdfReader reader) throws IOException
{
    final PdfReaderContentParser sharedParser = new PdfReaderContentParser(reader);
    int current = 0;
    while (++current <= reader.getNumberOfPages())
    {
        merge(reader, sharedParser, current);
    }
}
 
Example #13
Source File: OfficeUtils.java    From dk-fitting with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the plain text of all pages of a PDF.
 *
 * @param filePath name of the PDF, handed unchanged to the {@code PdfReader} constructor
 * @return the text of all pages concatenated in page order, without separators
 * @throws Exception if the PDF cannot be opened or parsed
 */
public static String itextPdf2Txt(String filePath) throws Exception {
    PdfReader reader = new PdfReader(filePath);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        // Only used locally, so the unsynchronized StringBuilder is sufficient.
        StringBuilder text = new StringBuilder();
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
            text.append(strategy.getResultantText());
        }
        return text.toString();
    } finally {
        // Release the reader on success and on failure.
        reader.close();
    }
}
 
Example #14
Source File: ExtractSuperAndSubInLine.java    From testarea-itext5 with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Draws a horizontal line across the page at every vertical flip position that a
 * {@code TextLineFinder} reports, and stores the stamped copy in {@code RESULT_FOLDER}.
 * <p>
 * The flip positions are also printed to standard output. The output file is
 * named {@code <resource file name>-lines-<startPage>-<endPage>.pdf}.
 * </p>
 *
 * @param resource  name of the class path resource holding the PDF
 * @param startPage first page to process
 * @param endPage   page at which processing stops; this page itself is not processed
 * @throws IOException if reading the resource, parsing a page or writing the result fails
 * @throws DocumentException if stamping fails
 */
void markLineBoundaries(String resource, int startPage, int endPage) throws IOException, DocumentException
{
    String name = new File(resource).getName();
    String target = String.format("%s-lines-%s-%s.pdf", name, startPage, endPage);
    try (InputStream resourceStream = getClass().getResourceAsStream(resource))
    {
        PdfReader reader = new PdfReader(resourceStream);
        try
        {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            System.out.printf("\nLine boundaries in %s\n", name);

            // The output stream is closed here even when processing fails before stamper.close().
            try (FileOutputStream targetStream = new FileOutputStream(new File(RESULT_FOLDER, target)))
            {
                PdfStamper stamper = new PdfStamper(reader, targetStream);

                for (int page = startPage; page < endPage; page++)
                {
                    System.out.printf("\n   Page %s\n   ", page);

                    TextLineFinder finder = new TextLineFinder();
                    parser.processContent(page, finder);

                    PdfContentByte over = stamper.getOverContent(page);
                    Rectangle mediaBox = reader.getPageSize(page);

                    for (float flip: finder.verticalFlips)
                    {
                        System.out.printf(" %s", flip);
                        over.moveTo(mediaBox.getLeft(), flip);
                        over.lineTo(mediaBox.getRight(), flip);
                    }

                    System.out.println();
                    over.stroke();
                }

                stamper.close();
            }
        }
        finally
        {
            // The original never closed the reader.
            reader.close();
        }
    }
}
 
Example #15
Source File: PdfVeryDenseMergeTool.java    From testarea-itext5 with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Imports the content of one source page onto the target pages, span by span.
 * <p>
 * The page is analyzed with a {@code PageVerticalAnalyzer} and its
 * {@code verticalFlips} values are walked from the last index towards index 0.
 * In each pass, as large a span of flip entries as fits into the space above the
 * bottom margin is clipped out of the imported page and drawn at the current
 * {@code yPosition}. Every pass after the first starts with {@code newPage()}.
 * </p>
 * <p>
 * NOTE(review): presumably {@code verticalFlips} holds ascending y coordinates in
 * start/end pairs, one pair per content section - confirm against PageVerticalAnalyzer.
 * </p>
 *
 * @param reader the reader providing the page
 * @param parser the content parser used to analyze the page
 * @param page   the number of the page to import
 * @throws IllegalArgumentException if a pass after the first one (i.e. one that began
 *                                  with {@code newPage()}) cannot place any span
 * @throws IOException declared by the content parsing call
 */
void merge(PdfReader reader, PdfReaderContentParser parser, int page) throws IOException
{
    PdfImportedPage importedPage = writer.getImportedPage(reader, page);
    PdfContentByte directContent = writer.getDirectContent();
    
    PageVerticalAnalyzer finder = parser.processContent(page, new PageVerticalAnalyzer());
    // With fewer than two flip values nothing is imported.
    if (finder.verticalFlips.size() < 2)
        return;
    Rectangle pageSizeToImport = reader.getPageSize(page);

    // Start at the last flip value and work towards index 0.
    int startFlip = finder.verticalFlips.size() - 1;
    boolean first = true;
    while (startFlip > 0)
    {
        // Every pass after the first begins on a fresh target page.
        if (!first)
            newPage();

        // Vertical room between the current position and the bottom margin.
        float freeSpace = yPosition - pageSize.getBottom(bottomMargin);
        // Step endFlip down two entries at a time while the span from startFlip
        // to the next candidate entry is still smaller than freeSpace.
        int endFlip = startFlip + 1;
        while ((endFlip > 1) && (finder.verticalFlips.get(startFlip) - finder.verticalFlips.get(endFlip - 2) < freeSpace))
            endFlip -=2;
        // endFlip below startFlip means at least one step was taken, i.e. something fits.
        if (endFlip < startFlip)
        {
            float height = finder.verticalFlips.get(startFlip) - finder.verticalFlips.get(endFlip);

            // Restrict drawing to a strip of that height directly below yPosition.
            directContent.saveState();
            directContent.rectangle(0, yPosition - height, pageSizeToImport.getWidth(), height);
            directContent.clip();
            directContent.newPath();

            // Translate the imported page vertically by yPosition minus the distance of the
            // startFlip value from the bottom of the source page.
            writer.getDirectContent().addTemplate(importedPage, 0, yPosition - (finder.verticalFlips.get(startFlip) - pageSizeToImport.getBottom()));

            directContent.restoreState();
            // Move below the drawn strip, leaving gap before whatever comes next.
            yPosition -= height + gap;
            // Continue with the entry just below the span that was placed.
            startFlip = endFlip - 1;
        }
        // Nothing fit although this pass already started on a new page.
        else if (!first) 
            throw new IllegalArgumentException(String.format("Page %s content sections too large.", page));
        first = false;
    }
}
 
Example #16
Source File: ImportPageWithoutFreeSpace.java    From testarea-itext5 with GNU Affero General Public License v3.0 3 votes vote down vote up
/**
 * <p>
 * Shrinks the media box of every page in the given {@link PdfReader} to the
 * content bounds reported by the {@link MarginFinder} extended render listener.
 * </p>
 * <p>
 * The logic is essentially taken from the {@link TestTrimPdfPage} methods
 * {@link TestTrimPdfPage#testWithStamperExtFinder()} and
 * {@link TestTrimPdfPage#getOutputPageSize4(Rectangle, PdfReader, int)}.
 * Unlike the code there, this method changes the media box, because that is
 * the only box respected by {@link PdfWriter#getImportedPage(PdfReader, int)}.
 * </p>
 */
static void cropPdf(PdfReader reader) throws IOException
{
    final int pageCount = reader.getNumberOfPages();
    int pageNumber = 1;
    while (pageNumber <= pageCount)
    {
        MarginFinder margins = new PdfReaderContentParser(reader).processContent(pageNumber, new MarginFinder());
        Rectangle contentBox = new Rectangle(margins.getLlx(), margins.getLly(), margins.getUrx(), margins.getUry());

        PdfDictionary pageDictionary = reader.getPageN(pageNumber);
        float[] mediaBox = {
                contentBox.getLeft(),
                contentBox.getBottom(),
                contentBox.getRight(),
                contentBox.getTop()
        };
        pageDictionary.put(PdfName.MEDIABOX, new PdfArray(mediaBox));
        pageNumber++;
    }
}
 
Example #17
Source File: TestTrimPdfPage.java    From testarea-itext5 with GNU Affero General Public License v3.0 3 votes vote down vote up
/**
 * Need to get the size of the page excluding whitespace......
 * <p>
 * The OP's code revised to use MarginFinder: the result spans exactly the
 * bounds reported by the finder, which are also printed to standard output.
 *
 * @param pageSize the original page size (not used by this variant)
 * @param reader the pdf reader
 * @param page the number of the page to measure
 * @return a new page size which cuts out the whitespace
 * @throws IOException if parsing the page content fails
 */
private Rectangle getOutputPageSize4(Rectangle pageSize, PdfReader reader, int page) throws IOException
{
    MarginFinder margins = new PdfReaderContentParser(reader).processContent(page, new MarginFinder());
    Rectangle trimmed = new Rectangle(
            margins.getLlx(),
            margins.getLly(),
            margins.getUrx(),
            margins.getUry());
    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n",
            margins.getLlx(), margins.getLly(), margins.getUrx(), margins.getUry());
    return trimmed;
}
 
Example #18
Source File: TestTrimPdfPage.java    From testarea-itext5 with GNU Affero General Public License v3.0 3 votes vote down vote up
/**
 * Need to get the size of the page excluding whitespace......
 * <p>
 * The OP's code revised to use a width with equal margins left and right.
 * The text bounds found are also printed to standard output.
 *
 * @param pageSize the original page size (not used by this variant)
 * @param reader the pdf reader
 * @param page the number of the page to measure
 * @return a new page size which cuts out the whitespace
 * @throws IOException if parsing the page content fails
 */
private Rectangle getOutputPageSize3(Rectangle pageSize, PdfReader reader, int page) throws IOException
{
    PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
    TextMarginFinder textBounds = contentParser.processContent(page, new TextMarginFinder());

    // Right edge as computed by the original code: twice the upper right x minus the lower left x.
    float right = 2 * textBounds.getUrx() - textBounds.getLlx();
    Rectangle trimmed = new Rectangle(textBounds.getLlx(), textBounds.getLly(), right, textBounds.getUry());

    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n",
            textBounds.getLlx(), textBounds.getLly(), textBounds.getUrx(), textBounds.getUry());
    return trimmed;
}
 
Example #19
Source File: TestTrimPdfPage.java    From testarea-itext5 with GNU Affero General Public License v3.0 3 votes vote down vote up
/**
 * Need to get the size of the page excluding whitespace......
 * <p>
 * The OP's code revised to use the whole page width: only the vertical
 * extent is taken from the text bounds, left and right come from the
 * original page size. The text bounds found are printed to standard output.
 *
 * @param pageSize the original page size
 * @param reader the pdf reader
 * @param page the number of the page to measure
 * @return a new page size which cuts out the whitespace
 * @throws IOException if parsing the page content fails
 */
private Rectangle getOutputPageSize2(Rectangle pageSize, PdfReader reader, int page) throws IOException
{
    TextMarginFinder textBounds = new PdfReaderContentParser(reader).processContent(page, new TextMarginFinder());

    float left = pageSize.getLeft();
    float bottom = textBounds.getLly();
    float right = pageSize.getRight();
    float top = textBounds.getUry();
    Rectangle trimmed = new Rectangle(left, bottom, right, top);

    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n",
            textBounds.getLlx(), textBounds.getLly(), textBounds.getUrx(), textBounds.getUry());
    return trimmed;
}
 
Example #20
Source File: TestTrimPdfPage.java    From testarea-itext5 with GNU Affero General Public License v3.0 3 votes vote down vote up
/**
 * Need to get the size of the page excluding whitespace......
 * <p>
 * The OP's code: the result spans exactly the text bounds reported by a
 * {@code TextMarginFinder}, which are also printed to standard output.
 *
 * @param pageSize the original page size (not used by this variant)
 * @param reader the pdf reader
 * @param page the number of the page to measure
 * @return a new page size which cuts out the whitespace
 * @throws IOException if parsing the page content fails
 */
private Rectangle getOutputPageSize(Rectangle pageSize, PdfReader reader, int page) throws IOException
{
    PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
    TextMarginFinder textBounds = contentParser.processContent(page, new TextMarginFinder());

    Rectangle trimmed = new Rectangle(
            textBounds.getLlx(),
            textBounds.getLly(),
            textBounds.getUrx(),
            textBounds.getUry());

    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n",
            textBounds.getLlx(), textBounds.getLly(), textBounds.getUrx(), textBounds.getUry());
    return trimmed;
}