Java Code Examples for org.apache.pdfbox.pdmodel.PDPage.getContents()

The following are Jave code examples for showing how to use getContents() of the org.apache.pdfbox.pdmodel.PDPage class. You can vote up the examples you like. Your votes will be used in our system to get more good examples.
Example 1
Project: memoria-politica   File: FedDepPhotosUtility.java   Source Code and License Vote up 6 votes
private static void buildImgKeyToPolNameMapping(PDPage page) throws IOException {
    PDStream contents = page.getContents();
    PDFStreamParser parser = new PDFStreamParser(contents.getStream());
    parser.parse();
    List tokens = parser.getTokens();

    boolean concatStringPhase = false;
    String polName = "";
    String lastText = "";

    for (int index = 0; index < tokens.size(); index++) {
        Object obj = tokens.get(index);

        if (obj instanceof PDFOperator) {
            PDFOperator op = (PDFOperator) obj;
            if (op.getOperation().equals("BT")) {
                concatStringPhase = true;
                polName = lastText;
                lastText = "";
            }
            else if (op.getOperation().equals("ET")) {
                concatStringPhase = false;
            }
        }
        else if (concatStringPhase && obj instanceof COSString) {
            COSString cosString = (COSString) obj;
            lastText += " " + cosString.getString();
            lastText = lastText.trim();
        }
        else if (!concatStringPhase && obj instanceof COSName) {
            COSName cosName = (COSName) obj;
            if (cosName.getName().startsWith("img")) {
                mapImgKeyToPolName.put(cosName.getName(), polName);
            }
        }
    }
}
 
Example 2
Project: my-cv-site   File: FormattedReader.java   Source Code and License Vote up 6 votes
/**
 * This will process all of the pages and the text that is in them.
 *
 * @param pages The pages object in the document.
 *
 * @throws IOException If there is an error parsing the text.
 */
protected void processPages( List<COSObjectable> pages ) throws IOException
{
    if( startBookmark != null )
    {
        startBookmarkPageNumber = getPageNumber( startBookmark, pages );
    }
    if( endBookmark != null )
    {
        endBookmarkPageNumber = getPageNumber( endBookmark, pages );
    }

    if( startBookmarkPageNumber == -1 && startBookmark != null &&
            endBookmarkPageNumber == -1 && endBookmark != null &&
            startBookmark.getCOSObject() == endBookmark.getCOSObject() )
    {
        //this is a special case where both the start and end bookmark
        //are the same but point to nothing.  In this case
        //we will not extract any text.
        startBookmarkPageNumber = 0;
        endBookmarkPageNumber = 0;
    }
    Iterator<COSObjectable> pageIter = pages.iterator();
    while( pageIter.hasNext() )
    {
        PDPage nextPage = (PDPage)pageIter.next();
        PDStream contentStream = nextPage.getContents();
        currentPageNo++;
        if( contentStream != null )
        {
            COSStream contents = contentStream.getStream();
            processPage( nextPage, contents );
        }
    }
}