org.apache.pdfbox.pdfparser.PDFStreamParser Java Examples

The following examples show how to use org.apache.pdfbox.pdfparser.PDFStreamParser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PDType3CharProc.java    From gcs with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Reads the glyph width from the start of this type3 charproc stream.
 *
 * <p>Tokens are collected as operands until the first operator is met; that operator and the
 * operands gathered so far are handed to {@code parseWidth}, whose result is returned.
 *
 * @return the glyph width.
 * @throws IOException if the stream could not be read, or did not have d0 or d1 as first
 * operator, or if their first argument was not a number; also thrown when the stream ends
 * before any operator is found.
 */
public float getWidth() throws IOException
{
    PDFStreamParser streamParser = new PDFStreamParser(this);
    List<COSBase> operands = new ArrayList<COSBase>();
    for (Object next = streamParser.parseNextToken(); next != null;
            next = streamParser.parseNextToken())
    {
        if (next instanceof Operator)
        {
            // only the first operator matters; nothing after it is read
            return parseWidth((Operator) next, operands);
        }
        // indirect objects are resolved to the object they wrap before being stored
        COSBase operand = next instanceof COSObject
                ? ((COSObject) next).getObject()
                : (COSBase) next;
        operands.add(operand);
    }
    throw new IOException("Unexpected end of stream");
}
 
Example #2
Source File: PDDefaultAppearanceString.java    From gcs with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Walks the tokens of the given content stream and dispatches every operator together with
 * the operands that preceded it.
 *
 * @param content the content to parse.
 * @throws IOException if there is an error reading or parsing the content stream.
 */
private void processAppearanceStringOperators(byte[] content) throws IOException
{
    PDFStreamParser streamParser = new PDFStreamParser(content);
    List<COSBase> operands = new ArrayList<COSBase>();
    Object next;
    while ((next = streamParser.parseNextToken()) != null)
    {
        if (next instanceof Operator)
        {
            processOperator((Operator) next, operands);
            // start a fresh list instead of clearing, so the list already handed to
            // processOperator is never modified afterwards
            operands = new ArrayList<COSBase>();
            continue;
        }
        // indirect objects are resolved to the object they wrap before being stored
        COSBase operand = next instanceof COSObject
                ? ((COSObject) next).getObject()
                : (COSBase) next;
        operands.add(operand);
    }
}
 
Example #3
Source File: PDFStreamEngine.java    From gcs with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Walks the tokens of the given content stream and dispatches every operator together with
 * the operands that preceded it.
 *
 * @param contentStream the content stream to parse.
 * @throws IOException if there is an error reading or parsing the content stream.
 */
private void processStreamOperators(PDContentStream contentStream) throws IOException
{
    PDFStreamParser streamParser = new PDFStreamParser(contentStream);
    List<COSBase> operands = new ArrayList<COSBase>();
    for (Object next = streamParser.parseNextToken(); next != null;
            next = streamParser.parseNextToken())
    {
        if (next instanceof Operator)
        {
            processOperator((Operator) next, operands);
            // start a fresh list instead of clearing, so the list already handed to
            // processOperator is never modified afterwards
            operands = new ArrayList<COSBase>();
        }
        else if (next instanceof COSObject)
        {
            // indirect objects are resolved to the object they wrap
            operands.add(((COSObject) next).getObject());
        }
        else
        {
            operands.add((COSBase) next);
        }
    }
}
 
Example #4
Source File: PdfContentTypeChecker.java    From tika-server with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the page's content and increments {@code textBlocks} once for every "BT"
 * (Begin Text) operator found.
 *
 * @param page the page whose content is tokenized
 * @throws IOException declared by the parsing calls
 */
private void calculateTextObjectsOnPage(PDPage page) throws IOException {
    PDFStreamParser streamParser = new PDFStreamParser(page);
    streamParser.parse();
    for (Object element : streamParser.getTokens()) {
        if (!(element instanceof Operator)) {
            continue;
        }
        String operatorName = ((Operator) element).getName();
        if (operatorName.equals("BT")) { // Begin Text
            textBlocks++;
        }
    }
}
 
Example #5
Source File: AppearanceGeneratorHelper.java    From gcs with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Runs a stream parser over an appearance stream and returns everything it collected.
 *
 * @param appearanceStream the appearance stream to parse
 * @return the tokens gathered by the parser
 * @throws IOException declared by the parsing calls
 */
private List<Object> tokenize(PDAppearanceStream appearanceStream) throws IOException
{
    PDFStreamParser streamParser = new PDFStreamParser(appearanceStream);
    streamParser.parse();
    List<Object> parsedTokens = streamParser.getTokens();
    return parsedTokens;
}
 
Example #6
Source File: NurminenDetectionAlgorithm.java    From tabula-java with MIT License 5 votes vote down vote up
/**
 * Creates a new document holding a copy of the given page whose content stream has every
 * "Tj" / "TJ" operator, and the single operand preceding it, stripped out.
 *
 * <p>The returned document is newly created here and is not closed by this method; the
 * caller is responsible for closing it.
 *
 * @param page the page to copy without its text-showing operators
 * @return a new document containing the filtered copy of the page
 * @throws IOException if parsing the page or writing the new content stream fails
 */
private PDDocument removeText(PDPage page) throws IOException {
    PDFStreamParser parser = new PDFStreamParser(page);
    parser.parse();
    List<Object> tokens = parser.getTokens();
    List<Object> newTokens = new ArrayList<>();
    for (Object token : tokens) {
        if (token instanceof Operator) {
            String name = ((Operator) token).getName();
            if ("TJ".equals(name) || "Tj".equals(name)) {
                // Remove the one argument to this operator. Guard against malformed
                // streams: with no preceding token there is nothing to remove, and a
                // preceding operator is not an operand and must be kept.
                int last = newTokens.size() - 1;
                if (last >= 0 && !(newTokens.get(last) instanceof Operator)) {
                    newTokens.remove(last);
                }
                continue;
            }
        }
        newTokens.add(token);
    }

    PDDocument document = new PDDocument();
    PDPage newPage = document.importPage(page);
    newPage.setResources(page.getResources());

    PDStream newContents = new PDStream(document);
    // try-with-resources closes the stream even when writing the tokens fails
    try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
        ContentStreamWriter writer = new ContentStreamWriter(out);
        writer.writeTokens(newTokens);
    }
    newPage.setContents(newContents);
    return document;
}
 
Example #7
Source File: PDType3CharProc.java    From gcs with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Derives the bounding box of this glyph from the first operator of the stream. A result is
 * produced only when that operator is d1 and it is preceded by exactly six numeric operands.
 *
 * @return the bounding box of this glyph, or null if the first operator is not d1, if it
 * does not have six operands, if any operand is not a number, or if the stream holds no
 * operator at all.
 * @throws IOException If an io error occurs while parsing the stream.
 */
public PDRectangle getGlyphBBox() throws IOException
{
    PDFStreamParser streamParser = new PDFStreamParser(this);
    List<COSBase> operands = new ArrayList<COSBase>();
    Object next;
    while ((next = streamParser.parseNextToken()) != null)
    {
        if (!(next instanceof Operator))
        {
            // indirect objects are resolved to the object they wrap before being stored
            operands.add(next instanceof COSObject
                    ? ((COSObject) next).getObject()
                    : (COSBase) next);
            continue;
        }

        // the first operator decides the outcome; nothing after it is read
        boolean isD1 = ((Operator) next).getName().equals("d1");
        if (!isD1 || operands.size() != 6)
        {
            return null;
        }
        for (COSBase operand : operands)
        {
            if (!(operand instanceof COSNumber))
            {
                return null;
            }
        }
        // operands 2..5 are presumably the two box corners (x0 y0 x1 y1) -- confirm
        // against the PDF spec; the last two rectangle arguments are their differences
        float x0 = ((COSNumber) operands.get(2)).floatValue();
        float y0 = ((COSNumber) operands.get(3)).floatValue();
        float x1 = ((COSNumber) operands.get(4)).floatValue();
        float y1 = ((COSNumber) operands.get(5)).floatValue();
        return new PDRectangle(x0, y0, x1 - x0, y1 - y0);
    }
    return null;
}
 
Example #8
Source File: TestGraphicsCounter.java    From testarea-pdfbox2 with Apache License 2.0 4 votes vote down vote up
/**
 * <a href="http://stackoverflow.com/questions/28321374/how-to-get-page-content-height-using-pdfbox">
 * How to get page content height using pdfbox
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B65bQnJhC1mvbEVQQ0o0QU9STlU/view?usp=sharing">
 * test.pdf
 * </a>, here as <code>test-rivu.pdf</code>
 * <p>
 * Rivu's code from a comment to count lines etc.: tokenizes the content of the page at
 * index 4 and prints how many path and XObject operators of each kind it contains.
 * </p>
 * <p>
 * Differences from the quoted code: the document is closed even when parsing fails, and
 * XObject invocations are matched with the case-sensitive operator name <code>Do</code>
 * (the original compared against <code>do</code>, which never matches).
 * </p>
 */
@Test
public void testCountTestLikeRivu() throws IOException
{
    // both the resource stream and the document are closed automatically, also on failure
    try (InputStream resource = getClass().getResourceAsStream("test-rivu.pdf");
         PDDocument document = Loader.loadPDF(resource))
    {
        System.out.println("test-rivu.pdf");

        PDPage page = document.getPage(4);
        PDFStreamParser parser = new PDFStreamParser(page.getContents());
        List<Object> tokens = parser.parse();
        int lines = 0;
        int curves = 0;
        int rectangles = 0;
        int doOps = 0;
        int clipPaths = 0;
        for (Object token : tokens)
        {
            if (!(token instanceof Operator))
            {
                continue;
            }
            Operator op = (Operator) token;
            String name = op.getName();
            if ("Do".equals(name))
            {
                // PDF operator names are case-sensitive; "Do" paints an XObject
                doOps += 1;
            }
            else if ("W".equals(name) || "W*".equals(name))
            {
                clipPaths += 1;
            }
            else if ("l".equals(name) || "h".equals(name))
            {
                lines += 1;
            }
            else if ("c".equals(name) || "y".equals(name) || "v".equals(name))
            {
                System.out.println(op);
                curves += 1;
            }
            else if ("re".equals(name))
            {
                rectangles += 1;
            }
        }
        System.out.println(lines + " lines, " + curves + " curves, " + rectangles + " rectangles, " + doOps + " xobjects, " + clipPaths + " clip paths");
    }
}