// Java source code of ExtractImages

package mkl.testarea.pdfbox2.extract;

import java.awt.geom.Point2D;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import javax.imageio.ImageIO;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Experiments on extracting bitmap images from PDF pages with PDFBox: once by
 * walking the immediate XObject resources of each page, once by calling the
 * PDFBox command line tool, and once by processing the page content with a
 * {@link PDFGraphicsStreamEngine}.
 * <p>
 * All results are written to {@link #RESULT_FOLDER}.
 * </p>
 *
 * @author mkl
 */
public class ExtractImages
{
    /** Folder receiving all extracted image files; created before the tests run. */
    final static File RESULT_FOLDER = new File("target/test-outputs", "extract");

    /**
     * Creates {@link #RESULT_FOLDER} (including missing parent folders).
     */
    @BeforeClass
    public static void setUpBeforeClass() throws Exception
    {
        RESULT_FOLDER.mkdirs();
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/file/d/0B9izTHWJQ7xlT2ZoQkJfbGRYcFE">
     * 10948.pdf
     * </a>
     * <p>
     * The only special thing about the two images returned for the sample PDF is that
     * one image is merely a mask used for the other image, and the other image is the
     * actual image used on the PDF page. If one only wants the images immediately used
     * in the page content, one also has to scan the page content.
     * </p>
     */
    @Test
    public void testExtractPageImageResources10948() throws IOException
    {
        extractPageImageResources("10948.pdf", "10948-%s-%s.%s");
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
     * 10948.pdf
     * </a>, renamed "10948-new.pdf" here to prevent a collision
     * <p>
     * Here the code extracts no image at all because the images are not immediate page
     * resources but wrapped in form xobjects.
     * </p>
     */
    @Test
    public void testExtractPageImageResources10948New() throws IOException
    {
        extractPageImageResources("10948-new.pdf", "10948-new-%s-%s.%s");
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
     * 10948.pdf
     * </a>, renamed "10948-new.pdf" here to prevent a collision
     * <p>
     * The PDFBox tool {@link org.apache.pdfbox.tools.ExtractImages} does extract
     * the images (if you have included com.github.jai-imageio:jai-imageio-core
     * that is). Unfortunately it does not include the page it finds the respective
     * images on.
     * </p>
     */
    @Test
    public void testExtractPageImagesTool10948New() throws IOException
    {
        String prefix = new File(RESULT_FOLDER, "10948-new-tool").toString();
        org.apache.pdfbox.tools.ExtractImages.main(new String[]{ "-prefix", prefix,
                "src/test/resources/mkl/testarea/pdfbox2/extract/10948-new.pdf" });
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlYi1XN1BxMmZEUGc">
     * 10948.pdf
     * </a>, renamed "10948-new.pdf" here to prevent a collision
     * <p>
     * Here we adopt the technique from the PDFBox tool {@link org.apache.pdfbox.tools.ExtractImages}
     * and name the exported images properly.
     * </p>
     */
    @Test
    public void testExtractPageImages10948New() throws IOException
    {
        extractPageImagesFromResource("10948-new.pdf", "10948-new-engine-%s-%s%s.%s");
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlY1lpVERlZm9kRkk">
     * t1_edited.pdf
     * </a>
     * <p>
     * In contrast to the OP's observation the extraction works properly here.
     * </p>
     */
    @Test
    public void testExtractPageImagesT1Edited() throws IOException
    {
        extractPageImagesFromResource("t1_edited.pdf", "t1_edited-engine-%s-%s%s.%s");
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xldkVPRHFWU1picGs">
     * 1604-Orange_flat_2_edited.pdf
     * </a>
     * <p>
     * This image comes in stripes. Nothing unusual.
     * </p>
     */
    @Test
    public void testExtractPageImages1604OrangeFlat2Edited() throws IOException
    {
        extractPageImagesFromResource("1604-Orange_flat_2_edited.pdf", "1604-Orange_flat_2_edited-engine-%s-%s%s.%s");
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <br/>
     * <a href="https://drive.google.com/open?id=0B9izTHWJQ7xlQkRHMmtIU2ZPUDA">
     * test_fact.pdf
     * </a>
     * <p>
     * The logo actually is the only bitmap image here.
     * </p>
     */
    @Test
    public void testExtractPageImagesTestFact() throws IOException
    {
        extractPageImagesFromResource("test_fact.pdf", "test_fact-engine-%s-%s%s.%s");
    }

    /**
     * Loads the named class path resource as a PDF and writes every image XObject
     * found among the <em>immediate</em> XObject resources of each page to
     * {@link #RESULT_FOLDER}. Images nested in form XObjects are not visited.
     *
     * @param resourceName   name of the PDF resource, resolved relative to this class
     * @param fileNameFormat {@link String#format} pattern receiving the 1-based page
     *                       number, the 0-based image index on that page, and the
     *                       image suffix, in that order
     * @throws IOException if loading the document or writing an image fails
     */
    private void extractPageImageResources(String resourceName, String fileNameFormat) throws IOException
    {
        // The document is a Closeable, too; managing it here releases it even on failure.
        try (   InputStream resource = getClass().getResourceAsStream(resourceName);
                PDDocument document = Loader.loadPDF(resource))
        {
            int page = 1;
            for (PDPage pdPage : document.getPages())
            {
                PDResources resources = pdPage.getResources();
                // Guard the page resources (not the input stream): skip pages without any.
                if (resources != null)
                {
                    int index = 0;
                    for (COSName cosName : resources.getXObjectNames())
                    {
                        PDXObject xobject = resources.getXObject(cosName);
                        if (xobject instanceof PDImageXObject)
                        {
                            PDImageXObject image = (PDImageXObject)xobject;
                            String suffix = image.getSuffix();
                            File file = new File(RESULT_FOLDER, String.format(fileNameFormat, page, index, suffix));
                            // NOTE: ImageIO.write returns false if no writer is registered for the suffix.
                            ImageIO.write(image.getImage(), suffix, file);
                            index++;
                        }
                    }
                }
                page++;
            }
        }
    }

    /**
     * Loads the named class path resource as a PDF and runs
     * {@link #extractPageImages(PDDocument, String)} on it, closing the document
     * afterwards.
     *
     * @param resourceName   name of the PDF resource, resolved relative to this class
     * @param fileNameFormat file name pattern handed on to
     *                       {@link #extractPageImages(PDDocument, String)}
     * @throws IOException if loading the document or writing an image fails
     */
    private void extractPageImagesFromResource(String resourceName, String fileNameFormat) throws IOException
    {
        try (   InputStream resource = getClass().getResourceAsStream(resourceName);
                PDDocument document = Loader.loadPDF(resource))
        {
            extractPageImages(document, fileNameFormat);
        }
    }

    /**
     * <a href="http://stackoverflow.com/questions/40531871/how-can-i-check-if-pdf-page-is-imagescanned-by-pdfbox-xpdf">
     * How can I check if PDF page is image(scanned) by PDFBOX, XPDF
     * </a>
     * <p>
     * Here we adopt the technique from the PDFBox tool {@link org.apache.pdfbox.tools.ExtractImages}
     * and name the exported images properly.
     * </p>
     * <p>
     * Each page is processed by a {@link PDFGraphicsStreamEngine} whose
     * {@link PDFGraphicsStreamEngine#drawImage(PDImage)} callback writes every
     * {@link PDImageXObject} it is handed to {@link #RESULT_FOLDER}. The document
     * is not closed by this method.
     * </p>
     *
     * @param document       the document whose pages are processed
     * @param fileNameFormat {@link String#format} pattern receiving the 1-based page
     *                       number, the 0-based image index on that page, a flip
     *                       marker ("", "-h", "-v" or "-hv" depending on the signs of
     *                       the current transformation matrix scale factors), and
     *                       the image suffix, in that order
     * @throws IOException if processing a page or writing an image fails
     */
    void extractPageImages(PDDocument document, String fileNameFormat) throws IOException
    {
        int page = 1;
        for (final PDPage pdPage : document.getPages())
        {
            final int currentPage = page;
            PDFGraphicsStreamEngine pdfGraphicsStreamEngine = new PDFGraphicsStreamEngine(pdPage)
            {
                /** 0-based counter of the images drawn on the current page. */
                int index = 0;

                @Override
                public void drawImage(PDImage pdImage) throws IOException
                {
                    if (pdImage instanceof PDImageXObject)
                    {
                        // A negative scale factor indicates the image is drawn mirrored on that axis.
                        Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
                        String flips = "";
                        if (ctm.getScaleX() < 0)
                        {
                            flips += "h";
                        }
                        if (ctm.getScaleY() < 0)
                        {
                            flips += "v";
                        }
                        if (flips.length() > 0)
                        {
                            flips = "-" + flips;
                        }
                        PDImageXObject image = (PDImageXObject)pdImage;
                        String suffix = image.getSuffix();
                        File file = new File(RESULT_FOLDER, String.format(fileNameFormat, currentPage, index, flips, suffix));
                        // Close the stream ourselves so the file handle is released per image.
                        try (FileOutputStream output = new FileOutputStream(file))
                        {
                            ImageIOUtil.writeImage(image.getImage(), suffix, output);
                        }
                        index++;
                    }
                }

                // Path construction and painting are irrelevant for image extraction;
                // the callbacks below are therefore intentionally empty.

                @Override
                public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException { }

                @Override
                public void clip(int windingRule) throws IOException { }

                @Override
                public void moveTo(float x, float y) throws IOException { }

                @Override
                public void lineTo(float x, float y) throws IOException { }

                @Override
                public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException { }

                @Override
                public Point2D getCurrentPoint() throws IOException
                {
                    // Dummy point instead of null, like the PDFBox ExtractImages tool does;
                    // presumably path operators query this value - verify against the PDFBox version in use.
                    return new Point2D.Float(0, 0);
                }

                @Override
                public void closePath() throws IOException { }

                @Override
                public void endPath() throws IOException { }

                @Override
                public void strokePath() throws IOException { }

                @Override
                public void fillPath(int windingRule) throws IOException { }

                @Override
                public void fillAndStrokePath(int windingRule) throws IOException { }

                @Override
                public void shadingFill(COSName shadingName) throws IOException { }
            };
            pdfGraphicsStreamEngine.processPage(pdPage);
            page++;
        }
    }
}