package mkl.testarea.itext5.extract;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.junit.BeforeClass;
import org.junit.Test;

import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;

/**
 * <a href="http://stackoverflow.com/questions/33500819/itextsharp-pdfbox-text-extract-fails-for-certain-pdfs">
 * ITextSharp / PDFBox text extract fails for certain pdfs
 * </a>
 * <br/>
 * <a href="https://www.dropbox.com/s/8x5lnvmw6mv8ko8/Vol16_2.pdf?dl=0">
 * Vol16_2.pdf
 * </a>
 * <p>
 * This test exercises the evil {@link TextExtractionStrategy} wrapper
 * {@link RemappingExtractionFilter}, which replaces the text in a
 * {@link TextRenderInfo} instance by mapping it via the Differences
 * of the Encoding of the font. It assumes that the Differences array
 * contains only a starting 1 followed by names, all of which are built
 * as /Gxx, xx being the hexadecimal representation of the ASCII code
 * (as Unicode subset) of the glyph rendered.
 * </p>
 * <p>
 * It is only useful for documents like the one presented by the OP.
 * </p>
 *
 * @author mkl
 */
public class RemappedExtraction {
    /** Folder that receives the per-page text files written by this test. */
    final static File RESULT_FOLDER = new File("target/test-outputs", "extract");

    /**
     * Makes sure the output folder exists before any test runs.
     *
     * @throws Exception if the folder (or one of its parents) cannot be created
     */
    @BeforeClass
    public static void setUpBeforeClass() throws Exception {
        // Unlike File.mkdirs(), this is a no-op for an existing directory and
        // throws on failure instead of silently returning false.
        Files.createDirectories(RESULT_FOLDER.toPath());
    }

    /**
     * Extracts the remapped text of every page of {@code Vol16_2.pdf}, stores
     * it page by page in {@link #RESULT_FOLDER} and prints the combined text
     * to standard output.
     *
     * @throws IOException if the resource is missing or reading/writing fails
     * @throws DocumentException declared for the iText API
     * @throws NoSuchFieldException declared by the extraction helpers
     * @throws SecurityException declared by the extraction helpers
     */
    @Test
    public void testVol16_2() throws IOException, DocumentException, NoSuchFieldException, SecurityException {
        try (InputStream resourceStream = getClass().getResourceAsStream("Vol16_2.pdf")) {
            if (resourceStream == null) {
                // Fail with a clear message rather than an obscure error inside PdfReader.
                throw new FileNotFoundException("Test resource Vol16_2.pdf not found on the classpath");
            }

            PdfReader reader = new PdfReader(resourceStream);
            try {
                String content = extractAndStoreRemapped(reader, new File(RESULT_FOLDER, "Vol16_2.%s.txt").toString());

                System.out.println("\nText Vol16_2.pdf\n************************");
                System.out.println(content);
                System.out.println("************************");
            } finally {
                // The reader was previously never closed.
                reader.close();
            }
        }
    }

    /**
     * Extracts the remapped text of each page, writes it UTF-8 encoded to the
     * file named by {@code String.format(format, page)} and returns the text
     * of all pages, separated by two line breaks.
     *
     * @param reader the document to extract from
     * @param format a {@link String#format(String, Object...)} pattern for the
     *               per-page output file name; it receives the 1-based page number
     * @return the concatenated text of all pages
     * @throws IOException if extraction or writing a page file fails
     * @throws NoSuchFieldException declared by {@link #extractRemapped(PdfReader, int)}
     * @throws SecurityException declared by {@link #extractRemapped(PdfReader, int)}
     */
    String extractAndStoreRemapped(PdfReader reader, String format) throws IOException, NoSuchFieldException, SecurityException {
        StringBuilder builder = new StringBuilder();
        int pageCount = reader.getNumberOfPages();

        for (int page = 1; page <= pageCount; page++) {
            String pageText = extractRemapped(reader, page);

            Files.write(Paths.get(String.format(format, page)), pageText.getBytes(StandardCharsets.UTF_8));

            if (page > 1) {
                builder.append("\n\n");
            }
            builder.append(pageText);
        }

        return builder.toString();
    }

    /**
     * Extracts the text of a single page using a
     * {@link LocationTextExtractionStrategy} wrapped in a
     * {@link RemappingExtractionFilter}.
     *
     * @param reader the document to extract from
     * @param pageNo the 1-based number of the page to extract
     * @return the text extracted from the page
     * @throws IOException if the page content cannot be processed
     * @throws NoSuchFieldException presumably raised while setting up the
     *         {@link RemappingExtractionFilter} (not visible here; verify there)
     * @throws SecurityException presumably raised while setting up the
     *         {@link RemappingExtractionFilter} (not visible here; verify there)
     */
    String extractRemapped(PdfReader reader, int pageNo) throws IOException, NoSuchFieldException, SecurityException {
        TextExtractionStrategy strategy = new RemappingExtractionFilter(new LocationTextExtractionStrategy());
        return PdfTextExtractor.getTextFromPage(reader, pageNo, strategy);
    }
}