package com.lexpredict.tika; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.parser.pdf.PDFParserConfig; import org.junit.Test; import java.io.InputStream; import static org.junit.Assert.*; public class AlterPDFParserTest extends TikaTest { @Test public void testDoubleSpacedText() throws Exception { PDFParser pdfParser = new AlterPDFParser(); ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); context.set(PDFParserConfig.class, config); InputStream stream = AlterPDFParserTest.class.getResourceAsStream("/test-documents/double_space_test.pdf"); String text = getText(stream, pdfParser, context); stream.close(); assertTrue(text.length() > 100); } @Test public void testParseSimpleScannedText() throws Exception { String text = getTextFromDoc("/test-documents/text_on_white.pdf", AlterPDFParser.ParsePdfMode.PDF_OCR); assertTrue(text.length() > 50); } @Test public void testParseTransparentScannedText() throws Exception { String text = getTextFromDoc("/test-documents/transp_scanned.pdf", AlterPDFParser.ParsePdfMode.PDF_OCR); assertTrue(text.length() > 50); } private String getTextFromDoc(String docPath, AlterPDFParser.ParsePdfMode parseMode) throws Exception { AlterPDFParser pdfParser = new AlterPDFParser(); pdfParser.defaultParseMode = parseMode; ParseContext context = new ParseContext(); PDFParserConfig config = new PDFParserConfig(); context.set(PDFParserConfig.class, config); InputStream stream = AlterPDFParserTest.class.getResourceAsStream(docPath); String text = getText(stream, pdfParser, context); stream.close(); return text; } }