org.apache.pdfbox.util.PDFTextStripper Java Examples

The following examples show how to use org.apache.pdfbox.util.PDFTextStripper. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: PDFReader.java From swcv with MIT License

6 votes

/**
 * Fetches the PDF at the given URL, extracts its text and stores the result in the
 * {@code text} field.
 *
 * <p>The network stream and the parsed document are released on every path, including
 * when parsing or text extraction fails.
 *
 * @param url address of the PDF document to download
 * @return {@code true} if the text was extracted and all resources closed cleanly;
 *         {@code false} if any step threw (the stack trace is printed)
 */
private boolean getFile(String url)
{
    try
    {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        InputStream in = con.getInputStream();
        try
        {
            PDFParser p = new PDFParser(in);
            p.parse();
            PDDocument pdoc = new PDDocument(p.getDocument());
            try
            {
                PDFTextStripper pts = new PDFTextStripper();
                text = pts.getText(pdoc);
            }
            finally
            {
                // Close the document even when extraction throws.
                pdoc.close();
            }
        }
        finally
        {
            // The connection stream was previously never closed.
            in.close();
        }

        return true;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return false;
    }
}

Example #2

Source File: PDFbox.java From wandora with GNU General Public License v3.0

6 votes

/**
 * Loads the PDF identified by the given URL string and returns its text content.
 *
 * <p>URLs starting with {@code file:} are loaded from the local file system; every other
 * URL is loaded through {@code PDDocument.load(URL)}.
 *
 * @param url location of the PDF, as a URL string
 * @return the extracted text, or {@code null} if loading, extraction or closing failed
 *         (the stack trace is printed)
 */
public static String extractTextOutOfPDF(String url) {
    try {
        PDDocument doc;
        if(url.startsWith("file:")) {
            // new File(String) would treat the whole "file:..." URL as a literal path name,
            // so convert the URL to a URI and let File resolve the real path.
            doc = PDDocument.load(new File(new URL(url).toURI()));
        }
        else {
            doc = PDDocument.load(new URL(url));
        }
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(doc);
        }
        finally {
            // Close the document even when text extraction throws.
            doc.close();
        }
    }
    catch(Exception e) {
        e.printStackTrace();
    }
    return null;
}

Example #3

Source File: Chapter8.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Loads {@code TestDocument.pdf} from the working directory and prints its extracted text
 * to standard output. I/O failures are reported by printing the stack trace.
 */
private static void usingPDFBox() {
    try {
        File file = new File("TestDocument.pdf");
        PDDocument pdDocument = PDDocument.load(file);
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(pdDocument);
            System.out.println(text);
        } finally {
            // Release the document even when extraction fails.
            pdDocument.close();
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

Example #4

Source File: PdfDocument.java From olat with Apache License 2.0

5 votes

/**
 * Reads the given leaf as a PDF and returns its text content.
 *
 * <p>If the document is encrypted, decryption with an empty password is attempted first.
 * The document, the input stream and the PDFont resource cache are each cleaned up
 * independently, so a failure in one cleanup step does not skip the others.
 *
 * @param leaf the file whose input stream is read as a PDF
 * @return the text extracted by {@code PDFTextStripper}
 * @throws IOException if loading, extracting or closing fails
 * @throws DocumentAccessException if the document is encrypted and cannot be decrypted
 *         with an empty password
 */
private String extractText(final VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebugEnabled()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    InputStream is = null;
    try {
        is = leaf.getInputStream();
        document = PDDocument.load(is);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (final Exception e) {
                // NOTE(review): the original cause is not attached; confirm whether
                // DocumentAccessException offers a (message, cause) constructor.
                throw new DocumentAccessException("PDF is encrypted. Can not read content file=" + leaf.getName());
            }
        }
        if (log.isDebugEnabled()) {
            log.debug("readContent PDDocument loaded");
        }
        final PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    } finally {
        // Nested finally blocks: each cleanup step runs even if the previous one throws.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            try {
                if (is != null) {
                    is.close();
                }
            } finally {
                // needed to prevent potential OutOfMemoryError
                // https://issues.apache.org/jira/browse/PDFBOX-1009
                PDFont.clearResources();
            }
        }
    }
}

Example #5

Source File: PDFIndexerTest.java From carbon-apimgt with Apache License 2.0

5 votes

/**
 * Indexes {@code file2Index} twice through a {@code PDFIndexerWrapper} built on mocks and
 * checks the reported media type: first the default, then an explicitly configured one.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    final String mediaTypeField = "mediaType";
    final String customMediaType = "application/pdf+test";

    // Mocks: the second COSDocument handed out by the parser fails on close().
    PDFParser parserMock = Mockito.mock(PDFParser.class);
    COSDocument failingCosDocument = Mockito.mock(COSDocument.class);
    PDFTextStripper stripperMock = Mockito.mock(PDFTextStripper.class);
    Mockito.doThrow(IOException.class).when(failingCosDocument).close();
    Mockito.when(parserMock.getDocument()).thenReturn(new COSDocument()).thenReturn(failingCosDocument);
    Mockito.when(stripperMock.getText(new PDDocument())).thenReturn("");
    PDFIndexer indexer = new PDFIndexerWrapper(parserMock, stripperMock);

    // With no media type defined in file2Index, the default media type is expected.
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    boolean hasDefaultMediaType =
            "application/pdf".equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!hasDefaultMediaType) {
        Assert.fail();
    }

    // With a media type set in file2Index, that value is expected even if an error
    // occurs in the finally block.
    file2Index.mediaType = customMediaType;
    indexed = indexer.getIndexedDocument(file2Index);
    boolean hasCustomMediaType =
            customMediaType.equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!hasCustomMediaType) {
        Assert.fail();
    }
}

Example #6

Source File: PdfAssertions.java From batchers with Apache License 2.0

5 votes

/**
 * Extracts the text of the {@code actual} document and closes the document afterwards.
 *
 * @return the text produced by {@code PDFTextStripper}
 * @throws RuntimeException wrapping the {@code IOException} if extraction fails
 */
private String getPdfText() {
    final String extractedText;
    try {
        PDFTextStripper textStripper = new PDFTextStripper();
        extractedText = textStripper.getText(actual);
    } catch (IOException extractionFailure) {
        throw new RuntimeException(extractionFailure);
    } finally {
        try {
            actual.close();
        } catch (IOException ignored) {
            // a failure to close is deliberately not reported
        }
    }
    return extractedText;
}

Example #7

Source File: TxtCreator.java From pdf-converter with Apache License 2.0

4 votes

/**
 * Converts the given PDF to plain text and writes the result to {@code output}.
 *
 * <p>The PDF is loaded with a temporary scratch file. Every resource (temp file, scratch
 * file handle, document, writer) is released in its own {@code finally} block, so a failure
 * while opening or closing one resource does not leak the others. I/O failures are logged
 * as warnings and not propagated.
 *
 * @param pdf    the PDF file to read
 * @param output the text file to write
 */
public void process(File pdf, File output){
    try {
        File tmpfile = File.createTempFile(String.format("txttmp-%s", UUID.randomUUID().toString()), null);
        try {
            RandomAccessFile raf = new RandomAccessFile(tmpfile, "rw");
            try {
                PDDocument pdDoc = PDDocument.loadNonSeq(pdf, raf);
                try {
                    FileWriter writer = new FileWriter(output);
                    try {
                        writeTextPageByPage(pdDoc, writer);
                    } finally {
                        writer.close();
                    }
                } finally {
                    pdDoc.close();
                }
            } finally {
                raf.close();
            }
        } finally {
            // Remove the scratch file whether or not the conversion succeeded.
            tmpfile.delete();
        }
    } catch (IOException ioe) {
        log.warn(String.format("Failed to create txt for file: %s", pdf.getName()), ioe);
    }
}

/** Extracts the document one page at a time, writing and flushing each page's text. */
private void writeTextPageByPage(PDDocument pdDoc, FileWriter writer) throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    int numberOfPages = pdDoc.getNumberOfPages();

    // Page numbers passed to setStartPage/setEndPage are 1-based here.
    for (int page = 1; page <= numberOfPages; page++) {
        stripper.setStartPage(page);
        stripper.setEndPage(page);
        writer.write(stripper.getText(pdDoc));
        writer.flush();
    }
}

Example #8

Source File: PDFIndexer.java From carbon-apimgt with Apache License 2.0

4 votes

/**
 * Creates the text stripper to use. A new instance is constructed on every call.
 *
 * @return a newly constructed {@code PDFTextStripper}
 * @throws IOException if constructing the stripper fails
 */
protected PDFTextStripper getPdfTextStripper() throws IOException {
    PDFTextStripper freshStripper = new PDFTextStripper();
    return freshStripper;
}

Example #9

Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0

4 votes

/**
 * Creates a wrapper that stores the supplied parser and text stripper in its fields.
 *
 * @param pdfParser the parser to store
 * @param stripper  the text stripper to store
 */
public PDFIndexerWrapper(PDFParser pdfParser, PDFTextStripper stripper) {
    this.pdfTextStripper = stripper;
    this.pdfParser = pdfParser;
}

Example #10

Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0

4 votes

/**
 * Returns the text stripper held in the {@code pdfTextStripper} field.
 *
 * @return the stored {@code PDFTextStripper}
 */
@Override
protected PDFTextStripper getPdfTextStripper() throws IOException {
    return pdfTextStripper;
}