Java Code Examples for org.apache.pdfbox.util.PDFTextStripper#getText()

The following examples show how to use org.apache.pdfbox.util.PDFTextStripper#getText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: PDFReader.java From swcv with MIT License

6 votes

private boolean getFile(String url)
{
    try
    {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        InputStream in = con.getInputStream();
        PDFParser p = new PDFParser(in);
        p.parse();
        PDDocument pdoc = new PDDocument(p.getDocument());
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdoc);
        pdoc.close();

        return true;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return false;
    }
}

Example 2

Source File: PDFbox.java From wandora with GNU General Public License v3.0

6 votes

public static String extractTextOutOfPDF(String url) {
    PDDocument doc = null;
    try {
        if(url.startsWith("file:")) {
            doc = PDDocument.load(new File(url));
        }
        else {
            doc = PDDocument.load(new URL(url));
        }
        PDFTextStripper stripper = new PDFTextStripper();
        String content = stripper.getText(doc);
        doc.close();
        return content;
    }
    catch(Exception e) {
        e.printStackTrace();
    }
    return null;
}

Example 3

Source File: Chapter8.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

private static void usingPDFBox() {
    try {
        File file = new File("TestDocument.pdf");
        PDDocument pdDocument = PDDocument.load(file);
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(pdDocument);
        System.out.println(text);
        pdDocument.close();
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

Example 4

Source File: PdfDocument.java From olat with Apache License 2.0

5 votes

private String extractText(final VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebugEnabled()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    InputStream is = null;
    try {
        is = leaf.getInputStream();
        document = PDDocument.load(is);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (final Exception e) {
                throw new DocumentAccessException("PDF is encrypted. Can not read content file=" + leaf.getName());
            }
        }
        if (log.isDebugEnabled()) {
            log.debug("readContent PDDocument loaded");
        }
        final PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    } finally {
        if (document != null) {
            document.close();
        }
        if (is != null) {
            is.close();
        }
        // needed to prevent potential OutOfMemoryError
        // https://issues.apache.org/jira/browse/PDFBOX-1009
        PDFont.clearResources();
    }
}