org.apache.pdfbox.util.PDFTextStripper Java Examples

The following examples show how to use org.apache.pdfbox.util.PDFTextStripper. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PDFReader.java    From swcv with MIT License 6 votes vote down vote up
/**
 * Downloads the PDF at the given URL, extracts its text and stores the result in
 * {@code text} (declared outside this method).
 *
 * <p>Both the network stream and the PDF document are released even when parsing
 * or text extraction fails.
 *
 * @param url location of the PDF, in a form accepted by {@code new URL(String)}
 * @return {@code true} if the text was extracted and all resources closed cleanly;
 *         {@code false} if any step threw (the stack trace is printed)
 */
private boolean getFile(String url)
{
    try
    {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        InputStream in = con.getInputStream();
        try
        {
            PDFParser p = new PDFParser(in);
            p.parse();
            PDDocument pdoc = new PDDocument(p.getDocument());
            try
            {
                PDFTextStripper pts = new PDFTextStripper();
                text = pts.getText(pdoc);
            }
            finally
            {
                // Close the document even if text extraction failed.
                pdoc.close();
            }
        }
        finally
        {
            // The original never closed the connection's stream.
            in.close();
        }

        return true;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return false;
    }
}
 
Example #2
Source File: PDFbox.java    From wandora with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Loads the PDF addressed by {@code url} and returns its extracted text.
 *
 * <p>Strings starting with {@code file:} are converted to a local {@link File} through
 * their URI form; passing the raw URL string to {@code new File(String)} would treat
 * the whole URL, scheme included, as a pathname. Every other string is loaded as a
 * {@link URL}.
 *
 * @param url URL of the PDF document
 * @return the extracted text, or {@code null} if loading, extraction or closing
 *         threw (the stack trace is printed)
 */
public static String extractTextOutOfPDF(String url) {
    try {
        PDDocument doc;
        if(url.startsWith("file:")) {
            doc = PDDocument.load(new File(new URL(url).toURI()));
        }
        else {
            doc = PDDocument.load(new URL(url));
        }
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(doc);
        }
        finally {
            // Release the document even when extraction fails.
            doc.close();
        }
    }
    catch(Exception e) {
        e.printStackTrace();
    }
    return null;
}
 
Example #3
Source File: Chapter8.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Prints the text of {@code TestDocument.pdf} to standard output.
 *
 * <p>The document is closed whether or not extraction succeeds; any
 * {@link IOException} is reported with a stack trace.
 */
private static void usingPDFBox() {
    try {
        File file = new File("TestDocument.pdf");
        PDDocument pdDocument = PDDocument.load(file);
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(pdDocument);
            System.out.println(text);
        } finally {
            // Previously skipped when getText threw, leaking the document.
            pdDocument.close();
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #4
Source File: PdfDocument.java    From olat with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the text content of the PDF stored in the given leaf.
 *
 * <p>An encrypted document is decrypted with an empty password before extraction.
 * Cleanup is chained so that a failure while closing the document does not skip
 * closing the stream or clearing the font resources.
 *
 * @param leaf the file to read the PDF from
 * @return the text extracted from the document
 * @throws IOException if loading, extracting or closing fails
 * @throws DocumentAccessException if the document is encrypted and decrypting it
 *         with an empty password fails
 */
private String extractText(final VFSLeaf leaf) throws IOException, DocumentAccessException {
    if (log.isDebugEnabled()) {
        log.debug("readContent from pdf starts...");
    }
    PDDocument document = null;
    InputStream is = null;
    try {
        is = leaf.getInputStream();
        document = PDDocument.load(is);
        if (document.isEncrypted()) {
            try {
                document.decrypt("");
            } catch (final Exception e) {
                throw new DocumentAccessException("PDF is encrypted. Can not read content file=" + leaf.getName());
            }
        }
        if (log.isDebugEnabled()) {
            log.debug("readContent PDDocument loaded");
        }
        final PDFTextStripper stripper = new PDFTextStripper();
        return stripper.getText(document);
    } finally {
        // Each cleanup step sits in its own finally so an exception from one
        // close() cannot prevent the steps after it from running.
        try {
            if (document != null) {
                document.close();
            }
        } finally {
            try {
                if (is != null) {
                    is.close();
                }
            } finally {
                // needed to prevent potential OutOfMemoryError
                // https://issues.apache.org/jira/browse/PDFBOX-1009
                PDFont.clearResources();
            }
        }
    }
}
 
Example #5
Source File: PDFIndexerTest.java    From carbon-apimgt with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the media type reported by the indexer: first with none set on
 * {@code file2Index}, then with an explicit one while the second parsed
 * document throws on {@code close()}.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    final String mediaTypeField = "mediaType";
    String customMediaType = "application/pdf+test";

    // The second document handed out by the parser fails when it is closed.
    COSDocument failingOnClose = Mockito.mock(COSDocument.class);
    Mockito.doThrow(IOException.class).when(failingOnClose).close();
    PDFParser parser = Mockito.mock(PDFParser.class);
    Mockito.when(parser.getDocument()).thenReturn(new COSDocument()).thenReturn(failingOnClose);

    PDFTextStripper pdfTextStripper = Mockito.mock(PDFTextStripper.class);
    Mockito.when(pdfTextStripper.getText(new PDDocument())).thenReturn("");

    PDFIndexer pdfIndexer = new PDFIndexerWrapper(parser, pdfTextStripper);

    // should return the default media type when media type is not defined in file2Index
    IndexDocument indexed = pdfIndexer.getIndexedDocument(file2Index);
    boolean defaultTypeReported = "application/pdf".equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!defaultTypeReported) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if error occurs in finally block
    file2Index.mediaType = customMediaType;
    indexed = pdfIndexer.getIndexedDocument(file2Index);
    boolean customTypeReported = customMediaType.equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!customTypeReported) {
        Assert.fail();
    }
}
 
Example #6
Source File: PdfAssertions.java    From batchers with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the text of the {@code actual} document and closes it afterwards.
 *
 * @return the extracted text
 * @throws RuntimeException wrapping the {@link IOException} if extraction fails
 */
private String getPdfText() {
    String extracted;
    try {
        PDFTextStripper stripper = new PDFTextStripper();
        extracted = stripper.getText(actual);
    } catch (IOException extractionFailure) {
        throw new RuntimeException(extractionFailure);
    } finally {
        try {
            actual.close();
        } catch (IOException ignored) {
            // a failed close is deliberately ignored
        }
    }
    return extracted;
}
 
Example #7
Source File: TxtCreator.java    From pdf-converter with Apache License 2.0 4 votes vote down vote up
/**
 * Writes the text of every page of {@code pdf} to {@code output}, one page at a time.
 *
 * <p>A temporary scratch file is passed to the PDF loader. Each resource is released
 * in its own {@code finally}, so a failure while opening or closing one of them
 * cannot leak the others or leave the temporary file behind.
 *
 * <p>Any {@link IOException} is logged at warn level and not propagated.
 *
 * @param pdf    the PDF file to read
 * @param output the text file to write
 */
public void process(File pdf, File output){
    try {
        File tmpfile = File.createTempFile(String.format("txttmp-%s", UUID.randomUUID().toString()), null);
        try {
            RandomAccessFile raf = new RandomAccessFile(tmpfile, "rw");
            try {
                PDDocument pdDoc = PDDocument.loadNonSeq(pdf, raf);
                try {
                    FileWriter writer = new FileWriter(output);
                    try {
                        PDFTextStripper stripper = new PDFTextStripper();
                        int numberOfPages = pdDoc.getNumberOfPages();

                        // Pages are 1-based; extract and flush one page per iteration.
                        for (int j = 1; j < numberOfPages+1; j++) {
                            stripper.setStartPage(j);
                            stripper.setEndPage(j);
                            writer.write(stripper.getText(pdDoc));
                            writer.flush();
                        }
                    } finally {
                        writer.close();
                    }
                } finally {
                    pdDoc.close();
                }
            } finally {
                // Closed after the document, as in the original ordering.
                raf.close();
            }
        } finally {
            // Runs after raf is closed so the scratch file can be removed.
            tmpfile.delete();
        }
    } catch (IOException ioe) {
        log.warn(String.format("Failed to create txt for file: %s", pdf.getName()), ioe);
    }
}
 
Example #8
Source File: PDFIndexer.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a new text stripper.
 *
 * @return a freshly constructed {@code PDFTextStripper}
 * @throws IOException if the stripper cannot be constructed
 */
protected PDFTextStripper getPdfTextStripper() throws IOException {
    final PDFTextStripper stripper = new PDFTextStripper();
    return stripper;
}
 
Example #9
Source File: PDFIndexerWrapper.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a wrapper that keeps the supplied parser and text stripper.
 *
 * @param pdfParser the parser to store
 * @param stripper  the text stripper to store
 */
public PDFIndexerWrapper(PDFParser pdfParser, PDFTextStripper stripper) {
    this.pdfTextStripper = stripper;
    this.pdfParser = pdfParser;
}
 
Example #10
Source File: PDFIndexerWrapper.java    From carbon-apimgt with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the stripper held in the {@code pdfTextStripper} field.
 *
 * @return the stored text stripper
 */
@Override
protected PDFTextStripper getPdfTextStripper() throws IOException {
    return pdfTextStripper;
}