org.apache.pdfbox.util.PDFTextStripper Java Examples
The following examples show how to use
org.apache.pdfbox.util.PDFTextStripper.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PDFReader.java From swcv with MIT License | 6 votes |
/**
 * Opens a connection to {@code url}, parses the response as a PDF and stores the
 * extracted text in the {@code text} field.
 *
 * <p>The input stream and the document are released even when parsing or text
 * extraction fails part-way through.
 *
 * @param url address of the PDF document to read
 * @return {@code true} if the text was extracted and the document closed cleanly,
 *         {@code false} if any step threw an exception
 */
private boolean getFile(String url) {
    InputStream in = null;
    PDDocument pdoc = null;
    try {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        in = con.getInputStream();
        PDFParser p = new PDFParser(in);
        p.parse();
        pdoc = new PDDocument(p.getDocument());
        PDFTextStripper pts = new PDFTextStripper();
        text = pts.getText(pdoc);
        // Close on the success path inside the try so a failing close still
        // yields 'false', exactly as before.
        PDDocument closing = pdoc;
        pdoc = null;
        closing.close();
        return true;
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    } finally {
        // Only reached with a non-null document when an earlier step failed.
        if (pdoc != null) {
            try {
                pdoc.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
Example #2
Source File: PDFbox.java From wandora with GNU General Public License v3.0 | 6 votes |
/**
 * Loads the PDF addressed by {@code url} and returns its text content.
 *
 * <p>Addresses starting with {@code file:} are loaded from the local file system;
 * anything else is loaded through {@link URL}.
 *
 * @param url address of the PDF document
 * @return the extracted text, or {@code null} if loading, extraction or closing failed
 */
public static String extractTextOutOfPDF(String url) {
    PDDocument doc = null;
    try {
        if (url.startsWith("file:")) {
            // A "file:" URL string is not a file system path; convert it through
            // URI so that File resolves the actual location.
            doc = PDDocument.load(new File(new URL(url).toURI()));
        } else {
            doc = PDDocument.load(new URL(url));
        }
        PDFTextStripper stripper = new PDFTextStripper();
        String content = stripper.getText(doc);
        // Close on the success path inside the try so a failing close still
        // results in null, exactly as before.
        PDDocument closing = doc;
        doc = null;
        closing.close();
        return content;
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Only reached with a non-null document when extraction failed.
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
    return null;
}
Example #3
Source File: Chapter8.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/**
 * Loads {@code TestDocument.pdf} from the working directory and prints its text
 * to standard output. Any {@link IOException} is printed as a stack trace.
 */
private static void usingPDFBox() {
    try {
        File file = new File("TestDocument.pdf");
        PDDocument pdDocument = PDDocument.load(file);
        try {
            PDFTextStripper stripper = new PDFTextStripper();
            String text = stripper.getText(pdDocument);
            System.out.println(text);
        } finally {
            // Release the document even when text extraction fails.
            pdDocument.close();
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #4
Source File: PdfDocument.java From olat with Apache License 2.0 | 5 votes |
private String extractText(final VFSLeaf leaf) throws IOException, DocumentAccessException { if (log.isDebugEnabled()) { log.debug("readContent from pdf starts..."); } PDDocument document = null; InputStream is = null; try { is = leaf.getInputStream(); document = PDDocument.load(is); if (document.isEncrypted()) { try { document.decrypt(""); } catch (final Exception e) { throw new DocumentAccessException("PDF is encrypted. Can not read content file=" + leaf.getName()); } } if (log.isDebugEnabled()) { log.debug("readContent PDDocument loaded"); } final PDFTextStripper stripper = new PDFTextStripper(); return stripper.getText(document); } finally { if (document != null) { document.close(); } if (is != null) { is.close(); } // needed to prevent potential OutOfMemoryError // https://issues.apache.org/jira/browse/PDFBOX-1009 PDFont.clearResources(); } }
Example #5
Source File: PDFIndexerTest.java From carbon-apimgt with Apache License 2.0 | 5 votes |
@Test public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException { String mediaType = "application/pdf+test"; final String MEDIA_TYPE = "mediaType"; PDFParser parser = Mockito.mock(PDFParser.class); COSDocument cosDoc = Mockito.mock(COSDocument.class); PDFTextStripper pdfTextStripper = Mockito.mock(PDFTextStripper.class); Mockito.doThrow(IOException.class).when(cosDoc).close(); Mockito.when(parser.getDocument()).thenReturn(new COSDocument()).thenReturn(cosDoc); Mockito.when(pdfTextStripper.getText(new PDDocument())).thenReturn(""); PDFIndexer pdfIndexer = new PDFIndexerWrapper(parser, pdfTextStripper); // should return the default media type when media type is not defined in file2Index IndexDocument pdf = pdfIndexer.getIndexedDocument(file2Index); if (!"application/pdf".equals(pdf.getFields().get(MEDIA_TYPE).get(0))) { Assert.fail(); } // should return the media type we have set in the file2Index even if error occurs in finally block file2Index.mediaType = mediaType; pdf = pdfIndexer.getIndexedDocument(file2Index); if (!mediaType.equals(pdf.getFields().get(MEDIA_TYPE).get(0))) { Assert.fail(); } }
Example #6
Source File: PdfAssertions.java From batchers with Apache License 2.0 | 5 votes |
private String getPdfText() { try { return new PDFTextStripper().getText(actual); } catch (IOException shouldNotHappen) { throw new RuntimeException(shouldNotHappen); } finally { try { actual.close(); } catch (IOException e) { // we don't care } } }
Example #7
Source File: TxtCreator.java From pdf-converter with Apache License 2.0 | 4 votes |
public void process(File pdf, File output){ PDDocument pdDoc; try {//Kudos for closing: http://stackoverflow.com/questions/156508/closing-a-java-fileinputstream File tmpfile = File.createTempFile(String.format("txttmp-%s", UUID.randomUUID().toString()), null); RandomAccessFile raf = new RandomAccessFile(tmpfile, "rw"); pdDoc = PDDocument.loadNonSeq(pdf, raf); FileWriter writer = new FileWriter(output); try { PDFTextStripper stripper = new PDFTextStripper(); int numberOfPages = pdDoc.getNumberOfPages(); for (int j = 1; j < numberOfPages+1; j++) { stripper.setStartPage(j); stripper.setEndPage(j); writer.write(stripper.getText(pdDoc)); writer.flush(); } } finally { pdDoc.close(); raf.close(); tmpfile.delete(); writer.close(); } } catch (IOException ioe) { log.warn(String.format("Failed to create txt for file: %s", pdf.getName()), ioe); } }
Example #8
Source File: PDFIndexer.java From carbon-apimgt with Apache License 2.0 | 4 votes |
/**
 * Creates the text stripper to use.
 *
 * @return a newly constructed {@code PDFTextStripper}
 * @throws IOException if the stripper cannot be constructed
 */
protected PDFTextStripper getPdfTextStripper() throws IOException {
    PDFTextStripper stripper = new PDFTextStripper();
    return stripper;
}
Example #9
Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0 | 4 votes |
/**
 * Stores the given parser and stripper in this wrapper's fields.
 *
 * @param pdfParser the parser to store
 * @param stripper  the text stripper to store
 */
public PDFIndexerWrapper(PDFParser pdfParser, PDFTextStripper stripper) {
    this.pdfTextStripper = stripper;
    this.pdfParser = pdfParser;
}
Example #10
Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0 | 4 votes |
/**
 * Returns the stripper held in the {@code pdfTextStripper} field.
 *
 * @return the value of the {@code pdfTextStripper} field
 */
@Override
protected PDFTextStripper getPdfTextStripper() throws IOException {
    return pdfTextStripper;
}