org.apache.pdfbox.pdfparser.PDFParser Java Examples

The following examples show how to use org.apache.pdfbox.pdfparser.PDFParser. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: PdfBleachSession.java From DocBleach with MIT License

6 votes

/**
 * Tries to parse {@code source} as a PDF using the given password.
 *
 * <p>Whatever the outcome, {@code source} is repositioned to its start before this method
 * returns, so the same source can be tested again with another password.
 *
 * @param inFile scratch file handed to the parser
 * @param source random-access view of the PDF to parse
 * @param password password to try
 * @return the parsed document, or {@code null} when the password is rejected
 * @throws IOException if parsing fails for any reason other than an invalid password, or if
 *     the source cannot be repositioned
 */
@SuppressFBWarnings(
    value = "EXS_EXCEPTION_SOFTENING_RETURN_FALSE",
    justification = "This method is an helper to check the password")
private PDDocument testPassword(ScratchFile inFile, RandomAccessRead source, String password)
    throws IOException {
  PDFParser parser = new PDFParser(source, password, inFile);
  try {
    parser.parse();
    return parser.getPDDocument();
  } catch (InvalidPasswordException e) {
    LOGGER.error("The tested password is invalid");
    return null;
  } finally {
    // Seek to the absolute start rather than rewinding by "(int) position": the narrowing
    // cast truncates offsets beyond Integer.MAX_VALUE and would rewind by the wrong amount.
    source.seek(0);
  }
}

Example #2

Source File: PDDocument.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Parses a PDF from an already opened random-access file source.
 *
 * @param raFile random-access source containing the document
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting memory settings used to create the scratch file
 *
 * @return loaded document
 *
 * @throws IOException In case of a reading or parsing error.
 */
private static PDDocument load(RandomAccessBufferedFileInputStream raFile, String password,
                               InputStream keyStore, String alias,
                               MemoryUsageSetting memUsageSetting) throws IOException
{
    final ScratchFile scratch = new ScratchFile(memUsageSetting);
    try
    {
        final PDFParser pdfParser = new PDFParser(raFile, password, keyStore, alias, scratch);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    }
    catch (IOException parseFailure)
    {
        // Parsing failed, so no document is returned: release the scratch file here.
        IOUtils.closeQuietly(scratch);
        throw parseFailure;
    }
}

Example #3

Source File: PDDocument.java From gcs with Mozilla Public License 2.0

6 votes

/**
 * Loads a PDF from an input stream. The stream content is first buffered through a scratch
 * file created from the given memory settings so that the parser gets random access to it;
 * depending on those settings the data is held in memory or in a temporary file.
 *
 * @param input stream that contains the document. The caller remains responsible for closing it.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams
 *
 * @return loaded document
 *
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(InputStream input, String password, InputStream keyStore, 
                              String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    final ScratchFile scratch = new ScratchFile(memUsageSetting);
    try
    {
        final RandomAccessRead buffered = scratch.createBuffer(input);
        final PDFParser pdfParser = new PDFParser(buffered, password, keyStore, alias, scratch);
        pdfParser.parse();
        return pdfParser.getPDDocument();
    }
    catch (IOException loadFailure)
    {
        // Buffering or parsing failed: release the scratch file before propagating.
        IOUtils.closeQuietly(scratch);
        throw loadFailure;
    }
}

Example #4

Source File: PDFReader.java From swcv with MIT License

6 votes

/**
 * Downloads the PDF at the given URL, extracts its text and stores it in the {@code text}
 * field.
 *
 * @param url location of the PDF document
 * @return {@code true} when the text was extracted; {@code false} when any step failed
 *         (the failure is printed to standard error)
 */
private boolean getFile(String url)
{
    try
    {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        // try-with-resources guarantees the connection stream is released on every path
        try (InputStream in = con.getInputStream())
        {
            PDFParser p = new PDFParser(in);
            p.parse();
            PDDocument pdoc = new PDDocument(p.getDocument());
            try
            {
                PDFTextStripper pts = new PDFTextStripper();
                text = pts.getText(pdoc);
            }
            finally
            {
                // close the document even when text extraction throws
                pdoc.close();
            }
        }

        return true;
    }
    catch (Exception e)
    {
        e.printStackTrace();
        return false;
    }
}

Example #5

Source File: ExtractTextTools.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Extracts the text of a PDF held in memory, from page 1 through the last page.
 *
 * @param bytes raw content of the PDF document
 * @return the extracted text, or {@code null} when parsing or extraction fails (the
 *         exception is logged)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument document = new PDDocument(cosDocument)) {
			PDFTextStripper textStripper = new PDFTextStripper();
			textStripper.setStartPage(1);
			textStripper.setEndPage(document.getNumberOfPages());
			return textStripper.getText(document);
		}
	} catch (Exception e) {
		logger.error(e);
		return null;
	}
}

Example #6

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Reads all text from an in-memory PDF.
 *
 * @param bytes the PDF file content
 * @return text of pages 1 to N, or {@code null} if any exception occurs (it is logged)
 */
public static String pdf(byte[] bytes) {
	try {
		RandomAccessBuffer buffer = new RandomAccessBuffer(bytes);
		PDFParser pdfParser = new PDFParser(buffer);
		pdfParser.parse();
		try (COSDocument cosDoc = pdfParser.getDocument();
				PDDocument pdDoc = new PDDocument(cosDoc)) {
			PDFTextStripper textStripper = new PDFTextStripper();
			textStripper.setStartPage(1);
			int lastPage = pdDoc.getNumberOfPages();
			textStripper.setEndPage(lastPage);
			return textStripper.getText(pdDoc);
		}
	} catch (Exception e) {
		logger.error(e);
		return null;
	}
}

Example #7

Source File: ExtractTextHelper.java From o2oa with GNU Affero General Public License v3.0

5 votes

/**
 * Converts the bytes of a PDF document into its plain text.
 *
 * @param bytes content of the PDF
 * @return the text from the first to the last page; {@code null} when an exception is
 *         raised while parsing or stripping (the exception is logged)
 */
public static String pdf(byte[] bytes) {
	String extracted = null;
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();
		try (COSDocument lowLevelDoc = pdfParser.getDocument();
				PDDocument highLevelDoc = new PDDocument(lowLevelDoc)) {
			PDFTextStripper textStripper = new PDFTextStripper();
			textStripper.setStartPage(1);
			textStripper.setEndPage(highLevelDoc.getNumberOfPages());
			extracted = textStripper.getText(highLevelDoc);
		}
	} catch (Exception e) {
		logger.error(e);
		// discard any text obtained before a failure while closing, as the original did
		extracted = null;
	}
	return extracted;
}

Example #8

Source File: SignatureOptions.java From gcs with Mozilla Public License 2.0

5 votes

/**
 * Remembers the given source in {@code pdfSource}, parses it, and stores the resulting
 * document in {@code visualSignature}.
 *
 * @param rar random-access source to parse
 * @throws IOException if parsing fails
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
    pdfSource = rar;

    PDFParser signatureParser = new PDFParser(pdfSource);
    signatureParser.parse();

    visualSignature = signatureParser.getDocument();
}

Example #9

Source File: PDFIndexer.java From carbon-apimgt with Apache License 2.0

5 votes

/**
 * Creates a parser over the in-memory content of the given file.
 *
 * @param fileData file whose {@code data} bytes are parsed
 * @return a new, not yet run, parser reading from those bytes
 * @throws IOException if the parser cannot be created
 */
protected PDFParser getPdfParser(File2Index fileData) throws IOException {
	ByteArrayInputStream content = new ByteArrayInputStream(fileData.data);
	return new PDFParser(content);
}

Example #10

Source File: PDFIndexerTest.java From carbon-apimgt with Apache License 2.0

5 votes

/**
 * Checks the media type reported by the indexer: the default one when {@code file2Index}
 * does not define any, and the configured one even when closing the COS document throws.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    final String mediaTypeField = "mediaType";
    final String customMediaType = "application/pdf+test";

    PDFParser mockParser = Mockito.mock(PDFParser.class);
    COSDocument failingCosDoc = Mockito.mock(COSDocument.class);
    PDFTextStripper mockStripper = Mockito.mock(PDFTextStripper.class);
    // the second document handed out by the parser fails on close()
    Mockito.doThrow(IOException.class).when(failingCosDoc).close();
    Mockito.when(mockParser.getDocument()).thenReturn(new COSDocument()).thenReturn(failingCosDoc);
    Mockito.when(mockStripper.getText(new PDDocument())).thenReturn("");
    PDFIndexer indexer = new PDFIndexerWrapper(mockParser, mockStripper);

    // should return the default media type when media type is not defined in file2Index
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    boolean defaultTypeReported =
            "application/pdf".equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!defaultTypeReported) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if error occurs in finally block
    file2Index.mediaType = customMediaType;
    indexed = indexer.getIndexedDocument(file2Index);
    boolean customTypeReported =
            customMediaType.equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!customTypeReported) {
        Assert.fail();
    }

}

Example #11

Source File: ShrinkPDF.java From shrink-pdf with MIT License

5 votes

/**
 * Shrink a PDF.
 * <p>
 * Parses the file referenced by the {@code input} field and passes the resources of every
 * page to {@code scanResources} together with an image writer. The compression quality is
 * taken from the {@code compQual} field (0 is smallest file, 1 is highest quality); a
 * negative value is replaced by {@code compQualDefault}.
 * <p>
 * If anything fails after the input has been opened, the parsed document (if any) and the
 * input source are closed before the exception propagates, so no file handle is leaked.
 *
 * @return The compressed {@code PDDocument}; the caller is responsible for closing it
 * @throws FileNotFoundException if the input file cannot be opened
 * @throws IOException if the PDF cannot be read or parsed
 */
private PDDocument shrinkMe() 
        throws FileNotFoundException, IOException {
     if(compQual < 0)
         compQual = compQualDefault;
     final RandomAccessBufferedFileInputStream rabfis = 
             new RandomAccessBufferedFileInputStream(input);
     PDDocument doc = null;
     boolean succeeded = false;
     try {
         final PDFParser parser = new PDFParser(rabfis);
         parser.parse();
         doc = parser.getPDDocument();
         final PDPageTree pages = doc.getPages();
         final ImageWriter imgWriter;
         final ImageWriteParam iwp;
         if(tiff) {
             // NOTE(review): the "tiff" mode looks up a writer by the "png" suffix;
             // kept as in the original - confirm this is intentional.
             final Iterator<ImageWriter> tiffWriters =
                   ImageIO.getImageWritersBySuffix("png");
             imgWriter = tiffWriters.next();
             iwp = imgWriter.getDefaultWriteParam();
             //iwp.setCompressionMode(ImageWriteParam.MODE_DISABLED);
         } else {
             final Iterator<ImageWriter> jpgWriters = 
                   ImageIO.getImageWritersByFormatName("jpeg");
             imgWriter = jpgWriters.next();
             iwp = imgWriter.getDefaultWriteParam();
             iwp.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
             iwp.setCompressionQuality(compQual);
         }
         for(PDPage p : pages) {
              scanResources(p.getResources(), doc, imgWriter, iwp);
         }
         succeeded = true;
         return doc;
     } finally {
         if(!succeeded) {
             // Best-effort cleanup: the exception already in flight is the one to report,
             // so failures while closing are deliberately ignored.
             if(doc != null) {
                 try {
                     doc.close();
                 } catch(IOException ignored) {
                     // ignored, see above
                 }
             }
             try {
                 rabfis.close();
             } catch(IOException ignored) {
                 // ignored, see above
             }
         }
     }
}

Example #12

Source File: PDF2TextExample.java From tutorials with MIT License

5 votes

/**
 * Extracts the text of the given PDF file and writes it to {@code src/output/pdf.txt}.
 *
 * <p>The file source, both document objects and the output writer are closed on every
 * path, including when parsing or text extraction throws.
 *
 * @param filename path of the PDF file to read
 * @throws IOException if the PDF cannot be read or parsed, or the output file cannot be
 *         created
 */
private static void generateTxtFromPDF(String filename) throws IOException {
	File f = new File(filename);
	String parsedText;
	try (RandomAccessFile source = new RandomAccessFile(f, "r")) {
		PDFParser parser = new PDFParser(source);
		parser.parse();

		// Resources are closed in reverse order: the PDDocument first, then the COSDocument.
		try (COSDocument cosDoc = parser.getDocument();
				PDDocument pdDoc = new PDDocument(cosDoc)) {
			PDFTextStripper pdfStripper = new PDFTextStripper();
			parsedText = pdfStripper.getText(pdDoc);
		}
	}

	try (PrintWriter pw = new PrintWriter("src/output/pdf.txt")) {
		pw.print(parsedText);
	}
}

Example #13

Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0

4 votes

/**
 * Creates a wrapper that uses the supplied parser and text stripper.
 *
 * @param pdfParser parser stored in this wrapper
 * @param stripper text stripper stored in this wrapper
 */
public PDFIndexerWrapper(PDFParser pdfParser, PDFTextStripper stripper) {
    this.pdfTextStripper = stripper;
    this.pdfParser = pdfParser;
}

Example #14

Source File: PDFIndexerWrapper.java From carbon-apimgt with Apache License 2.0

4 votes

/**
 * Returns the parser held by this wrapper; the given file data is not used.
 *
 * @param fileData ignored
 * @return the stored parser
 */
@Override
protected PDFParser getPdfParser(AsyncIndexer.File2Index fileData) throws IOException {
    return this.pdfParser;
}

Example #15

Source File: PDDocument.java From gcs with Mozilla Public License 2.0

3 votes

/**
 * Parses a PDF.
 * 
 * @param input byte array that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security 
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams 
 * 
 * @return loaded document
 * 
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(byte[] input, String password, InputStream keyStore, 
        String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    try
    {
        RandomAccessRead source = new RandomAccessBuffer(input);
        PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
        parser.parse();
        return parser.getPDDocument();
    }
    catch (IOException ioe)
    {
        // No document is returned on failure, so the scratch file must be released here,
        // in the same way as the other load(...) overloads do.
        IOUtils.closeQuietly(scratchFile);
        throw ioe;
    }
}