Java Code Examples for org.apache.pdfbox.pdfparser.PDFParser

The following examples show how to use org.apache.pdfbox.pdfparser.PDFParser. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out the right sidebar, which shows the related API usage.
Example 1
Source Project: DocBleach   Source File: PdfBleachSession.java    License: MIT License 6 votes vote down vote up
/**
 * Tries to parse the given source with the supplied password.
 *
 * <p>Whatever the outcome, the source is repositioned to its beginning afterwards.
 *
 * @param inFile scratch file handed to the parser
 * @param source random-access view of the PDF to parse
 * @param password password to try
 * @return the parsed document, or {@code null} if the parser rejected the password
 * @throws IOException if parsing fails for a reason other than an invalid password, or if
 *     the source cannot be repositioned
 */
@SuppressFBWarnings(
    value = "EXS_EXCEPTION_SOFTENING_RETURN_FALSE",
    justification = "This method is an helper to check the password")
private PDDocument testPassword(ScratchFile inFile, RandomAccessRead source, String password)
    throws IOException {
  PDFParser parser = new PDFParser(source, password, inFile);
  try {
    parser.parse();
    return parser.getPDDocument();
  } catch (InvalidPasswordException e) {
    LOGGER.error("The tested password is invalid");
    return null;
  } finally {
    // Seek to the absolute start rather than rewinding by "(int) position": the int cast
    // truncates offsets beyond Integer.MAX_VALUE and would leave the source misplaced.
    source.seek(0);
  }
}
 
Example 2
Source Project: gcs   Source File: PDDocument.java    License: Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Parses the PDF exposed by {@code raFile} and returns the resulting document.
 *
 * A scratch file is created from {@code memUsageSetting} and handed to the parser. If an
 * {@link IOException} is raised, the scratch file is closed quietly before the exception
 * is rethrown.
 */
private static PDDocument load(RandomAccessBufferedFileInputStream raFile, String password,
                               InputStream keyStore, String alias,
                               MemoryUsageSetting memUsageSetting) throws IOException
{
    final ScratchFile scratch = new ScratchFile(memUsageSetting);
    try
    {
        final PDFParser pdfParser =
                new PDFParser(raFile, password, keyStore, alias, scratch);
        pdfParser.parse();
        final PDDocument document = pdfParser.getPDDocument();
        return document;
    }
    catch (IOException failure)
    {
        // Nobody else will release the scratch file once loading has failed.
        IOUtils.closeQuietly(scratch);
        throw failure;
    }
}
 
Example 3
Source Project: gcs   Source File: PDDocument.java    License: Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Loads a PDF from an input stream. Depending on the memory settings parameter, the stream
 * content is buffered either in memory or in a temporary file so that the parser gets
 * random access to the PDF.
 *
 * @param input stream that contains the document; closing it after loading is the
 *              caller's responsibility
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams
 *
 * @return loaded document
 *
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(InputStream input, String password, InputStream keyStore, 
                              String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    final ScratchFile scratch = new ScratchFile(memUsageSetting);
    try
    {
        final RandomAccessRead buffered = scratch.createBuffer(input);
        final PDFParser pdfParser =
                new PDFParser(buffered, password, keyStore, alias, scratch);
        pdfParser.parse();
        final PDDocument document = pdfParser.getPDDocument();
        return document;
    }
    catch (IOException failure)
    {
        // Release the scratch file before propagating the failure.
        IOUtils.closeQuietly(scratch);
        throw failure;
    }
}
 
Example 4
Source Project: carbon-apimgt   Source File: PDFIndexerTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks the media type reported by the indexed document: first the default value, then a
 * value set on {@code file2Index}, with the second mocked COS document throwing on close.
 */
@Test
public void testShouldReturnIndexedDocumentWhenParameterCorrect() throws IOException {
    final String customMediaType = "application/pdf+test";
    final String mediaTypeField = "mediaType";

    // Mocked collaborators; the second COSDocument handed out by the parser fails on close().
    PDFParser parserMock = Mockito.mock(PDFParser.class);
    COSDocument failingCosDoc = Mockito.mock(COSDocument.class);
    PDFTextStripper stripperMock = Mockito.mock(PDFTextStripper.class);
    Mockito.doThrow(IOException.class).when(failingCosDoc).close();
    Mockito.when(parserMock.getDocument())
            .thenReturn(new COSDocument())
            .thenReturn(failingCosDoc);
    Mockito.when(stripperMock.getText(new PDDocument())).thenReturn("");
    PDFIndexer indexer = new PDFIndexerWrapper(parserMock, stripperMock);

    // should return the default media type when media type is not defined in file2Index
    IndexDocument indexed = indexer.getIndexedDocument(file2Index);
    boolean defaultTypeReported =
            "application/pdf".equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!defaultTypeReported) {
        Assert.fail();
    }

    // should return the media type we have set in the file2Index even if error occurs in finally block
    file2Index.mediaType = customMediaType;
    indexed = indexer.getIndexedDocument(file2Index);
    boolean customTypeReported =
            customMediaType.equals(indexed.getFields().get(mediaTypeField).get(0));
    if (!customTypeReported) {
        Assert.fail();
    }

}
 
Example 5
Source Project: swcv   Source File: PDFReader.java    License: MIT License 6 votes vote down vote up
/**
 * Fetches the PDF at the given URL, extracts its text and stores it in {@code text}.
 *
 * The connection stream and the parsed document are always closed, including when
 * parsing or text extraction fails.
 *
 * @param url location of the PDF to read
 * @return {@code true} if the text was extracted and all resources closed cleanly;
 *         {@code false} if any exception occurred (its stack trace is printed)
 */
private boolean getFile(String url)
{
    try
    {
        URL u = new URL(url);
        URLConnection con = u.openConnection();
        InputStream in = con.getInputStream();
        try
        {
            PDFParser p = new PDFParser(in);
            p.parse();
            PDDocument pdoc = new PDDocument(p.getDocument());
            try
            {
                PDFTextStripper pts = new PDFTextStripper();
                text = pts.getText(pdoc);
            }
            finally
            {
                // Close the document even if text extraction throws.
                pdoc.close();
            }
        }
        finally
        {
            // The original never closed the connection stream.
            in.close();
        }

        return true;
    }
    catch (Exception e)
    {
        // Best-effort contract kept from the original: report and signal failure.
        e.printStackTrace();
        return false;
    }
}
 
Example 6
Source Project: tutorials   Source File: PDF2TextExample.java    License: MIT License 6 votes vote down vote up
/**
 * Extracts the text of the given PDF file and writes it to {@code src/output/pdf.txt}.
 *
 * The file handle, both document objects and the writer are closed on every path,
 * including when parsing or text extraction throws.
 *
 * @param filename path of the PDF file to read
 * @throws IOException if the PDF cannot be read or parsed, or the output file cannot be opened
 */
private static void generateTxtFromPDF(String filename) throws IOException {
	File f = new File(filename);
	String parsedText;

	// The parser reads from this handle; it is closed last, after the documents built on it.
	try (RandomAccessFile source = new RandomAccessFile(f, "r")) {
		PDFParser parser = new PDFParser(source);
		parser.parse();

		// Resources close in reverse order: the PDDocument first, then the COSDocument.
		try (COSDocument cosDoc = parser.getDocument();
				PDDocument pdDoc = new PDDocument(cosDoc)) {
			PDFTextStripper pdfStripper = new PDFTextStripper();
			parsedText = pdfStripper.getText(pdDoc);
		}
	}

	try (PrintWriter pw = new PrintWriter("src/output/pdf.txt")) {
		pw.print(parsedText);
	}
}
 
Example 7
Source Project: o2oa   Source File: ExtractTextTools.java    License: GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Extracts the text of all pages of a PDF held in memory.
 *
 * @param bytes raw content of the PDF
 * @return the extracted text, or {@code null} if any exception occurred (it is logged)
 */
public static String pdf(byte[] bytes) {
	try {
		RandomAccessBuffer buffer = new RandomAccessBuffer(bytes);
		PDFParser pdfParser = new PDFParser(buffer);
		pdfParser.parse();
		try (COSDocument cosDocument = pdfParser.getDocument();
				PDDocument document = new PDDocument(cosDocument)) {
			PDFTextStripper textStripper = new PDFTextStripper();
			// Cover the whole document: first page through the last one.
			textStripper.setStartPage(1);
			textStripper.setEndPage(document.getNumberOfPages());
			String extracted = textStripper.getText(document);
			return extracted;
		}
	} catch (Exception e) {
		logger.error(e);
	}
	return null;
}
 
Example 8
/**
 * Parses the given PDF bytes and returns the text of every page.
 *
 * @param bytes the PDF content
 * @return the document text; {@code null} when parsing or extraction throws (the
 *         exception is passed to the logger)
 */
public static String pdf(byte[] bytes) {
	try {
		PDFParser pdfParser = new PDFParser(new RandomAccessBuffer(bytes));
		pdfParser.parse();
		try (COSDocument lowLevelDoc = pdfParser.getDocument();
				PDDocument pdfDoc = new PDDocument(lowLevelDoc)) {
			PDFTextStripper extractor = new PDFTextStripper();
			extractor.setStartPage(1);
			int lastPage = pdfDoc.getNumberOfPages();
			extractor.setEndPage(lastPage);
			return extractor.getText(pdfDoc);
		}
	} catch (Exception e) {
		logger.error(e);
	}
	return null;
}
 
Example 9
/**
 * Returns the textual content of an in-memory PDF, from page 1 up to the page count
 * reported by the document.
 *
 * @param bytes bytes of the PDF
 * @return extracted text, or {@code null} after logging if an exception was raised
 */
public static String pdf(byte[] bytes) {
	try {
		RandomAccessBuffer pdfBytes = new RandomAccessBuffer(bytes);
		PDFParser reader = new PDFParser(pdfBytes);
		reader.parse();
		try (COSDocument cosDoc = reader.getDocument();
				PDDocument pdDoc = new PDDocument(cosDoc)) {
			PDFTextStripper pageText = new PDFTextStripper();
			pageText.setStartPage(1);
			pageText.setEndPage(pdDoc.getNumberOfPages());
			String content = pageText.getText(pdDoc);
			return content;
		}
	} catch (Exception e) {
		logger.error(e);
	}
	return null;
}
 
Example 10
Source Project: gcs   Source File: SignatureOptions.java    License: Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Stores the given source in {@code pdfSource}, parses it and keeps the parsed document
 * in {@code visualSignature}.
 *
 * @param rar source to read from
 * @throws IOException if parsing fails
 */
private void initFromRandomAccessRead(RandomAccessRead rar) throws IOException
{
    this.pdfSource = rar;
    final PDFParser signatureParser = new PDFParser(this.pdfSource);
    signatureParser.parse();
    this.visualSignature = signatureParser.getDocument();
}
 
Example 11
Source Project: shrink-pdf   Source File: ShrinkPDF.java    License: MIT License 5 votes vote down vote up
/**
 * Shrinks the PDF read from this instance's {@code input}.
 *
 * The method takes no parameters; it works on instance state. If {@code compQual} is
 * negative it is replaced by {@code compQualDefault}. When {@code tiff} is false, a JPEG
 * writer is configured with explicit compression at quality {@code compQual}. Every page's
 * resources are then passed to {@code scanResources}.
 *
 * If anything fails after the input has been opened, the document (or the raw input, when
 * no document exists yet) is closed before the exception propagates.
 *
 * @return The compressed {@code PDDocument}; the caller is responsible for closing it
 * @throws FileNotFoundException if the input file cannot be found
 * @throws IOException on read/parse errors, or if no matching image writer is available
 */
private PDDocument shrinkMe() 
        throws FileNotFoundException, IOException {
    if (compQual < 0) {
        compQual = compQualDefault;
    }
    final RandomAccessBufferedFileInputStream rabfis =
            new RandomAccessBufferedFileInputStream(input);
    PDDocument doc = null;
    try {
        final PDFParser parser = new PDFParser(rabfis);
        parser.parse();
        doc = parser.getPDDocument();
        final PDPageTree pages = doc.getPages();
        final ImageWriter imgWriter;
        final ImageWriteParam iwp;
        if (tiff) {
            // NOTE(review): the flag is named "tiff" but a writer for the "png" suffix is
            // requested, as in the original code - confirm this is intended.
            imgWriter = firstImageWriter(ImageIO.getImageWritersBySuffix("png"), "png");
            iwp = imgWriter.getDefaultWriteParam();
            //iwp.setCompressionMode(ImageWriteParam.MODE_DISABLED);
        } else {
            imgWriter = firstImageWriter(ImageIO.getImageWritersByFormatName("jpeg"), "jpeg");
            iwp = imgWriter.getDefaultWriteParam();
            iwp.setCompressionMode(ImageWriteParam.MODE_EXPLICIT);
            iwp.setCompressionQuality(compQual);
        }
        for (PDPage p : pages) {
            scanResources(p.getResources(), doc, imgWriter, iwp);
        }
        return doc;
    } catch (IOException | RuntimeException e) {
        // Do not leak the open file when shrinking fails part-way through.
        closeAfterFailure(doc != null ? doc : rabfis);
        throw e;
    }
}

/** Returns the first writer of the iterator, or fails with a descriptive IOException. */
private static ImageWriter firstImageWriter(Iterator<ImageWriter> writers, String format)
        throws IOException {
    if (!writers.hasNext()) {
        throw new IOException("No ImageIO writer available for format: " + format);
    }
    return writers.next();
}

/** Closes the resource, ignoring close errors so the original failure is the one reported. */
private static void closeAfterFailure(java.io.Closeable resource) {
    try {
        resource.close();
    } catch (IOException ignored) {
        // The exception that triggered the cleanup is more relevant than a close failure.
    }
}
 
Example 12
Source Project: carbon-apimgt   Source File: PDFIndexer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a parser that reads from the {@code data} of the given file.
 *
 * @param fileData file whose {@code data} is wrapped in a {@link ByteArrayInputStream}
 * @return a new parser over that stream
 * @throws IOException if the parser cannot be created
 */
protected PDFParser getPdfParser(File2Index fileData) throws IOException {
	ByteArrayInputStream pdfBytes = new ByteArrayInputStream(fileData.data);
	return new PDFParser(pdfBytes);
}
 
Example 13
Source Project: carbon-apimgt   Source File: PDFIndexerWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a wrapper that keeps the supplied parser and text stripper.
 *
 * @param pdfParser parser stored in {@code pdfParser}
 * @param stripper text stripper stored in {@code pdfTextStripper}
 */
public PDFIndexerWrapper(PDFParser pdfParser, PDFTextStripper stripper) {
    this.pdfTextStripper = stripper;
    this.pdfParser = pdfParser;
}
 
Example 14
Source Project: carbon-apimgt   Source File: PDFIndexerWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the parser held by this wrapper; {@code fileData} is not consulted.
 */
@Override
protected PDFParser getPdfParser(AsyncIndexer.File2Index fileData) throws IOException {
    return this.pdfParser;
}
 
Example 15
Source Project: gcs   Source File: PDDocument.java    License: Mozilla Public License 2.0 3 votes vote down vote up
/**
 * Parses a PDF.
 * 
 * If parsing fails with an {@link IOException}, the scratch file created for the load is
 * closed before the exception is rethrown, matching the stream-based {@code load} overloads.
 * 
 * @param input byte array that contains the document.
 * @param password password to be used for decryption
 * @param keyStore key store to be used for decryption when using public key security 
 * @param alias alias to be used for decryption when using public key security
 * @param memUsageSetting defines how memory is used for buffering input stream and PDF streams 
 * 
 * @return loaded document
 * 
 * @throws InvalidPasswordException If the password is incorrect.
 * @throws IOException In case of a reading or parsing error.
 */
public static PDDocument load(byte[] input, String password, InputStream keyStore, 
        String alias, MemoryUsageSetting memUsageSetting) throws IOException
{
    ScratchFile scratchFile = new ScratchFile(memUsageSetting);
    try
    {
        RandomAccessRead source = new RandomAccessBuffer(input);
        PDFParser parser = new PDFParser(source, password, keyStore, alias, scratchFile);
        parser.parse();
        return parser.getPDDocument();
    }
    catch (IOException ioe)
    {
        // Without this the scratch file would stay open after a failed load.
        IOUtils.closeQuietly(scratchFile);
        throw ioe;
    }
}