Python PyPDF2.PdfFileReader() Examples

The following are 30 code examples of PyPDF2.PdfFileReader(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module PyPDF2, or try the search function.

Example #1

Source File: uploader.py From calibre-web with GNU General Public License v3.0

9 votes

def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build a ``BookMeta`` record for an uploaded PDF file.

    When the module-level ``use_pdf_meta`` flag is truthy, the PDF's document
    information is read and supplies author, title and description.  When the
    flag is falsy, or the PDF yields no document information, the author is
    ``u'Unknown'``, the title is ``original_file_name`` and the description
    is an empty string.

    Args:
        tmp_file_path: Path of the uploaded PDF on disk.
        original_file_name: Name used as the title when the PDF has none.
        original_file_extension: Extension stored on the returned record.

    Returns:
        A ``BookMeta`` whose cover comes from ``pdf_preview`` and whose tags,
        series, series_id and languages are empty strings.
    """
    author = u'Unknown'
    title = original_file_name
    subject = ""

    if use_pdf_meta:
        # Read every field while the file is open, then close it
        # deterministically instead of leaking the handle.
        with open(tmp_file_path, 'rb') as pdf_file:
            doc_info = PdfFileReader(pdf_file).getDocumentInfo()
            if doc_info:
                author = doc_info.author or u'Unknown'
                title = doc_info.title or original_file_name
                subject = doc_info.subject

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags="",
        series="",
        series_id="",
        languages="")

Example #2

Source File: action.py From insightconnect-plugins with MIT License

8 votes

def run(self, params=None):
        """Extract the text of a base64-encoded PDF.

        Args:
            params: Mapping with a ``contents`` key holding the PDF as a
                base64 string.  ``None`` is treated as an empty mapping.

        Returns:
            ``{"output": text}`` where ``text`` is the extracted text of all
            pages concatenated, with newlines removed from each page.

        Raises:
            Exception: If ``contents`` is missing or empty.  Decoding and
                extraction errors are logged and re-raised unchanged.
        """
        import io

        params = params or {}
        contents = params.get('contents')
        if not contents:
            self.logger.error("File contents missing!")
            raise Exception("File contents missing!")

        try:
            pdfFile = base64.b64decode(contents)
        except Exception as e:
            self.logger.error("Could not decode file contents: %s", e)
            raise

        try:
            # Parse from memory: no shared "temp.pdf" in the working
            # directory, no read of a file whose write is still unflushed,
            # and no file handle left open.
            pdfReader = PyPDF2.PdfFileReader(io.BytesIO(pdfFile))
            pdftext = "".join(
                pdfReader.getPage(page).extractText().replace('\n', '')
                for page in range(pdfReader.numPages)
            )
        except Exception as e:
            self.logger.error("An error occurred while extracting text: %s", e)
            raise
        return {"output": pdftext}

Example #3

Source File: utils.py From open-syllabus-project with Apache License 2.0

8 votes

def pdf_date(path):

    """
    Extract the creation date from PDF file metadata.

    The first run of digits in the ``/CreationDate`` entry is parsed as
    ``YYYYmmddHHMMSS``; the leading ``D:`` and anything after the digits
    (such as a timezone suffix) are ignored.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date (naive, no timezone attached).
    """

    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', stamp)

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )

Example #4

Source File: pdfParanoia.py From automate-the-boring-stuff-projects with MIT License

7 votes

def encryptPDFs(root, password):
    """Encrypt every unencrypted PDF found while walking a folder tree.

    Each unencrypted ``*.pdf`` is copied page by page into a new
    password-protected file named ``<name>_encrypted<ext>`` inside an
    ``untitled folder`` directory next to the original.  That directory is
    created when it does not exist.  PDFs that are already encrypted are
    skipped.

       Args:
          root (str): folder path to walk
          password (str): password to encrypt pdfs with
       Returns:
          None
    """
    for folder, _subfolders, fileList in os.walk(root):
        for fileName in fileList:
            if not fileName.endswith('.pdf'):
                continue
            filepath = os.path.join(os.path.abspath(folder), fileName)

            # The source stays open until the encrypted copy is written,
            # then is closed instead of being leaked.
            with open(filepath, 'rb') as pdfFileObj:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                if pdfReader.isEncrypted:
                    continue

                pdfWriter = PyPDF2.PdfFileWriter()
                for pageNum in range(pdfReader.numPages):
                    pdfWriter.addPage(pdfReader.getPage(pageNum))
                pdfWriter.encrypt(password)

                outDir = os.path.join(os.path.dirname(filepath),
                                      'untitled folder')
                if not os.path.isdir(outDir):
                    os.makedirs(outDir)

                # splitext only touches the final extension, so a name such
                # as "a.b.pdf" becomes "a.b_encrypted.pdf".
                base, ext = os.path.splitext(fileName)
                newPath = os.path.join(outDir, base + '_encrypted' + ext)
                with open(newPath, 'wb') as resultPdf:
                    pdfWriter.write(resultPdf)

Example #5

Source File: RastLeak_1_2.py From RastLeak with GNU General Public License v3.0

7 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######

Example #6

Source File: testpdf.py From opencanary with BSD 3-Clause "New" or "Revised" License

6 votes

def createPDF(self, name=None, size='10kb'):
        """Create a password-protected PDF with random content in the share path.

        A one-page PDF holding a random-length block of ``os.urandom`` data
        is written to ``/tmp/<name>``; its first page is then copied into an
        encrypted PDF at ``<self.sharepath>/<name>``.

        Args:
            name: File name for the PDF; only its basename is used.
            size: One of ``'10kb'``, ``'100kb'`` or ``'1mb'``; selects the
                range the random payload length is drawn from.

        Raises:
            ValueError: If ``size`` is not one of the supported values.
        """
        from PyPDF2 import PdfFileReader, PdfFileWriter
        from fpdf import FPDF
        import os
        import random
        name = os.path.basename(name)
        tmp_name = '/tmp/' + name
        output_name = self.sharepath + '/' + name

        size_ranges = {
            '10kb': (10000, 90000),
            '100kb': (100000, 900000),
            '1mb': (1000000, 9000000),
        }
        # Fail with a clear message instead of an unbound-variable error.
        if size not in size_ranges:
            raise ValueError("unsupported size %r; expected one of: %s"
                             % (size, ', '.join(sorted(size_ranges))))
        randlength = random.randint(*size_ranges[size])

        #create file
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font('Arial', 'B', 8)
        pdf.cell(0, 0, os.urandom(randlength))
        pdf.output(tmp_name, "F")

        #encrypt it
        output = PdfFileWriter()
        # `open` replaces the Python-2-only `file()`; both handles are
        # closed by the `with` blocks.
        with open(tmp_name, "rb") as tmp_stream:
            input1 = PdfFileReader(tmp_stream)
            output.encrypt(user_pwd="ihasapass")
            output.addPage(input1.getPage(0))

            with open(output_name, "wb") as outputStream:
                output.write(outputStream)

Example #7

Source File: grepper.py From dftimewolf with Apache License 2.0

6 votes

def GrepPDF(self, path):
    """Searches the text of a PDF file for the configured keywords.

    The text of every page is extracted and joined (each page preceded by a
    newline), then scanned case-insensitively with ``self._keywords``.

    Args:
      path (str): PDF file path.

    Returns:
      set[str]: lower-cased unique matches.
    """
    with open(path, 'rb') as handle:
      reader = PyPDF2.PdfFileReader(handle)
      page_count = reader.numPages
      text = ''.join(
          '\n' + reader.getPage(index).extractText()
          for index in range(page_count))
      found = {
          hit.lower()
          for hit in re.findall(self._keywords, text, re.IGNORECASE)}
    return found

Example #8

Source File: rastleak_1_4.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######

Example #9

Source File: RastLeak_1_3.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######

Example #10

Source File: downloadfiles.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######

Example #11

Source File: rastleak_1_4.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######

Example #12

Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License

6 votes

def merge_pdf(input_folder, output_file):
    """Merge every ``.pdf`` in *input_folder* into ``<output_file>.pdf``.

    Files are merged in sorted filename order, with all pages of each file
    appended in page order.

    Args:
        input_folder: Directory that is scanned (non-recursively) for files
            whose names end in ``.pdf``.
        output_file: Output path without extension; ``.pdf`` is appended.
    """
    pdf2merge = sorted(
        filename for filename in os.listdir(input_folder)
        if filename.endswith('.pdf')
    )

    pdfWriter = PyPDF2.PdfFileWriter()
    open_inputs = []
    try:
        for filename in pdf2merge:
            pdfFileObj = open(os.path.join(input_folder, filename), 'rb')
            # Inputs stay open until the merged file is written and are
            # tracked so none is left open afterwards.
            open_inputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        #Outputting the PDF
        with open(output_file + '.pdf', 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in open_inputs:
            pdfFileObj.close()

Example #13

Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License

6 votes

def merge_pdf(input_folder, output_file):
    """Merge every ``.pdf`` in *input_folder* into ``<output_file>.pdf``.

    Files are merged in sorted filename order, with all pages of each file
    appended in page order.

    Args:
        input_folder: Directory that is scanned (non-recursively) for files
            whose names end in ``.pdf``.
        output_file: Output path without extension; ``.pdf`` is appended.
    """
    pdf2merge = sorted(
        filename for filename in os.listdir(input_folder)
        if filename.endswith('.pdf')
    )

    pdfWriter = PyPDF2.PdfFileWriter()
    open_inputs = []
    try:
        for filename in pdf2merge:
            pdfFileObj = open(os.path.join(input_folder, filename), 'rb')
            # Inputs stay open until the merged file is written and are
            # tracked so none is left open afterwards.
            open_inputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        #Outputting the PDF
        with open(output_file + '.pdf', 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in open_inputs:
            pdfFileObj.close()

Example #14

Source File: rastleak_2_0.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######

Example #15

Source File: pdf.py From refextract with GNU General Public License v2.0

6 votes

def _destinations_in_two_columns(pdf, destinations, cutoff=3):
    """
    Check if the named destinations are organized along two columns (heuristic)

    @param pdf: a PdfFileReader object
    @param destinations:

    'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the
    would-be second column start at the same position, return True
    """
    # iterator for the x coordinates of refs in the would-be second column
    xpositions = (_destination_position(pdf, dest)[3] for (_, dest)
                  in destinations
                  if _destination_position(pdf, dest)[1] == 1)
    xpos_count = {}
    for xpos in xpositions:
        xpos_count[xpos] = xpos_count.get(xpos, 0) + 1
        if xpos_count[xpos] >= cutoff:
            return True
    return False

Example #16

Source File: pdf2pdfocr.py From pdf2pdfocr with Apache License 2.0

6 votes

def join_ocred_pdf(self):
        """Merge the per-page OCR PDFs into ``<tmp_dir><prefix>-ocr.pdf``.

        The input files are those matching ``<tmp_dir><prefix>*.pdf``, in
        sorted order.  When none are found, an error is printed,
        ``self.cleanup()`` is called and the process exits with status 1.
        """
        pattern = self.tmp_dir + "{0}*.{1}".format(self.prefix, "pdf")
        ocr_parts = sorted(glob.glob(pattern))
        self.debug("We have {0} ocr'ed files".format(len(ocr_parts)))
        if not ocr_parts:
            eprint("No PDF files generated after OCR. This is not expected. Aborting.")
            self.cleanup()
            exit(1)
        else:
            # Join PDF files into one file that contains all OCR "backgrounds"
            merger = PyPDF2.PdfFileMerger()
            for part in ocr_parts:
                merger.append(PyPDF2.PdfFileReader(part, strict=False))
            merger.write(self.tmp_dir + self.prefix + "-ocr.pdf")
            merger.close()
        self.debug("Joined ocr'ed PDF files")

Example #17

Source File: Parser.py From ioc_parser with MIT License

6 votes

def parse_pdf_pypdf2(self, f, fpath):
		"""Parse a PDF page by page with PyPDF2.

		Resets ``self.dedup_store`` when ``self.dedup`` is set, prints the
		handler header, passes each page's extracted text to
		``self.parse_page`` with a 1-based page number, then prints the
		handler footer.  ``KeyboardInterrupt`` and ``SystemExit`` are
		re-raised.
		"""
		try:
			reader = PdfFileReader(f, strict = False)

			if self.dedup:
				self.dedup_store = set()

			self.handler.print_header(fpath)
			for page_num, page in enumerate(reader.pages, 1):
				text = page.extractText()
				self.parse_page(fpath, text, page_num)
			self.handler.print_footer(fpath)
		except (KeyboardInterrupt, SystemExit):
			raise

Example #18

Source File: image.py From knowledge-repo with Apache License 2.0

6 votes

def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    The requested page is copied into a single-page in-memory PDF, which is
    then loaded by wand at the given resolution and converted to PNG.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :return: The converted page as a new ``wand.image.Image``.
    """

    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() returns a NEW converted image and leaves the original
    # untouched, so its result must be returned.  The intermediate PDF image
    # is released when the `with` block exits.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        return pdf_img.convert("png")

Example #19

Source File: RastLeak_1_2.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######

Example #20

Source File: passwordBreaker.py From automate-the-boring-stuff-projects with MIT License

6 votes

def breakPassword(filename):
    """Breaks a single word password of a PDF

    Every line of ``dictionary.txt`` (read from the current working
    directory) is tried as written, capitalized, and lower-cased, in that
    order.

    Args:
        filename (str): Filename for encrypted pdf
    Returns:
        str or None: the first candidate that decrypts the PDF, or None
        when no candidate works.
    """
    # The `with` block closes the PDF handle on every exit path.
    with open(filename, 'rb') as encryptedFile:
        pdfReader = PyPDF2.PdfFileReader(encryptedFile)

        with open('dictionary.txt') as words:
            # splitlines() also strips the '\r' of CRLF files, which would
            # otherwise become part of every candidate.
            wordList = words.read().splitlines()

        for word in wordList:
            for candidate in (word, word.capitalize(), word.lower()):
                if pdfReader.decrypt(candidate):
                    return candidate

    return None

Example #21

Source File: downloadfiles.py From RastLeak with GNU General Public License v3.0

6 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print a PDF's document-information entries and collect key values.

	Every metadata key/value pair is printed.  Values of ``/Author``,
	``/Producer`` and ``/Creator`` are added (without duplicates) to the
	outer-scope lists ``meta_author_array``, ``meta_producer_array`` and
	``meta_creator_array``; those three lists are then appended to
	``metadata_files``.

	NOTE: the three lists are appended to ``metadata_files`` on every call.
	"""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			value = metadata[meta]
			print(' - ' + meta + ':' + value)
			if meta == "/Author":
				if value not in meta_author_array:
					meta_author_array.append(value)
			elif meta == "/Producer":
				if value not in meta_producer_array:
					meta_producer_array.append(value)
			elif meta == "/Creator":
				if value not in meta_creator_array:
					meta_creator_array.append(value)
	#Group the different arrays in one with all metadata
	metadata_files.append(meta_author_array)
	metadata_files.append(meta_producer_array)
	metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######

Example #22

Source File: watermarking_pdf.py From Python-Automation-Cookbook with MIT License

6 votes

def encrypt(out_pdf, password):
    """Encrypt the PDF at *out_pdf* in place with *password*.

    All pages are copied into a new writer, encrypted, written to
    ``INTERMEDIATE_ENCRYPT_FILE`` and then moved over the original path.

    Args:
        out_pdf: Path of the PDF to encrypt; replaced by the encrypted copy.
        password: Password applied to the encrypted document.
    """
    print('Encrypting the document')

    output_pdf = PyPDF2.PdfFileWriter()

    # The `with` block closes the source even if reading or writing fails.
    with open(out_pdf, "rb") as in_file:
        input_pdf = PyPDF2.PdfFileReader(in_file)
        output_pdf.appendPagesFromReader(input_pdf)
        output_pdf.encrypt(password)

        # Intermediate file
        with open(INTERMEDIATE_ENCRYPT_FILE, "wb") as out_file:
            output_pdf.write(out_file)

    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when the destination already exists.
    os.replace(INTERMEDIATE_ENCRYPT_FILE, out_pdf)

Example #23

Source File: test_pdf.py From callisto-core with GNU Affero General Public License v3.0

5 votes

def test_text_present(self):
        """The first page of a generated report contains extractable text."""
        report_bytes = PDFUserReviewReport.generate({})
        reader = PyPDF2.PdfFileReader(BytesIO(report_bytes))
        first_page_text = reader.getPage(0).extractText()
        self.assertTrue(first_page_text)

Example #24

Source File: TextFileReader.py From UniqueBible with GNU General Public License v3.0

5 votes

def readPdfFile(self, fileName):
        """Return the text of every page of a PDF, separated by blank lines.

        Args:
            fileName: Path of the PDF file to read.

        Returns:
            The extracted page texts joined with ``"\\n\\n"``, or the result
            of ``self.errorReadingFile(fileName)`` when the file cannot be
            opened or parsed.
        """
        try:
            # The `with` block closes the handle even when parsing fails.
            with open(fileName, "rb") as pdfObject:
                pdfReader = PyPDF2.PdfFileReader(pdfObject)
                return "\n\n".join(
                    pdfReader.getPage(pageNum).extractText()
                    for pageNum in range(pdfReader.numPages)
                )
        except Exception:
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return self.errorReadingFile(fileName)

Example #25

Source File: label.py From c3bottles with MIT License

5 votes

def all_labels():
    """Return one PDF response containing a label page per drop point.

    For every ``DropPoint`` whose ``removed`` column is NULL, the first page
    of the PDF produced by ``_create_pdf`` is added to the output document.
    """
    writer = PdfFileWriter()
    # `== None` is intentional: it builds the query filter expression.
    active_points = DropPoint.query.filter(DropPoint.removed == None).all()  # noqa
    for drop_point in active_points:
        label_stream = BytesIO(_create_pdf(drop_point))
        writer.addPage(PdfFileReader(label_stream).getPage(0))
    merged = BytesIO()
    writer.write(merged)
    payload = merged.getvalue()
    return Response(payload, mimetype="application/pdf")

Example #26

Source File: RastLeak_1_1.py From RastLeak with GNU General Public License v3.0

5 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print every document-information key/value pair of a PDF file."""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			print(' - ' + meta + ':' + metadata[meta])
####### FUNCTION AnalyzeMetadata doc ######

Example #27

Source File: RastLeak_1_0.py From RastLeak with GNU General Public License v3.0

5 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print every document-information key/value pair of a PDF file."""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			print(' - ' + meta + ':' + metadata[meta])
####### FUNCTION AnalyzeMetadata doc ######

Example #28

Source File: RastLeak_1_1.py From RastLeak with GNU General Public License v3.0

5 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print every document-information key/value pair of a PDF file."""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			print(' - ' + meta + ':' + metadata[meta])
####### FUNCTION AnalyzeMetadata doc ######

Example #29

Source File: RastLeak_1_0.py From RastLeak with GNU General Public License v3.0

5 votes

def Analyze_Metadata_pdf(filename):
####### FUNCTION AnalyzeMetadata ######
	"""Print every document-information key/value pair of a PDF file."""
	# `open` in a `with` block replaces the Python-2-only `file()` and
	# guarantees the handle is closed.
	with open(filename, 'rb') as pdf_handle:
		# getDocumentInfo() may return None; treat that as "no metadata".
		metadata = PdfFileReader(pdf_handle).getDocumentInfo() or {}
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(' - Document: ' + str(filename))
		for meta in metadata:
			print(' - ' + meta + ':' + metadata[meta])
####### FUNCTION AnalyzeMetadata doc ######

Example #30

Source File: main_pdfCropMargins.py From pdfCropMargins with GNU General Public License v3.0

5 votes

def get_full_page_box_list_assigning_media_and_crop(input_doc, quiet=False,
                                                    skip_pre_crop=False):
    """Collect the full-page box and rotation of every page in `input_doc`.

    `input_doc` should be a `PdfFileReader` object.  Returns a pair
    `(boxes, rotations)`: `boxes` holds one plain list of floats per page
    (not `RectangleObject` instances) and `rotations` holds each page's
    `rotationAngle`.  When `args.verbose` is set and `quiet` is false, the
    box and rotation of each page are also printed."""

    boxes = []
    rotations = []

    if args.verbose and not quiet:
        print("\nOriginal full page sizes, in PDF format (lbrt):")

    page_count = input_doc.getNumPages()
    for index in range(page_count):

        # Fetch the page and compute its full-page box.
        page = input_doc.getPage(index)
        raw_box = get_full_page_box_assigning_media_and_crop(page,
                                                             skip_pre_crop)

        if args.verbose and not quiet:
            # Pages are displayed numbered from 1, hence the +1.
            print("\t"+str(index+1), "  rot =",
                  page.rotationAngle, "\t", raw_box)

        # Store the box as an ordinary list of floats.
        boxes.append(list(map(float, raw_box)))

        # Record the page's rotation alongside its box.
        rotations.append(page.rotationAngle)

    return boxes, rotations