Python PyPDF2.PdfFileReader() Examples
The following are 30 code examples of PyPDF2.PdfFileReader().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module PyPDF2, or try the search function.
Example #1
Source File: uploader.py From calibre-web with GNU General Public License v3.0 | 9 votes |
def pdf_meta(tmp_file_path, original_file_name, original_file_extension):
    """Build a ``BookMeta`` record for an uploaded PDF file.

    When the module-level ``use_pdf_meta`` flag is truthy, author, title and
    subject are taken from the PDF's document information. Otherwise, or
    when the PDF carries no document information, fallback values are used.

    Args:
        tmp_file_path: Path of the PDF on disk.
        original_file_name: Title fallback; also passed to ``pdf_preview``
            to obtain the cover.
        original_file_extension: Stored unchanged in the result.

    Returns:
        BookMeta: metadata with empty tags, series, series id and languages.
    """
    author = u'Unknown'
    title = original_file_name
    subject = ""

    if use_pdf_meta:
        # The handle used to be leaked. The fields are read while the file
        # is still open because PyPDF2 may resolve values from the stream
        # lazily.
        with open(tmp_file_path, 'rb') as pdf_file:
            doc_info = PdfFileReader(pdf_file).getDocumentInfo()
            if doc_info:
                author = doc_info.author if doc_info.author else u'Unknown'
                title = doc_info.title if doc_info.title else original_file_name
                subject = doc_info.subject

    return BookMeta(
        file_path=tmp_file_path,
        extension=original_file_extension,
        title=title,
        author=author,
        cover=pdf_preview(tmp_file_path, original_file_name),
        description=subject,
        tags="",
        series="",
        series_id="",
        languages="")
Example #2
Source File: action.py From insightconnect-plugins with MIT License | 8 votes |
def run(self, params=None):
    """Decode a base64-encoded PDF and return its extracted text.

    The decoded bytes are written to ``temp.pdf`` in the current working
    directory and read back with PyPDF2.

    Args:
        params: Mapping whose ``contents`` entry holds the base64-encoded
            PDF. ``None`` is treated as an empty mapping.

    Returns:
        dict: ``{"output": text}`` where ``text`` is the text of all pages
        concatenated, with newlines removed from each page.

    Raises:
        Exception: If ``contents`` is missing or empty; any error raised
            while decoding or extracting text is logged and re-raised.
    """
    # Avoid a shared mutable default argument.
    params = {} if params is None else params

    try:
        if params.get('contents'):
            pdfFile = base64.b64decode(params.get('contents'))
        else:
            raise Exception("File contents missing!")
    except Exception as e:
        # The exception is passed as a lazy %-argument; without the
        # placeholder, logging reports a formatting error instead.
        self.logger.error("File contents missing: %s", e)
        raise

    try:
        # Finish (and flush) the write before the file is reopened.
        with open("temp.pdf", 'wb') as temp_pdf:
            temp_pdf.write(pdfFile)

        with open('temp.pdf', 'rb') as pdf_stream:
            pdfReader = PyPDF2.PdfFileReader(pdf_stream)
            pdftext = "".join(
                pdfReader.getPage(page).extractText().replace('\n', '')
                for page in range(pdfReader.numPages)
            )
    except Exception as e:
        self.logger.info("An error occurred while extracting text: %s", e)
        raise

    return {"output": pdftext}
Example #3
Source File: utils.py From open-syllabus-project with Apache License 2.0 | 8 votes |
def pdf_date(path):
    """
    Extract a date from PDF file metadata.

    The first run of digits in the ``/CreationDate`` entry is parsed as
    ``YYYYMMDDHHMMSS``; the ``D:`` prefix and any timezone suffix are
    ignored.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.

    Raises:
        ValueError: If the digits do not match ``%Y%m%d%H%M%S``.
    """
    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', stamp)

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )
Example #4
Source File: pdfParanoia.py From automate-the-boring-stuff-projects with MIT License | 7 votes |
def encryptPDFs(root, password):
    """Encrypt every unencrypted ``.pdf`` file found under a folder tree.

    For each unencrypted PDF an encrypted copy named
    ``<stem>_encrypted<ext>`` is written to the ``untitled folder``
    sub-directory next to the source file. That directory must already
    exist.

    Args:
        root (str): folder path to walk
        password (str): password to encrypt pdfs with

    Returns:
        None
    """
    for folder, _subfolders, file_names in os.walk(root):
        for file_name in file_names:
            if not file_name.endswith('.pdf'):
                continue

            filepath = os.path.join(os.path.abspath(folder), file_name)
            # The source must stay open until the writer has produced the
            # output; the context manager then closes it (it used to leak).
            with open(filepath, 'rb') as pdfFileObj:
                pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
                if pdfReader.isEncrypted:
                    continue

                pdfWriter = PyPDF2.PdfFileWriter()
                for pageNum in range(pdfReader.numPages):
                    pdfWriter.addPage(pdfReader.getPage(pageNum))
                pdfWriter.encrypt(password)

                # splitext touches only the final extension; joining on
                # every '.' inserted '_encrypted' after each dot-separated
                # part of names such as 'a.b.pdf'.
                stem, extension = os.path.splitext(os.path.basename(filepath))
                newPath = os.path.dirname(filepath) + '/untitled folder/' + \
                    stem + '_encrypted' + extension
                with open(newPath, 'wb') as resultPdf:
                    pdfWriter.write(resultPdf)
Example #5
Source File: RastLeak_1_2.py From RastLeak with GNU General Public License v3.0 | 7 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #6
Source File: testpdf.py From opencanary with BSD 3-Clause "New" or "Revised" License | 6 votes |
def createPDF(self, name=None, size='10kb'):
    """Create a one-page PDF of random content and save an encrypted copy.

    The PDF is first written to ``/tmp/<name>``; an encrypted copy of its
    first page is then written to ``<self.sharepath>/<name>``.

    Args:
        name: File name; only its base name is used. A real name must be
            supplied, since ``os.path.basename(None)`` raises ``TypeError``.
        size: ``'10kb'``, ``'100kb'`` or ``'1mb'``; selects the range the
            random content length is drawn from.

    Raises:
        ValueError: If ``size`` is not one of the supported values
            (previously this surfaced as an unbound-variable error).
    """
    # Inclusive bounds handed to random.randint for each supported size.
    length_ranges = {
        '10kb': (10000, 90000),
        '100kb': (100000, 900000),
        '1mb': (1000000, 9000000),
    }
    if size not in length_ranges:
        raise ValueError("unsupported size: %r" % (size,))

    from PyPDF2 import PdfFileReader, PdfFileWriter
    from fpdf import FPDF
    import os
    import random

    name = os.path.basename(name)
    tmp_name = '/tmp/' + name
    output_name = self.sharepath + '/' + name
    randlength = random.randint(*length_ranges[size])

    # create file
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', 'B', 8)
    pdf.cell(0, 0, os.urandom(randlength))
    pdf.output(tmp_name, "F")

    # encrypt it
    output = PdfFileWriter()
    # `open` replaces the Python-2-only `file` builtin; both handles are
    # now closed even when writing fails.
    with open(tmp_name, "rb") as source_stream:
        input1 = PdfFileReader(source_stream)
        output.encrypt(user_pwd="ihasapass")
        output.addPage(input1.getPage(0))
        with open(output_name, "wb") as outputStream:
            output.write(outputStream)
Example #7
Source File: grepper.py From dftimewolf with Apache License 2.0 | 6 votes |
def GrepPDF(self, path):
    """Search the text content of a PDF file for keywords.

    The text of every page is extracted and concatenated, each page
    preceded by a newline, then searched case-insensitively with the
    pattern held in ``self._keywords``.

    Args:
      path (str): PDF file path.

    Returns:
      set[str]: the lower-cased form of every match, without duplicates.
    """
    with open(path, 'rb') as stream:
        reader = PyPDF2.PdfFileReader(stream)
        page_texts = [
            '\n' + reader.getPage(index).extractText()
            for index in range(reader.numPages)
        ]
        full_text = ''.join(page_texts)
        found = re.findall(self._keywords, full_text, re.IGNORECASE)
    return {hit.lower() for hit in found}
Example #8
Source File: rastleak_1_4.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #9
Source File: RastLeak_1_3.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #10
Source File: downloadfiles.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #11
Source File: rastleak_1_4.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #12
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License | 6 votes |
def merge_pdf(input_folder, output_file):
    """Merge every ``.pdf`` file of a folder into a single PDF.

    Files are merged in sorted file-name order.

    Args:
        input_folder: Directory whose ``.pdf`` files are merged.
        output_file: Output path without extension; ``.pdf`` is appended.
    """
    pdf2merge = sorted(
        filename for filename in os.listdir(input_folder)
        if filename.endswith('.pdf')
    )

    pdfWriter = PyPDF2.PdfFileWriter()
    # The source files have to stay open until the writer has produced the
    # output, so the handles are collected and closed afterwards (they used
    # to be leaked).
    open_inputs = []
    try:
        for filename in pdf2merge:
            pdfFileObj = open(os.path.join(input_folder, filename), 'rb')
            open_inputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        # Outputting the PDF
        with open(output_file + '.pdf', 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for handle in open_inputs:
            handle.close()
Example #13
Source File: subroutines.py From SigProfilerExtractor with BSD 2-Clause "Simplified" License | 6 votes |
def merge_pdf(input_folder, output_file):
    """Merge every ``.pdf`` file of a folder into a single PDF.

    Files are merged in sorted file-name order.

    Args:
        input_folder: Directory whose ``.pdf`` files are merged.
        output_file: Output path without extension; ``.pdf`` is appended.
    """
    pdf2merge = sorted(
        filename for filename in os.listdir(input_folder)
        if filename.endswith('.pdf')
    )

    pdfWriter = PyPDF2.PdfFileWriter()
    # The source files have to stay open until the writer has produced the
    # output, so the handles are collected and closed afterwards (they used
    # to be leaked).
    open_inputs = []
    try:
        for filename in pdf2merge:
            pdfFileObj = open(os.path.join(input_folder, filename), 'rb')
            open_inputs.append(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        # Outputting the PDF
        with open(output_file + '.pdf', 'wb') as pdfOutput:
            pdfWriter.write(pdfOutput)
    finally:
        for handle in open_inputs:
            handle.close()
Example #14
Source File: rastleak_2_0.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #15
Source File: pdf.py From refextract with GNU General Public License v2.0 | 6 votes |
def _destinations_in_two_columns(pdf, destinations, cutoff=3): """ Check if the named destinations are organized along two columns (heuristic) @param pdf: a PdfFileReader object @param destinations: 'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the would-be second column start at the same position, return True """ # iterator for the x coordinates of refs in the would-be second column xpositions = (_destination_position(pdf, dest)[3] for (_, dest) in destinations if _destination_position(pdf, dest)[1] == 1) xpos_count = {} for xpos in xpositions: xpos_count[xpos] = xpos_count.get(xpos, 0) + 1 if xpos_count[xpos] >= cutoff: return True return False
Example #16
Source File: pdf2pdfocr.py From pdf2pdfocr with Apache License 2.0 | 6 votes |
def join_ocred_pdf(self):
    """Merge the OCR'ed PDF files of the temp directory into one file.

    All files matching ``<tmp_dir><prefix>*.pdf`` are appended, in sorted
    order, into ``<tmp_dir><prefix>-ocr.pdf``. When no such file exists an
    error is reported, ``self.cleanup()`` is called and the process exits
    with status 1.
    """
    # Join PDF files into one file that contains all OCR "backgrounds"
    pattern = self.tmp_dir + "{0}*.{1}".format(self.prefix, "pdf")
    ocr_parts = sorted(glob.glob(pattern))
    self.debug("We have {0} ocr'ed files".format(len(ocr_parts)))
    if not ocr_parts:
        eprint("No PDF files generated after OCR. This is not expected. Aborting.")
        self.cleanup()
        exit(1)
    else:
        merger = PyPDF2.PdfFileMerger()
        for part in ocr_parts:
            merger.append(PyPDF2.PdfFileReader(part, strict=False))
        merger.write(self.tmp_dir + self.prefix + "-ocr.pdf")
        merger.close()
    #
    self.debug("Joined ocr'ed PDF files")
Example #17
Source File: Parser.py From ioc_parser with MIT License | 6 votes |
def parse_pdf_pypdf2(self, f, fpath):
    """Parse a PDF page by page using PyPDF2.

    A header is printed through ``self.handler``, the extracted text of
    each page is passed to ``self.parse_page`` together with its 1-based
    page number, and a footer is printed afterwards. When ``self.dedup`` is
    truthy, ``self.dedup_store`` is reset to an empty set first.

    Args:
        f: Stream or path accepted by ``PdfFileReader``.
        fpath: Path passed on to the handler and to ``parse_page``.
    """
    try:
        reader = PdfFileReader(f, strict=False)

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        for number, page in enumerate(reader.pages, start=1):
            self.parse_page(fpath, page.extractText(), number)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
Example #18
Source File: image.py From knowledge-repo with Apache License 2.0 | 6 votes |
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    The page is copied into a fresh single-page PDF held in memory, which
    is then rendered with wand at the requested resolution.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :return: the page as a PNG-format ``wand.image.Image``.
    """

    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() leaves the original untouched and returns a converted
    # copy, so that copy is what must be returned. The result used to be
    # discarded, which handed the unconverted image back to the caller.
    with Image(file=pdf_bytes, resolution=resolution) as img:
        return img.convert("png")
Example #19
Source File: RastLeak_1_2.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #20
Source File: passwordBreaker.py From automate-the-boring-stuff-projects with MIT License | 6 votes |
def breakPassword(filename):
    """Find the single-word password of an encrypted PDF.

    Every line of ``dictionary.txt`` (read from the current working
    directory) is tried as written, then capitalized, then lower-cased.

    Args:
        filename (str): Filename for encrypted pdf

    Returns:
        str or None: the first candidate accepted by the PDF, or None when
        no candidate works.
    """
    # The PDF must stay open while candidates are tried; the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as encryptedFile:
        pdfReader = PyPDF2.PdfFileReader(encryptedFile)

        with open('dictionary.txt') as words:
            wordList = words.read().split('\n')

        for word in wordList:
            for candidate in (word, word.capitalize(), word.lower()):
                if pdfReader.decrypt(candidate):
                    return candidate
    return None
Example #21
Source File: downloadfiles.py From RastLeak with GNU General Public License v3.0 | 6 votes |
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document information and collect selected values.

    Every metadata entry is printed. Values stored under ``/Author``,
    ``/Producer`` and ``/Creator`` are appended, without duplicates, to
    ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``; those three lists are then appended to
    ``metadata_files``. All four lists are expected to exist at module
    level.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # Map each tracked metadata key to the list that collects its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            print(' - ' + meta + ':' + value)
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
Example #22
Source File: watermarking_pdf.py From Python-Automation-Cookbook with MIT License | 6 votes |
def encrypt(out_pdf, password):
    """Encrypt a PDF file in place with the given password.

    The encrypted document is first written to ``INTERMEDIATE_ENCRYPT_FILE``
    and then moved over the original file.

    Args:
        out_pdf: Path of the PDF to encrypt; it is overwritten.
        password: Password applied to the encrypted document.
    """
    print('Encrypting the document')

    output_pdf = PyPDF2.PdfFileWriter()

    # The source stays open until the writer has finished, and is closed
    # even when writing fails.
    with open(out_pdf, "rb") as in_file:
        input_pdf = PyPDF2.PdfFileReader(in_file)
        output_pdf.appendPagesFromReader(input_pdf)
        output_pdf.encrypt(password)

        # Intermediate file
        with open(INTERMEDIATE_ENCRYPT_FILE, "wb") as out_file:
            output_pdf.write(out_file)

    # Rename the intermediate file. os.replace overwrites an existing
    # destination on every platform, whereas os.rename fails on Windows
    # when the target already exists (and out_pdf always does).
    os.replace(INTERMEDIATE_ENCRYPT_FILE, out_pdf)
Example #23
Source File: test_pdf.py From callisto-core with GNU Affero General Public License v3.0 | 5 votes |
def test_text_present(self):
    """The first page of a generated report yields non-empty text."""
    report_bytes = PDFUserReviewReport.generate({})
    reader = PyPDF2.PdfFileReader(BytesIO(report_bytes))
    first_page_text = reader.getPage(0).extractText()
    self.assertTrue(first_page_text)
Example #24
Source File: TextFileReader.py From UniqueBible with GNU General Public License v3.0 | 5 votes |
def readPdfFile(self, fileName):
    """Return the text of every page of a PDF, separated by blank lines.

    Args:
        fileName: Path of the PDF file to read.

    Returns:
        The page texts joined with two newlines, or the result of
        ``self.errorReadingFile(fileName)`` when reading fails.
    """
    try:
        # The context manager closes the file even when extraction raises;
        # the handle used to leak on that path.
        with open(fileName, "rb") as pdfObject:
            pdfReader = PyPDF2.PdfFileReader(pdfObject)
            return "\n\n".join(
                pdfReader.getPage(pageNum).extractText()
                for pageNum in range(pdfReader.numPages)
            )
    except Exception:
        # `except Exception` instead of a bare except, so KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return self.errorReadingFile(fileName)
Example #25
Source File: label.py From c3bottles with MIT License | 5 votes |
def all_labels():
    """Return one PDF holding the label page of every matching drop point.

    For each ``DropPoint`` whose ``removed`` column compares equal to None,
    the first page of the PDF produced by ``_create_pdf`` is added to the
    output document, which is returned as an ``application/pdf`` response.
    """
    writer = PdfFileWriter()
    points = DropPoint.query.filter(DropPoint.removed == None).all()  # noqa
    for point in points:
        label_reader = PdfFileReader(BytesIO(_create_pdf(point)))
        writer.addPage(label_reader.getPage(0))

    buffer = BytesIO()
    writer.write(buffer)
    return Response(buffer.getvalue(), mimetype="application/pdf")
Example #26
Source File: RastLeak_1_1.py From RastLeak with GNU General Public License v3.0 | 5 votes |
def Analyze_Metadata_pdf(filename):
    """Print every entry of a PDF's document information.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            print(' - ' + meta + ':' + metadata[meta])
Example #27
Source File: RastLeak_1_0.py From RastLeak with GNU General Public License v3.0 | 5 votes |
def Analyze_Metadata_pdf(filename):
    """Print every entry of a PDF's document information.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            print(' - ' + meta + ':' + metadata[meta])
Example #28
Source File: RastLeak_1_1.py From RastLeak with GNU General Public License v3.0 | 5 votes |
def Analyze_Metadata_pdf(filename):
    """Print every entry of a PDF's document information.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            print(' - ' + meta + ':' + metadata[meta])
Example #29
Source File: RastLeak_1_0.py From RastLeak with GNU General Public License v3.0 | 5 votes |
def Analyze_Metadata_pdf(filename):
    """Print every entry of a PDF's document information.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # `open` replaces the Python-2-only `file` builtin, and the context
    # manager closes the handle that used to be leaked.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # getDocumentInfo() may return None when there is no info dictionary.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            print(' - ' + meta + ':' + metadata[meta])
Example #30
Source File: main_pdfCropMargins.py From pdfCropMargins with GNU General Public License v3.0 | 5 votes |
def get_full_page_box_list_assigning_media_and_crop(input_doc, quiet=False,
                                                    skip_pre_crop=False):
    """Collect the full-page box and rotation of every page of a document.

    `input_doc` should be a `PdfFileReader` object. Each box is returned as
    a plain list of floats (the simple 4-float format used by this
    program), not as a `RectangleObject`. When `args.verbose` is set and
    `quiet` is false, the per-page values are also printed.

    Returns a tuple `(full_page_box_list, rotation_list)` with one entry
    per page in each list."""

    show_details = args.verbose and not quiet
    if show_details:
        print("\nOriginal full page sizes, in PDF format (lbrt):")

    boxes = []
    rotations = []
    for index in range(input_doc.getNumPages()):
        # Look up the page and determine its full-page box.
        page = input_doc.getPage(index)
        box = get_full_page_box_assigning_media_and_crop(page, skip_pre_crop)

        if show_details:
            # Page numbers are displayed starting from 1.
            print("\t"+str(index+1), " rot =",
                  page.rotationAngle, "\t", box)

        # Store the box as ordinary floats, and the rotation alongside it.
        boxes.append([float(coord) for coord in box])
        rotations.append(page.rotationAngle)

    return boxes, rotations