Python pdfminer.pdfdocument.PDFDocument() Examples
The following are 18 code examples of pdfminer.pdfdocument.PDFDocument().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pdfminer.pdfdocument, or try the search function.

Example #1
Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License | 6 votes |
def process_pdf(cls, pdf, output, verbose=False, tables=None):
    """Find the "List of Tables" entry in *pdf* and process the page it targets.

    Builds the pdfminer pipeline, then scans every object listed in the
    document's cross-reference tables for a dict whose ``Title`` contains
    "List of Tables". The page referenced by that entry's destination is
    handed to ``cls.process_table_index``; scanning stops at the first match.

    Args:
        pdf: Object handed to ``pdfparser.PDFParser``
            (presumably a binary file object -- confirm against callers).
        output: Forwarded unchanged to ``cls.process_table_index``.
        verbose: Forwarded unchanged to ``cls.process_table_index``.
        tables: Forwarded unchanged to ``cls.process_table_index``.

    Raises:
        IndexError: If no page has the object id the entry points at.
    """
    parser = pdfparser.PDFParser(pdf)
    document = pdfdocument.PDFDocument(parser)
    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
    params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                             line_overlap=0.4, boxes_flow=0.5)
    device = converter.PDFPageAggregator(rsrcmgr, laparams=params)
    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
    # NOTE(review): the result was never used; the call is kept because it may
    # raise for documents without outlines -- confirm and drop if not needed.
    document.get_outlines()
    pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
    for xref in document.xrefs:
        for oid in xref.get_objids():
            obj = document.getobj(oid)
            if not isinstance(obj, dict):
                continue
            if "Title" not in obj or "List of Tables" not in obj['Title']:
                continue
            pageoid = obj['A'].resolve()['D'][0].objid
            # .items() works on both Python 2 and 3 (the original used iteritems()).
            pageno, page = [(pn, p) for (pn, p) in pages.items()
                            if p.pageid == pageoid][0]
            cls.process_table_index(parser, document, rsrcmgr, params, device,
                                    interpreter, pages, page, pageno, output,
                                    verbose, tables)
            # NOTE(review): nesting reconstructed from a flattened source; the
            # return is assumed to end the scan after the first match.
            return
Example #2
Source File: pdftk.py From docassemble with MIT License | 6 votes |
def read_fields(pdffile):
    """Return the sorted form fields of a PDF, or None if it has no AcroForm.

    Args:
        pdffile: Path of the PDF file to open.

    Returns:
        The list filled in by ``recursively_add_fields``, sorted with
        ``fieldsorter``; ``None`` when the catalog has no ``AcroForm`` entry.
    """
    outfields = []
    id_to_page = {}
    # Everything that touches the document stays inside the ``with`` so the
    # file is still open while objects are resolved, and is closed on every
    # exit path (the original leaked the handle, including on the early return).
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map each page's object id to its 1-based page number.
        for pageno, page in enumerate(PDFPage.create_pages(doc), start=1):
            id_to_page[page.pageid] = pageno
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
Example #3
Source File: pdf.py From pdfplumber with MIT License | 5 votes |
def __init__(self, stream, pages = None, laparams = None, precision = 0.001, password = ""):
    """Open a PDF from *stream* and collect its decoded metadata.

    Args:
        stream: Object handed to ``PDFParser``; also stored as ``self.stream``.
        pages: Stored as ``self.pages_to_parse``.
        laparams: Mapping of keyword arguments for ``LAParams``, or ``None``
            to leave ``self.laparams`` unset.
        precision: Stored as ``self.precision``.
        password: Forwarded to ``PDFDocument``.
    """
    self.laparams = None if laparams is None else LAParams(**laparams)
    self.stream = stream
    self.pages_to_parse = pages
    self.precision = precision
    rsrcmgr = PDFResourceManager()
    self.doc = PDFDocument(PDFParser(stream), password = password)

    # Merge every info dictionary of the document; later ones win on clashes.
    self.metadata = {}
    for info in self.doc.info:
        self.metadata.update(info)

    # Iterate over a snapshot because the values are rewritten in place.
    for k, v in list(self.metadata.items()):
        if hasattr(v, "resolve"):
            v = v.resolve()
        if isinstance(v, list):
            self.metadata[k] = list(map(decode_text, v))
        elif isinstance(v, PSLiteral):
            self.metadata[k] = decode_text(v.name)
        elif isinstance(v, bool):
            self.metadata[k] = v
        else:
            self.metadata[k] = decode_text(v)

    self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
    self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
Example #4
Source File: pdf_utils.py From pdftotree with MIT License | 5 votes |
def analyze_pages(file_name, char_margin=1.0):
    """
    Yield the layout object of each page of a PDF file.

    Input: the file path to the PDF file (and the ``char_margin`` passed to
    ``LAParams``)
    Output: one layout object per page; pages whose interpretation raises
    ``OverflowError`` are logged and skipped
    """
    logger = logging.getLogger(__name__)
    pdf_path = os.path.realpath(file_name)
    with open(pdf_path, "rb") as pdf_file:
        # Document structure, opened with an empty password.
        doc = PDFDocument(PDFParser(pdf_file), password="")
        # Shared resources and layout-analysis settings.
        resources = PDFResourceManager()
        analysis_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        # The aggregator collects each page's layout; the interpreter feeds it.
        aggregator = CustomPDFPageAggregator(resources, laparams=analysis_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for index, pdf_page in enumerate(PDFPage.create_pages(doc)):
            try:
                page_interpreter.process_page(pdf_page)
            except OverflowError as err:
                logger.exception(
                    "{}, skipping page {} of {}".format(err, index, file_name)
                )
            else:
                yield aggregator.get_result()
Example #5
Source File: pdf.py From PassportEye with MIT License | 5 votes |
def extract_first_jpeg_in_pdf(fstream):
    """
    Scan a PDF for the first embedded image whose data starts with the JPEG
    marker ``FF D8 FF E0`` and return that data.

    Every page is interpreted in order; each ``LTImage`` nested directly in an
    ``LTFigure`` is inspected. There is no guarantee this finds every JPEG --
    it was only tried on scanner-produced PDFs.

    :param fstream: Readable binary stream of the PDF
    :return: the image data, or None if no matching image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding fails often; fall back to the undecoded stream
                    # bytes. Narrowed from a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
Example #6
Source File: parse_pdf.py From GraphIE with GNU General Public License v3.0 | 5 votes |
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json`` there.

    For every non-hidden ``*.pdf`` in *case_path*, the size and parsed text of
    each page is collected under ``result['docs'][<name before first dot>]``.
    The combined result is dumped to ``case_path + 'parsed.json'``.

    Args:
        case_path: Folder path; it is concatenated directly with file names
            and split on '/' for the id, so it is expected to end with '/'.

    Returns:
        Always ``None``. Any error is reported with a printed
        "Error <case_path>" line instead of being raised.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }
        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj

            # ``with`` closes each PDF once its pages are processed; the
            # original left one handle open per document.
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # Closing the output explicitly guarantees the JSON is flushed to disk.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Best-effort batch step: report the failing case and carry on.
        print("Error " + case_path)
    return None
Example #7
Source File: scihub_api.py From SciHubEVA with MIT License | 5 votes |
def get_pdf_metadata(self, pdf):
    """Extract author, title and year from raw PDF content, best effort.

    Args:
        pdf: PDF content; it is written to a temporary file for parsing.

    Returns:
        Dict with keys ``author``, ``title`` and ``year``. Each keeps its
        ``UNKNOWN_*`` placeholder unless a non-empty value could be read from
        the document's first info dictionary (``Author``, ``Title``,
        ``ModDate``).
    """
    metadata = {'author': 'UNKNOWN_AUTHOR', 'title': 'UNKNOWN_TITLE', 'year': 'UNKNOWN_YEAR'}

    # The context manager closes the temp file even if PDFParser raises,
    # which the original's manual close() did not cover.
    with tempfile.TemporaryFile() as temp_pdf_file:
        temp_pdf_file.write(pdf)
        pdf_parser = PDFParser(temp_pdf_file)

        try:
            pdf_doc = PDFDocument(pdf_parser)
            pdf_metadata = pdf_doc.info[0]

            author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
            if author:
                metadata['author'] = author

            title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
            if title:
                metadata['title'] = title

            year = pdf_metadata_moddate_to_year(make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
            if year:
                metadata['year'] = year
        except Exception:
            # Deliberately best-effort: unreadable metadata leaves the
            # placeholders (and any fields already filled in) untouched.
            pass

    return metadata
Example #8
Source File: pdfjinja.py From pdfjinja with MIT License | 5 votes |
def parse_pdf(self, fp):
    """Interpret every page of a PDF and parse the annotations of each page.

    Args:
        fp: Object handed to ``PDFParser``
            (presumably an open binary file -- confirm against callers).

    ``self.parse_annotations(pgnum, page)`` is called, with a 0-based page
    index, for every page whose ``annots`` is truthy.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # The original first built a PDFPageAggregator/LAParams pair and then
    # immediately replaced it with this plain device; only this one was used.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
Example #9
Source File: parser.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _read_file(self, filename):
    """Open *filename* in binary mode and return a ``PDFDocument`` built on it."""
    # NOTE(review): the handle is not closed here; presumably the returned
    # document still reads from it -- confirm before adding a close.
    stream = open(filename, 'rb')
    pdf_parser = PDFParser(stream)
    return PDFDocument(pdf_parser)
Example #10
Source File: extractFillableFields.py From opentaxforms with GNU Affero General Public License v3.0 | 5 votes |
def xmlFromPdf(pdfpath, xmlpath=None):
    '''Find the XFA data in a pdf file and return it as a parsed tree.

    Scans the pdf's stream objects for one whose data contains
    b'xfa-template', parses that data with etree and, when xmlpath is given,
    also writes the pretty-printed tree there. Raises CrypticXml when no
    stream contains the marker.
    '''
    with open(pdfpath, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        object_ids = {oid
                      for table in document.xrefs
                      for oid in table.get_objids()}
        for oid in object_ids:
            candidate = document.getobj(oid)
            if isinstance(candidate, PDFStream):
                try:
                    data = candidate.get_data()
                except PDFNotImplementedError:
                    # e.g. a jpeg image: "Unsupported filter: /DCTDecode"
                    continue
                if b'xfa-template' in data:
                    break
        else:
            # Loop ended without a break: no stream held the marker.
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # At this point data holds the form template xml.
    tree = etree.fromstring(data)
    if xmlpath is not None:
        with open(xmlpath, 'wb') as out:
            out.write(etree.tostring(tree, pretty_print=True))
    return tree
Example #11
Source File: formatFun.py From China_stock_announcement with MIT License | 5 votes |
def p2t(sourcefile, outfile):
    """Append the text of every horizontal text box in a PDF to *outfile*.

    Args:
        sourcefile: Path of the PDF to read.
        outfile: Path of the text file to append to, written as UTF-8.

    Prints a message and returns early when the PDF cannot be parsed. A
    document that disallows extraction is reported but still processed, as
    in the original code.
    """
    # Local import so the block does not depend on the file's import section.
    import io

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create the PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正确下载')
            return
        # Check whether the file allows text extraction.
        # NOTE(review): processing continues even when it does not -- confirm
        # that this is intended.
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters.
        laparams = LAParams()
        # Aggregating device and the interpreter that feeds it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Let the file object do the UTF-8 encoding: the original
                    # ``encode('utf-8') + '\n'`` mixes bytes and str and
                    # raises TypeError on Python 3.
                    with io.open(outfile, 'a', encoding='utf-8') as f:
                        f.write(x.get_text() + u'\n')
        print(sourcefile + ' 已转为 ' + outfile)

############################################## Convert doc to txt ##############################################
# Before calling, make sure catdoc is installed on Linux
Example #12
Source File: extract.py From x86doc with The Unlicense | 5 votes |
def main(argv):
    """Convert each PDF named in ``argv[1:]`` with ``x86ManParser``.

    Args:
        argv: Argument list; the first element is skipped.

    Returns:
        1 as soon as a document does not allow extraction, otherwise None.
    """
    for arg in argv[1:]:
        # PDFs are binary, so open in 'rb'; ``with`` also closes the file on
        # the early-return path, which the original did not.
        with open(arg, 'rb') as fd:
            document = PDFDocument(PDFParser(fd))
            if not document.is_extractable:
                print("Document not extractable.")
                return 1

            params = LAParams(char_margin=1)
            resMan = PDFResourceManager(caching=True)
            device = PDFPageAggregator(resMan, laparams=params)
            interpreter = PDFPageInterpreter(resMan, device)
            man_parser = x86ManParser("html", params)

            pages = PDFPage.get_pages(fd, set(), caching=True, check_extractable=True)
            for i, page in enumerate(pages, start=1):
                print("Processing page %i" % i)
                interpreter.process_page(page)
                man_parser.process_page(device.get_result())
            man_parser.flush()

        # Single-argument print() calls behave the same on Python 2 and 3.
        print("Conversion result: %i/%i" % (man_parser.success, man_parser.success + man_parser.fail))
Example #13
Source File: outline.py From xlinkBook with MIT License | 5 votes |
def getToc(self, pdfPath):
    """Return the outline of a PDF as a list of ``(level, title)`` tuples.

    Args:
        pdfPath: Path of the PDF file to open.

    Any exception raised by ``document.get_outlines()`` propagates.
    """
    # The outline is consumed completely inside the ``with``, so the file can
    # be closed before returning (the original never closed it).
    with open(pdfPath, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        return [(level, title)
                for (level, title, dest, a, structelem) in document.get_outlines()]
Example #14
Source File: extracthl2.py From Menotexport with GNU General Public License v3.0 | 5 votes |
def init(filename,verbose=True):
    '''Initiate analysis objs

    Opens <filename>, builds the PDF document and a layout-aggregating
    device/interpreter pair for it.

    <filename>: path of the PDF file.
    <verbose>: accepted for interface compatibility; not used here.

    Return (document, interpreter, device).
    Raise PDFTextExtractionNotAllowed if the document disallows extraction.
    '''
    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object and its interpreter. The
        # original first built a plain PDFDevice/interpreter pair that was
        # immediately replaced by these two.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
    except Exception:
        # Nothing is returned on failure, so close the file instead of
        # leaking it.
        fp.close()
        raise
    # NOTE(review): on success fp stays open; presumably the returned
    # document still reads from it -- confirm.
    return document, interpreter, device

#----------------Get the latest creation time of annos----------------
Example #15
Source File: extracthl.py From Menotexport with GNU General Public License v3.0 | 5 votes |
def init(filename,verbose=True):
    '''Initiate analysis objs

    Opens <filename>, builds the PDF document and a layout-aggregating
    device/interpreter pair for it.

    <filename>: path of the PDF file.
    <verbose>: accepted for interface compatibility; not used here.

    Return (document, interpreter, device).
    Raise PDFTextExtractionNotAllowed if the document disallows extraction.
    '''
    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object and its interpreter. The
        # original first built a plain PDFDevice/interpreter pair that was
        # immediately replaced by these two.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
    except Exception:
        # Nothing is returned on failure, so close the file instead of
        # leaking it.
        fp.close()
        raise
    # NOTE(review): on success fp stays open; presumably the returned
    # document still reads from it -- confirm.
    return document, interpreter, device

#----------------Get the latest creation time of annos----------------
Example #16
Source File: pdfsheet.py From avrae with GNU General Public License v3.0 | 4 votes |
def main():
    """Prompt for a form-fillable PDF and dump its field values as JSON.

    Reads each entry of the AcroForm ``Fields`` array, maps the stripped
    field name (``T``) to its value (``V``), prints the mapping and writes it
    to ``./output/pdfsheet-test.json``.

    Raises:
        Exception: If the AcroForm fields cannot be read from the PDF.
    """
    fn = input("PDF filename: ")
    character = {}
    with open(fn, mode='rb') as f:
        parser = PDFParser(f)
        doc = PDFDocument(parser)
        try:
            fields = resolve1(doc.catalog['AcroForm'])
            fields = resolve1(fields['Fields'])
        except Exception:
            # Narrowed from a bare ``except`` so Ctrl-C is not turned into
            # this message.
            raise Exception('This is not a form-fillable character sheet!')
        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            if name is None:
                # A field without a name cannot be keyed; the original
                # crashed on ``None.decode`` here.
                continue
            if isinstance(value, PSLiteral):
                value = value.name
            elif value is not None:
                try:
                    value = value.decode('iso-8859-1').strip()
                except Exception:
                    # Best effort: values that are not byte strings are kept
                    # as they are.
                    pass
            character[name.decode('iso-8859-1').strip()] = value
    print(character)
    with open('./output/pdfsheet-test.json', mode='w') as f:
        json.dump(character, f, skipkeys=True, sort_keys=True, indent=4)
Example #17
Source File: data_mine.py From Bluto with GNU General Public License v3.0 | 4 votes |
# Leading whitespace followed by at least three single word characters that
# are separated by whitespace, e.g. " J o h n".
_SPACED_LETTERS_RE = re.compile(r'(\s\w\s+(\w\s+)+\w)')


def _collapse_spaced_letters(text):
    """Return the spaced-out prefix of *text* without spaces, else *text* unchanged."""
    match = _SPACED_LETTERS_RE.match(text)
    if match:
        return str(match.group(1)).replace(' ', '')
    return text


def pdf_read(pdf_file_list):
    """Collect author names and creator software from PDF metadata.

    Args:
        pdf_file_list: Iterable of PDF file paths.

    Returns:
        Tuple ``(user_names, software_list)``. Both values come from the
        first info dictionary of each document (``Author`` / ``Creator``)
        with every run of non-alphanumeric characters replaced by a space;
        author names are additionally title-cased. Files whose metadata
        cannot be read are skipped.
    """
    info('Extracting PDF MetaData')
    software_list = []
    user_names = []
    for filename in pdf_file_list:
        info(filename)
        try:
            # ``with`` closes each PDF; the original leaked one handle per file.
            with open(filename, 'rb') as fp:
                doc = PDFDocument(PDFParser(fp))
                software = re.sub('[^0-9a-zA-Z]+', ' ', doc.info[0]['Creator'])
                person = re.sub('[^0-9a-zA-Z]+', ' ', doc.info[0]['Author'])
            if person:
                user_names.append(str(_collapse_spaced_letters(person)).title())
            if software:
                software_list.append(_collapse_spaced_letters(software))
        except (IndexError, pdfminer.pdfparser.PDFSyntaxError, KeyError, TypeError):
            # Missing or malformed metadata: skip this file silently.
            continue
        except Exception:
            info('An Unhandled Exception Has Occured, Please Check The Log For Details' + INFO_LOG_FILE)
            continue
    info('Finished Extracting PDF MetaData')
    return (user_names, software_list)

#Extract Author MS FILES
Example #18
Source File: pdftitle.py From pdftitle with GNU General Public License v3.0 | 4 votes |
def get_title_from_io(pdf_io):
    """Guess the title of a PDF from the text blocks of its first page.

    The title is the text of the block with the largest font size; among
    those, the one with the highest y coordinate. Returns None when the
    document does not allow extraction.
    """
    # pylint: disable=too-many-locals
    pdf_parser = PDFParser(pdf_io)
    # if pdf is protected with a pwd, 2nd param here is password
    doc = PDFDocument(pdf_parser)
    # pdf may not allow extraction
    if not doc.is_extractable:
        return None

    rm = PDFResourceManager()
    dev = TextOnlyDevice(rm)
    interpreter = TextOnlyInterpreter(rm, dev)

    first_page = StringIO()
    converter = TextConverter(rm, first_page, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(rm, converter)

    # Only the first page is looked at.
    leading_page = next(iter(PDFPage.create_pages(doc)), None)
    if leading_page is not None:
        interpreter.process_page(leading_page)
        page_interpreter.process_page(leading_page)

    converter.close()
    first_page_text = first_page.getvalue()
    first_page.close()
    dev.recover_last_paragraph()

    verbose('all blocks')
    for blk in dev.blocks:
        verbose(blk)

    # largest font size, then every block that uses it
    max_tfs = max(blk[1] for blk in dev.blocks)
    verbose('max_tfs: ', max_tfs)
    max_blocks = [blk for blk in dev.blocks if blk[1] == max_tfs]

    # highest y coordinate, i.e. the block closest to the top
    max_y = max(blk[3] for blk in max_blocks)
    verbose('max_y: ', max_y)
    found_blocks = [blk for blk in max_blocks if blk[3] == max_y]
    verbose('found blocks')
    for blk in found_blocks:
        verbose(blk)

    title = ''.join(found_blocks[0][4]).strip()

    # Retrieve missing spaces if needed
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)

    # Remove duplicate spaces if any are present
    if " " in title:
        title = " ".join(title.split())

    return title