Python pdfminer.pdfpage.PDFPage.create_pages() Examples
The following are 10 code examples of pdfminer.pdfpage.PDFPage.create_pages().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pdfminer.pdfpage.PDFPage, or try the search function.
Example #1
Source File: pdftk.py From docassemble with MIT License | 6 votes |
def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    Args:
        pdffile: Path of the PDF file to read.

    Returns:
        The list filled in by ``recursively_add_fields``, sorted with
        ``key=fieldsorter``, or ``None`` when the document catalog has no
        ``'AcroForm'`` entry.
    """
    outfields = []
    id_to_page = {}
    # The ``with`` block closes the file on every exit path, including the
    # early ``return None``.
    # NOTE(review): pdfminer appears to read objects from the stream on
    # demand, so the file is kept open until the fields have been collected
    # -- confirm before moving any of this outside the block.
    with open(pdffile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Map each page's ``pageid`` to its 1-based page number.
        for pageno, page in enumerate(PDFPage.create_pages(doc), start=1):
            id_to_page[page.pageid] = pageno
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)
Example #2
Source File: pdf.py From pdfplumber with MIT License | 5 votes |
def pages(self):
    """Return the ``Page`` wrappers of the document, building them once.

    The first call walks ``PDFPage.create_pages(self.doc)``, wraps every
    selected page in a ``Page`` and stores the list on ``self._pages``;
    later calls return that stored list unchanged.

    Pages are numbered from 1. When ``self.pages_to_parse`` is not ``None``
    only page numbers contained in it are wrapped. ``initial_doctop`` of each
    wrapped page is the sum of the heights of the pages wrapped before it
    (skipped pages contribute nothing).

    Returns:
        The list stored in ``self._pages``.
    """
    if hasattr(self, "_pages"):
        return self._pages

    doctop = 0
    pp = self.pages_to_parse
    self._pages = []
    for page_number, page in enumerate(PDFPage.create_pages(self.doc), start=1):
        # ``None`` means "no restriction"; compare by identity so that a
        # container with an unusual ``__ne__`` cannot confuse the check.
        if pp is not None and page_number not in pp:
            continue
        p = Page(self, page, page_number=page_number, initial_doctop=doctop)
        self._pages.append(p)
        doctop += p.height
    return self._pages
Example #3
Source File: pdf_utils.py From pdftotree with MIT License | 5 votes |
def analyze_pages(file_name, char_margin=1.0):
    """Yield the layout object of every page in a PDF file.

    Args:
        file_name: Path to the PDF file; it is resolved with
            ``os.path.realpath`` before being opened.
        char_margin: Passed through as ``LAParams(char_margin=...)``.

    Yields:
        The value of ``get_result()`` on the page aggregator after each page
        has been processed. A page whose processing raises ``OverflowError``
        is logged and skipped.
    """
    logger = logging.getLogger(__name__)
    resolved_path = os.path.realpath(file_name)

    with open(resolved_path, "rb") as pdf_file:
        # Parser -> document; an empty password is supplied for initialization.
        document = PDFDocument(PDFParser(pdf_file), password="")

        # Shared resources, layout-analysis parameters, aggregating device
        # and the interpreter that feeds pages to the device.
        resources = PDFResourceManager()
        layout_params = LAParams(
            char_margin=char_margin,
            word_margin=0.1,
            detect_vertical=True,
        )
        aggregator = CustomPDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page_num, page in enumerate(PDFPage.create_pages(document)):
            try:
                page_interpreter.process_page(page)
            except OverflowError as oe:
                logger.exception(
                    "{}, skipping page {} of {}".format(oe, page_num, file_name)
                )
                continue
            yield aggregator.get_result()
Example #4
Source File: pdf.py From PassportEye with MIT License | 5 votes |
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or the data of the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around. More testing might be needed though.

    Only image data starting with the bytes ``FF D8 FF E0`` is accepted; any
    other image found inside a figure is skipped.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be
    unsuitable for PDFMiner.

    :param fstream: Readable binary stream of the PDF
    :return: the image data as returned by the image stream's ``get_data()``
             (or ``get_rawdata()`` when decoding failed), or None if no
             matching image was found.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image inside a figure.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always - there's
                    # probably a bug in PDFMiner), so fall back to the raw
                    # bytes. ``Exception`` rather than a bare ``except`` so
                    # that KeyboardInterrupt/SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata
    return None
Example #5
Source File: parse_pdf.py From GraphIE with GNU General Public License v3.0 | 5 votes |
def parse_case(case_path):
    """Parse all the pdf files in the folder and write ``parsed.json`` there.

    Every entry of ``case_path`` whose name ends in ``.pdf`` and does not
    start with a dot is parsed page by page. The collected data has the shape
    ``{'id': ..., 'docs': {doc_id: {'pages': {pageid: {'size': (width,
    height), 'text': parse_text(layout)}}}}}`` and is dumped as JSON to
    ``case_path + 'parsed.json'`` once all files have been processed.

    Args:
        case_path: Folder path. It is concatenated directly with file names,
            so it must end with a path separator; the ``id`` is its
            second-to-last ``'/'``-separated component.

    Returns:
        Always ``None``. This is a best-effort routine: any ``Exception``
        raised while parsing or writing is reported with a printed
        ``"Error <case_path>"`` line instead of being propagated.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }
        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = {'pages': {}}
            result['docs'][doc_id] = doc_obj
            # Context manager so each PDF handle is closed as soon as its
            # pages have been processed, even when parsing fails.
            with open(case_path + name, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }
        # Closing the output explicitly guarantees the JSON is flushed to disk.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # Deliberate best-effort: report the failing folder and carry on.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit through.
        print("Error " + case_path)
    return None
Example #6
Source File: pdfjinja.py From pdfjinja with MIT License | 5 votes |
def parse_pdf(self, fp):
    """Process every page of a PDF and parse the annotations of each page.

    Each page is run through a ``PDFPageInterpreter`` backed by a plain
    ``PDFDevice``; pages whose ``annots`` attribute is truthy are then handed
    to ``self.parse_annotations`` together with their 0-based page index.

    Args:
        fp: File object passed to ``PDFParser``.
    """
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # Only the PDFDevice-backed interpreter was ever used: an earlier
    # PDFPageAggregator/LAParams pair was built and immediately overwritten,
    # so it is not constructed any more.
    device = PDFDevice(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        if page.annots:
            self.parse_annotations(pgnum, page)
Example #7
Source File: parser.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License | 5 votes |
def extract(self, max_page_num=None):
    """Process the document's pages and store each layout in ``self._pages``.

    Every page from ``PDFPage.create_pages(self._document)`` is run through
    ``self._interpreter``; the resulting layout is stored under its
    ``pageid``.

    Args:
        max_page_num: Optional upper bound. Processing stops at the first
            layout whose ``pageid`` is greater than this value; that layout
            is not stored. ``None`` (the default) means no limit.
    """
    for page in PDFPage.create_pages(self._document):
        self._interpreter.process_page(page)
        layout = self._device.get_result()
        # The page id is only known from the layout, so the limit can only
        # be checked after the page has been processed.
        if max_page_num is not None and layout.pageid > max_page_num:
            break
        self._pages[layout.pageid] = layout
Example #8
Source File: formatFun.py From China_stock_announcement with MIT License | 5 votes |
def p2t(sourcefile, outfile):
    """Append the text of a PDF's horizontal text boxes to a text file.

    Each ``LTTextBoxHorizontal`` found in a page layout contributes its
    ``get_text()`` followed by a newline, written UTF-8 encoded to
    ``outfile`` in append mode. ``outfile`` is only opened for pages that
    actually contain such text boxes.

    If ``PDFDocument`` cannot be built, a message is printed and nothing is
    extracted. If the document reports ``is_extractable`` as false, a message
    is printed and extraction is attempted anyway. After the pages have been
    processed a "converted" message is printed.

    Args:
        sourcefile: Path of the PDF file to read.
        outfile: Path of the text file to append to.
    """
    # Local import: ``io.open`` accepts ``encoding`` on both Python 2 and 3.
    import io

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create the PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            print(sourcefile + ' :pdf未正确下载')
        else:
            # Check whether the file allows text extraction.
            if not document.is_extractable:
                print(sourcefile + ' :不允许提取文本')
            # Resource manager that stores shared resources.
            rsrcmgr = PDFResourceManager()
            # Parameters for layout analysis.
            laparams = LAParams()
            # Aggregating device and the interpreter that feeds it.
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process every page.
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Fetch the layout object of the page just processed.
                layout = device.get_result()
                texts = [
                    x.get_text() for x in layout
                    if isinstance(x, LTTextBoxHorizontal)
                ]
                if not texts:
                    continue
                # One open per page instead of one per text box. The file
                # encodes to UTF-8 itself; concatenating encoded bytes with
                # a str newline raises TypeError on Python 3.
                with io.open(outfile, 'a', encoding='utf-8') as f:
                    for text in texts:
                        f.write(text + u'\n')
            print(sourcefile + ' 已转为 ' + outfile)
############################################## Convert doc to txt ##############################################
# Before calling, make sure catdoc is installed on your Linux system
Example #9
Source File: pdftitle.py From pdftitle with GNU General Public License v3.0 | 4 votes |
def get_title_from_io(pdf_io):  # pylint: disable=too-many-locals
    """Guess a PDF's title from the text blocks of its first page.

    Only the first page is processed. Among the blocks collected by the
    text-only device, those with the largest font size (index 1) are kept,
    and of these the ones with the highest y coordinate (index 3), i.e. the
    closest to the top. The text parts (index 4) of the first such block are
    joined and stripped to form the title.

    If the title contains no space, ``retrieve_spaces`` is asked to restore
    them from the first page's plain text; whitespace runs are then collapsed
    to single spaces.

    Args:
        pdf_io: File-like object passed to ``PDFParser``.

    Returns:
        The title string, or ``None`` when the document does not allow
        extraction.
    """
    parser = PDFParser(pdf_io)
    # If the pdf is protected with a password, it would be the 2nd argument.
    doc = PDFDocument(parser)
    # The pdf may not allow extraction.
    if not doc.is_extractable:
        return None

    rm = PDFResourceManager()
    dev = TextOnlyDevice(rm)
    interpreter = TextOnlyInterpreter(rm, dev)

    first_page = StringIO()
    converter = TextConverter(rm, first_page, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(rm, converter)

    # Feed only the first page to both interpreters.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        page_interpreter.process_page(page)
        break

    converter.close()
    first_page_text = first_page.getvalue()
    first_page.close()
    dev.recover_last_paragraph()

    verbose('all blocks')
    for candidate in dev.blocks:
        verbose(candidate)

    # Largest font size among all blocks.
    max_tfs = max(candidate[1] for candidate in dev.blocks)
    verbose('max_tfs: ', max_tfs)
    # Blocks set in that largest font size.
    max_blocks = [candidate for candidate in dev.blocks if candidate[1] == max_tfs]

    # Highest y coordinate among them: the block closest to the top.
    max_y = max(candidate[3] for candidate in max_blocks)
    verbose('max_y: ', max_y)
    found_blocks = [candidate for candidate in max_blocks if candidate[3] == max_y]

    verbose('found blocks')
    for candidate in found_blocks:
        verbose(candidate)

    title = ''.join(found_blocks[0][4]).strip()

    # Retrieve missing spaces if needed.
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)

    # Remove duplicate spaces if any are present.
    if " " in title:
        title = " ".join(title.split())

    return title
Example #10
Source File: extracthl.py From Menotexport with GNU General Public License v3.0 | 4 votes |
def extractHighlights(filename,anno,verbose=True):
    '''Extract highlighted texts from a PDF

    Pages are numbered from 1. Only pages listed in <anno.hlpages> are
    processed; on each of them the layout boxes are ordered with sortDiag()
    and fineTuneOrder(), and every box whose exact type is LTTextBox or
    LTTextBoxHorizontal is matched against the page's highlights with
    findStrFromBox().

    <filename>: passed to init() to obtain document, interpreter and device.
    <anno>: object providing <hlpages>, <highlights> (indexed by page
            number) and <meta> (keys 'title', 'citationkey', 'tags').
    <verbose>: not used in this function.

    Return: list of Anno objects, one per box in which highlighted text was
            found; an empty list if <anno.hlpages> is empty.
    '''

    hlpages=anno.hlpages
    if len(hlpages)==0:
        return []

    #--------------Get pdfminer instances--------------
    document, interpreter, device=init(filename)

    #----------------Loop through pages----------------
    hltexts=[]

    for page_idx,page in enumerate(PDFPage.create_pages(document)):
        page_no=page_idx+1

        #---------Skip pages without highlights---------
        if not (len(hlpages)>0 and page_no in hlpages):
            continue

        anno_total=len(anno.highlights[page_no])
        anno_found=0

        interpreter.process_page(page)
        layout = device.get_result()

        #----------Sort boxes diagonally, then refine----------
        boxes=fineTuneOrder(sortDiag(layout))

        #----------------Loop through boxes----------------
        for box in boxes:

            # Exact type match on purpose: subclasses are not accepted.
            if type(box) not in (LTTextBox, LTTextBoxHorizontal):
                continue

            found_text,num_found=findStrFromBox(anno.highlights[page_no],box)

            if num_found>0:
                #--------------Attach text with meta--------------
                found_text=Anno(found_text,\
                    ctime=getCtime(anno.highlights[page_no]),\
                    title=anno.meta['title'],\
                    page=page_no,citationkey=anno.meta['citationkey'],\
                    tags=anno.meta['tags'])

                hltexts.append(found_text)

            #----------------Break if all found----------------
            anno_found+=num_found
            if anno_total==anno_found:
                break

    return hltexts