Python pdfminer.pdfpage.PDFPage.create_pages() Examples

The following are 10 code examples of pdfminer.pdfpage.PDFPage.create_pages(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.pdfpage.PDFPage , or try the search function .
Example #1
Source File: pdftk.py    From docassemble with MIT License 6 votes vote down vote up
def read_fields(pdffile):
    import string
    printable = set(string.printable)
    outfields = list()
    fp = open(pdffile, 'rb')
    id_to_page = dict()
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    pageno = 1;
    for page in PDFPage.create_pages(doc):
        id_to_page[page.pageid] = pageno
        pageno += 1
    if 'AcroForm' not in doc.catalog:
        return None
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter) 
Example #2
Source File: pdf.py    From pdfplumber with MIT License 5 votes vote down vote up
def pages(self):
        if hasattr(self, "_pages"): return self._pages

        doctop = 0
        pp = self.pages_to_parse
        self._pages = []
        for i, page in enumerate(PDFPage.create_pages(self.doc)):
            page_number = i+1
            if pp != None and page_number not in pp: continue
            p = Page(self, page, page_number=page_number, initial_doctop=doctop)
            self._pages.append(p)
            doctop += p.height
        return self._pages 
Example #3
Source File: pdf_utils.py    From pdftotree with MIT License 5 votes vote down vote up
def analyze_pages(file_name, char_margin=1.0):
    """
    Input: the file path to the PDF file
    Output: yields the layout object for each page in the PDF
    """
    log = logging.getLogger(__name__)
    # Open a PDF file.
    with open(os.path.realpath(file_name), "rb") as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser, password="")
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        # Create a PDF page aggregator object.
        device = CustomPDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page_num, page in enumerate(PDFPage.create_pages(document)):
            try:
                interpreter.process_page(page)
            except OverflowError as oe:
                log.exception(
                    "{}, skipping page {} of {}".format(oe, page_num, file_name)
                )
                continue
            layout = device.get_result()
            yield layout 
Example #4
Source File: pdf.py    From PassportEye with MIT License 5 votes vote down vote up
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or a string of data for the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    :param fstream: Readable binary stream of the PDF
    :return: binary stream, containing the whole contents of the JPEG image or None if extraction failed.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    for page in pages:
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if isinstance(el, LTFigure):
                for im in el:
                    if isinstance(im, LTImage):
                        # Found one!
                        st = None
                        try:
                            imdata = im.stream.get_data()
                        except:
                            # Failed to decode (seems to happen nearly always - there's probably a bug in PDFMiner), oh well...
                            imdata = im.stream.get_rawdata()
                        if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                            return imdata

    return None 
Example #5
Source File: parse_pdf.py    From GraphIE with GNU General Public License v3.0 5 votes vote down vote up
def parse_case(case_path):
    """Parse all the pdf files in the folder."""
    try:
        result = {
            'id': case_path.split('/')[-2], 
            'docs': {}
        }

        for name in os.listdir(case_path):
            if name[0] == '.' or name[-4:] != '.pdf':
                continue
            doc_id = name.split('.')[0]
            result['docs'][doc_id] = {'pages': {}}
            doc_obj = result['docs'][doc_id]

            path = case_path + name
            fp = open(path, 'rb')
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            laparams = LAParams(detect_vertical=True, all_texts=True)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                doc_obj['pages'][layout.pageid] = {
                    'size': (layout.width, layout.height),
                    'text': parse_text(layout)
                }
                # print(layout.width, layout.height)

        output = open(case_path + 'parsed.json', 'w')
        json.dump(result, output, indent=None)
    except:
        print("Error " + case_path)

    return None 
Example #6
Source File: pdfjinja.py    From pdfjinja with MIT License 5 votes vote down vote up
def parse_pdf(self, fp):
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            page.annots and self.parse_annotations(pgnum, page) 
Example #7
Source File: parser.py    From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def extract(self, max_page_num=None):
        for page in PDFPage.create_pages(self._document):
            self._interpreter.process_page(page)
            layout = self._device.get_result()

            if max_page_num != None and layout.pageid > max_page_num:
                break

            self._pages[layout.pageid] = layout 
Example #8
Source File: formatFun.py    From China_stock_announcement with MIT License 5 votes vote down vote up
def p2t(sourcefile, outfile):
    with open(sourcefile, 'rb') as fp:
        # 来创建一个pdf文档分析器
        parser = PDFParser(fp)
        #创建一个PDF文档对象存储文档结构
        try:
            document = PDFDocument(parser)
        except:
            print(sourcefile + ' :pdf未正确下载')
        # 检查文件是否允许文本提取
        else:
            if not document.is_extractable:
                print(sourcefile + ' :不允许提取文本')
             # 创建一个PDF资源管理器对象来存储共赏资源
            rsrcmgr=PDFResourceManager()
             # 设定参数进行分析
            laparams=LAParams()
             # 创建一个PDF设备对象
             # device=PDFDevice(rsrcmgr)
            device=PDFPageAggregator(rsrcmgr,laparams=laparams)
             # 创建一个PDF解释器对象
            interpreter=PDFPageInterpreter(rsrcmgr,device)
             # 处理每一页
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
             # 接受该页面的LTPage对象
                layout=device.get_result()
                for x in layout:
                 if(isinstance(x,LTTextBoxHorizontal)):
                     with open(outfile, 'a') as f:
                         f.write(x.get_text().encode('utf-8')+'\n')
            print(sourcefile + '  已转为 ' + outfile)

##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc 
Example #9
Source File: pdftitle.py    From pdftitle with GNU General Public License v3.0 4 votes vote down vote up
def get_title_from_io(pdf_io):
    # pylint: disable=too-many-locals
    parser = PDFParser(pdf_io)
    # if pdf is protected with a pwd, 2nd param here is password
    doc = PDFDocument(parser)

    # pdf may not allow extraction
    # pylint: disable=no-else-return
    if doc.is_extractable:
        rm = PDFResourceManager()
        dev = TextOnlyDevice(rm)
        interpreter = TextOnlyInterpreter(rm, dev)

        first_page = StringIO()
        converter = TextConverter(rm, first_page, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(rm, converter)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            page_interpreter.process_page(page)
            break

        converter.close()
        first_page_text = first_page.getvalue()
        first_page.close()
        dev.recover_last_paragraph()
        verbose('all blocks')

        for b in dev.blocks:
            verbose(b)

        # find max font size
        max_tfs = max(dev.blocks, key=lambda x: x[1])[1]
        verbose('max_tfs: ', max_tfs)
        # find max blocks with max font size
        max_blocks = list(filter(lambda x: x[1] == max_tfs, dev.blocks))
        # find the one with the highest y coordinate
        # this is the most close to top
        max_y = max(max_blocks, key=lambda x: x[3])[3]
        verbose('max_y: ', max_y)
        found_blocks = list(filter(lambda x: x[3] == max_y, max_blocks))
        verbose('found blocks')

        for b in found_blocks:
            verbose(b)
        block = found_blocks[0]
        title = ''.join(block[4]).strip()

        # Retrieve missing spaces if needed
        if " " not in title:
            title = retrieve_spaces(first_page_text, title)

        # Remove duplcate spaces if any are present
        if "  " in title:
            title = " ".join(title.split())

        return title
    else:
        return None 
Example #10
Source File: extracthl.py    From Menotexport with GNU General Public License v3.0 4 votes vote down vote up
def extractHighlights(filename,anno,verbose=True):
    '''Extract highlighted texts from a PDF

    '''
    hlpages=anno.hlpages
    if len(hlpages)==0:
        return []

    #--------------Get pdfmine instances--------------
    document, interpreter, device=init(filename)

    #----------------Loop through pages----------------
    hltexts=[]

    for ii,page in enumerate(PDFPage.create_pages(document)):

        #------------Get highlights in page------------
        if len(hlpages)>0 and ii+1 in hlpages:

            anno_total=len(anno.highlights[ii+1])
            anno_found=0

            interpreter.process_page(page)
            layout = device.get_result()

            #--------------Sort boxes diagnoally--------------
            objs=sortDiag(layout)

            #-----------------Refine ordering-----------------
            objs=fineTuneOrder(objs)

            #----------------Loop through boxes----------------
            for jj,objj in enumerate(objs):

                if type(objj)!=LTTextBox and\
                        type(objj)!=LTTextBoxHorizontal:
                    continue
                textjj,numjj=findStrFromBox(anno.highlights[ii+1],objj)

                if numjj>0:
                    #--------------Attach text with meta--------------
                    textjj=Anno(textjj,\
                        ctime=getCtime(anno.highlights[ii+1]),\
                        title=anno.meta['title'],\
                        page=ii+1,citationkey=anno.meta['citationkey'],\
                        tags=anno.meta['tags'])

                    hltexts.append(textjj)

                #----------------Break if all found----------------
                anno_found+=numjj
                if anno_total==anno_found:
                    break


    return hltexts