Python pdfminer.pdfdocument.PDFDocument() Examples

The following are 18 code examples of pdfminer.pdfdocument.PDFDocument(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.pdfdocument, or try the search function.

Example #1

Source File: pdftk.py From docassemble with MIT License

6 votes

def read_fields(pdffile):
    """Collect the AcroForm fields of a PDF file.

    Args:
        pdffile: Path of the PDF file to inspect.

    Returns:
        The list filled in by ``recursively_add_fields``, sorted with
        ``fieldsorter`` as the key, or ``None`` when the document catalog
        has no ``AcroForm`` entry.
    """
    outfields = []
    # The context manager closes the file on every exit path, including the
    # early ``return None`` below.
    with open(pdffile, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        # Map each page's object id to its 1-based page number.
        id_to_page = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(doc), start=1)
        }
        if 'AcroForm' not in doc.catalog:
            return None
        fields = resolve1(doc.catalog['AcroForm'])['Fields']
        # Kept inside the ``with`` block: resolving the field objects may
        # still need to read from the open file.
        recursively_add_fields(fields, id_to_page, outfields)
    return sorted(outfields, key=fieldsorter)

Example #2

Source File: parse_am37x_register_tables.py From bootloader_instrumentation_suite with MIT License

6 votes

def process_pdf(cls, pdf, output, verbose=False, tables=None):
        """Locate the "List of Tables" entry of a PDF and process the page it targets.

        Every object id listed in the document's xrefs is examined. For the
        first dict object whose ``'Title'`` contains ``"List of Tables"``, the
        page referenced by its ``'A'`` -> ``'D'[0]`` destination is looked up
        and handed to ``cls.process_table_index``; the function then returns.

        Args:
            pdf: Object passed straight to ``pdfparser.PDFParser``.
            output: Forwarded unchanged to ``cls.process_table_index``.
            verbose: Forwarded unchanged to ``cls.process_table_index``.
            tables: Forwarded unchanged to ``cls.process_table_index``.

        Returns:
            None. Nothing is processed when no matching object is found.
        """
        parser = pdfparser.PDFParser(pdf)
        document = pdfdocument.PDFDocument(parser)
        rsrcmgr = pdfinterp.PDFResourceManager(caching=True)

        params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                                 line_overlap=0.4, boxes_flow=0.5)
        device = converter.PDFPageAggregator(rsrcmgr, laparams=params)

        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        # The return value is not used; the call is kept so that any error it
        # raises for this document still surfaces here.
        # NOTE(review): confirm whether this call is needed at all.
        document.get_outlines()
        # Zero-based page number -> page object.
        pages = dict(enumerate(pdfpage.PDFPage.create_pages(document)))
        for xref in document.xrefs:
            for oid in xref.get_objids():
                obj = document.getobj(oid)
                if not isinstance(obj, dict):
                    continue
                # Plain ``in`` / ``items()`` work on both Python 2 and 3,
                # unlike ``iterkeys()`` / ``iteritems()``.
                if "Title" in obj and "List of Tables" in obj['Title']:
                    pageoid = obj['A'].resolve()['D'][0].objid
                    (pageno, page) = [(pn, p) for (pn, p) in pages.items()
                                      if p.pageid == pageoid][0]
                    cls.process_table_index(parser, document, rsrcmgr, params, device,
                                            interpreter, pages, page, pageno, output,
                                            verbose, tables)
                    return

Example #3

Source File: extractFillableFields.py From opentaxforms with GNU Affero General Public License v3.0

5 votes

def xmlFromPdf(pdfpath, xmlpath=None):
    '''Find the XFA template stream in a PDF file and return it as an XML tree.

    Every object id listed in the document's xrefs is inspected; the first
    stream whose decoded data contains ``b'xfa-template'`` is parsed with
    ``etree.fromstring``. When ``xmlpath`` is given, the pretty-printed tree
    is also written to that path.

    Raises CrypticXml when no such stream is found.
    '''
    xfa_bytes = None
    with open(pdfpath, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        object_ids = set(oid for table in document.xrefs
                         for oid in table.get_objids())
        for oid in object_ids:
            candidate = document.getobj(oid)
            if isinstance(candidate, PDFStream):
                try:
                    payload = candidate.get_data()
                except PDFNotImplementedError:
                    # e.g. a jpeg image: "Unsupported filter: /DCTDecode"
                    continue
                if b'xfa-template' in payload:
                    xfa_bytes = payload
                    break
        if xfa_bytes is None:
            raise CrypticXml('Cannot find form data in %s' % pdfpath)
    # xfa_bytes now holds the form's XML text
    tree = etree.fromstring(xfa_bytes)
    if xmlpath is None:
        return tree
    with open(xmlpath, 'wb') as xml_file:
        xml_file.write(etree.tostring(tree, pretty_print=True))
    return tree

Example #4

Source File: extracthl.py From Menotexport with GNU General Public License v3.0

5 votes

def init(filename,verbose=True):
    '''Open a PDF and build the objects needed for layout analysis.

    Args:
        filename: Path of the PDF file to open.
        verbose: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A ``(document, interpreter, device)`` tuple, where ``device`` is a
        ``PDFPageAggregator`` using default ``LAParams`` and ``interpreter``
        is a ``PDFPageInterpreter`` bound to that device.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is false.
    '''

    # The file is left open on success: the returned document is built on a
    # parser that reads from it.
    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
    except BaseException:
        # Nothing is returned on failure, so nobody else could close the file.
        fp.close()
        raise

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object and the interpreter that feeds it.
    # Only this device/interpreter pair is returned, so no throwaway
    # PDFDevice is built first.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device
    



#----------------Get the latest creation time of annos----------------

Example #5

Source File: extracthl2.py From Menotexport with GNU General Public License v3.0

5 votes

def init(filename,verbose=True):
    '''Prepare a PDF document together with a layout-analysis interpreter and device.

    Args:
        filename: Path of the PDF file to open.
        verbose: Accepted for compatibility with existing callers; this
            function does not read it.

    Returns:
        The tuple ``(document, interpreter, device)``: the parsed
        ``PDFDocument``, a ``PDFPageInterpreter`` and the
        ``PDFPageAggregator`` (default ``LAParams``) it renders into.

    Raises:
        PDFTextExtractionNotAllowed: When the document reports that it is
            not extractable.
    '''

    fp = open(filename, 'rb')
    opened_ok = False
    try:
        # The document is built on a parser that reads from the file object.
        document = PDFDocument(PDFParser(fp))
        # Abort if the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        opened_ok = True
    finally:
        # Close the file only on failure; on success the returned document
        # still depends on it.
        if not opened_ok:
            fp.close()

    # Shared resource store, then the aggregating device and its interpreter.
    # A plain PDFDevice/interpreter pair is not built here because it would
    # never be returned.
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device


#----------------Get the latest creation time of annos----------------

Example #6

Source File: outline.py From xlinkBook with MIT License

5 votes

def getToc(self, pdfPath):
        """Return the outline of a PDF as a list of ``(level, title)`` tuples.

        Args:
            pdfPath: Path of the PDF file to read.

        Returns:
            One ``(level, title)`` tuple per entry yielded by
            ``document.get_outlines()``, in the order they are yielded.
        """
        # The list is built before the ``with`` block exits, so the file can
        # be closed safely instead of being leaked.
        with open(pdfPath, 'rb') as infile:
            document = PDFDocument(PDFParser(infile))
            return [(level, title)
                    for (level, title, _dest, _action, _structelem)
                    in document.get_outlines()]

Example #7

Source File: extract.py From x86doc with The Unlicense

5 votes

def main(argv):
	"""Convert every PDF named in ``argv[1:]`` with ``x86ManParser``.

	Each page is run through a ``PDFPageInterpreter``. The resulting layout
	is fed to an ``x86ManParser("html", ...)`` instance, which is flushed
	after the last page. A success/total summary is printed per file.

	Args:
		argv: Argument list; ``argv[0]`` is skipped.

	Returns:
		1 as soon as a document reports it is not extractable, otherwise
		None.
	"""
	for arg in argv[1:]:
		# PDF data is binary, so the file is opened in 'rb'. The ``with``
		# block also closes it on the early return below.
		with open(arg, 'rb') as fd:
			document = PDFDocument(PDFParser(fd))
			if not document.is_extractable:
				print("Document not extractable.")
				return 1

			params = LAParams(char_margin=1)
			resMan = PDFResourceManager(caching=True)
			device = PDFPageAggregator(resMan, laparams=params)
			interpreter = PDFPageInterpreter(resMan, device)
			# Separate name: this is the output parser, not the PDF parser.
			man_parser = x86ManParser("html", params)

			pages = PDFPage.get_pages(fd, set(), caching=True, check_extractable=True)
			for i, page in enumerate(pages, 1):
				# Single-argument print(...) behaves the same on Python 2 and 3.
				print("Processing page %i" % i)
				interpreter.process_page(page)
				man_parser.process_page(device.get_result())
			man_parser.flush()

		print("Conversion result: %i/%i" % (man_parser.success, man_parser.success + man_parser.fail))

Example #8

Source File: formatFun.py From China_stock_announcement with MIT License

5 votes

def p2t(sourcefile, outfile):
    """Append the text of a PDF's horizontal text boxes to a UTF-8 text file.

    Args:
        sourcefile: Path of the PDF file to read.
        outfile: Path of the text file to append to. It is only opened for
            pages that contain at least one ``LTTextBoxHorizontal``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    import io  # local import: io.open takes an encoding on Python 2 and 3

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser for the file.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        try:
            document = PDFDocument(parser)
        except Exception:
            # The file could not be parsed as a PDF; report it and give up.
            print(sourcefile + ' :pdf未正确下载')
            return
        # Check whether the document allows text extraction. This is only
        # reported; extraction is still attempted afterwards.
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create the aggregating device and the interpreter that feeds it.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the LTPage object produced for this page.
            layout = device.get_result()
            texts = [x.get_text() for x in layout
                     if isinstance(x, LTTextBoxHorizontal)]
            if not texts:
                continue
            # Write text, not bytes: ``text.encode('utf-8') + '\n'`` raises
            # TypeError on Python 3. The file is opened once per page rather
            # than once per text box.
            with io.open(outfile, 'a', encoding='utf-8') as f:
                for text in texts:
                    f.write(text + '\n')
        print(sourcefile + '  已转为 ' + outfile)

############################################## Convert doc to txt ##############################################
# Before calling this, make sure catdoc is installed on your Linux system

Example #9

Source File: pdf.py From pdfplumber with MIT License

5 votes

def __init__(self,
        stream,
        pages = None,
        laparams = None,
        precision = 0.001,
        password = ""
    ):
        """Parse ``stream`` as a PDF and prepare metadata, device and interpreter.

        Args:
            stream: Object handed to ``PDFParser``; stored as ``self.stream``.
            pages: Stored unchanged as ``self.pages_to_parse``.
            laparams: ``None``, or a mapping expanded into ``LAParams(**laparams)``.
            precision: Stored unchanged as ``self.precision``.
            password: Passed to ``PDFDocument``.
        """
        self.laparams = None if laparams is None else LAParams(**laparams)
        self.stream = stream
        self.pages_to_parse = pages
        self.precision = precision
        rsrcmgr = PDFResourceManager()
        self.doc = PDFDocument(PDFParser(stream), password = password)
        # Merge every info dictionary of the document; later ones override
        # earlier ones on key clashes.
        self.metadata = {}
        for info in self.doc.info:
            self.metadata.update(info)
        # Iterate over a snapshot because the values are replaced in the loop.
        for k, v in list(self.metadata.items()):
            if hasattr(v, "resolve"):
                v = v.resolve()
            if isinstance(v, list):
                self.metadata[k] = list(map(decode_text, v))
            elif isinstance(v, PSLiteral):
                self.metadata[k] = decode_text(v.name)
            elif isinstance(v, bool):
                self.metadata[k] = v
            else:
                self.metadata[k] = decode_text(v)
        self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

Example #10

Source File: parser.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License

5 votes

def _read_file(self, filename):
        """Open ``filename`` in binary mode and return a ``PDFDocument`` built from it."""
        # The stream is not closed here; presumably the returned document
        # keeps reading from it -- confirm before adding a close.
        pdf_stream = open(filename, 'rb')
        return PDFDocument(PDFParser(pdf_stream))

Example #11

Source File: pdfjinja.py From pdfjinja with MIT License

5 votes

def parse_pdf(self, fp):
        """Interpret every page of a PDF and parse the annotations of each page.

        Args:
            fp: Object handed to ``PDFParser``.

        Each page is processed with a ``PDFPageInterpreter`` bound to a plain
        ``PDFDevice``. ``self.parse_annotations(pgnum, page)`` is called for
        every page whose ``annots`` is truthy, with ``pgnum`` counted from 0.
        """
        doc = PDFDocument(PDFParser(fp))
        rsrcmgr = PDFResourceManager()
        # Only the PDFDevice-based interpreter is ever used for processing,
        # so no LAParams/PDFPageAggregator pair is built here.
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            if page.annots:
                self.parse_annotations(pgnum, page)

Example #12

Source File: scihub_api.py From SciHubEVA with MIT License

5 votes

def get_pdf_metadata(self, pdf):
        """Extract author, title and year from the raw bytes of a PDF.

        Args:
            pdf: PDF content, written to a temporary file for parsing.

        Returns:
            A dict with the keys ``'author'``, ``'title'`` and ``'year'``.
            Each keeps its ``'UNKNOWN_*'`` placeholder unless a non-empty
            value is obtained from the first info dictionary of the document
            (``Author``, ``Title`` and ``ModDate`` respectively).
        """
        metadata = {'author': 'UNKNOWN_AUTHOR',
                    'title': 'UNKNOWN_TITLE',
                    'year': 'UNKNOWN_YEAR'}

        # The context manager closes the temporary file on every exit path,
        # including an error while the parser is created.
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)
            pdf_parser = PDFParser(temp_pdf_file)

            try:
                pdf_doc = PDFDocument(pdf_parser)
                pdf_metadata = pdf_doc.info[0]

                author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
                if author:
                    metadata['author'] = author

                title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
                if title:
                    metadata['title'] = title

                year = pdf_metadata_moddate_to_year(
                    make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
                if year:
                    metadata['year'] = year
            except Exception:
                # Best effort: any failure to parse or read the metadata
                # leaves the remaining placeholders in place.
                pass

        return metadata

Example #13

Source File: parse_pdf.py From GraphIE with GNU General Public License v3.0

5 votes

def parse_case(case_path):
    """Parse all the pdf files in the folder and dump the result as JSON.

    For every non-hidden ``*.pdf`` entry of ``case_path``, the size and the
    ``parse_text`` output of each page are collected. The collected data is
    written to ``parsed.json`` inside the same folder.

    Args:
        case_path: Folder path. It is concatenated directly with file names
            and its second-to-last ``'/'``-separated component is used as
            the id, so it is expected to end with ``'/'``.

    Returns:
        None. On any error, ``"Error " + case_path`` is printed instead of
        raising.
    """
    try:
        result = {
            'id': case_path.split('/')[-2],
            'docs': {}
        }

        for name in os.listdir(case_path):
            if name.startswith('.') or not name.endswith('.pdf'):
                continue
            doc_id = name.split('.')[0]
            doc_obj = result['docs'][doc_id] = {'pages': {}}

            # Close each PDF once its pages have been processed.
            with open(case_path + name, 'rb') as fp:
                doc = PDFDocument(PDFParser(fp))
                rsrcmgr = PDFResourceManager()
                laparams = LAParams(detect_vertical=True, all_texts=True)
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    doc_obj['pages'][layout.pageid] = {
                        'size': (layout.width, layout.height),
                        'text': parse_text(layout)
                    }

        # Closing the output file guarantees the JSON is flushed to disk.
        with open(case_path + 'parsed.json', 'w') as output:
            json.dump(result, output, indent=None)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        print("Error " + case_path)

    return None

Example #14

Source File: pdf.py From PassportEye with MIT License

5 votes

def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or the data of the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    Only image data starting with the bytes ``FF D8 FF E0`` is accepted.

    :param fstream: Readable binary stream of the PDF
    :return: the whole contents of the first matching image stream, or None if extraction failed.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image: prefer the decoded data, and fall back to
                # the raw stream bytes when decoding fails.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # ``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata

    return None

Example #15

Source File: pdf_utils.py From pdftotree with MIT License

5 votes

def analyze_pages(file_name, char_margin=1.0):
    """
    Generator over the page layouts of a PDF file.

    Input: the file path to the PDF file, and the ``char_margin`` passed to
    ``LAParams``
    Output: yields the layout object for each page in the PDF; a page whose
    processing raises ``OverflowError`` is logged and skipped
    """
    logger = logging.getLogger(__name__)
    resolved_path = os.path.realpath(file_name)
    with open(resolved_path, "rb") as pdf_file:
        # The document is built on a parser bound to the open file; an empty
        # password is supplied for initialization.
        pdf_document = PDFDocument(PDFParser(pdf_file), password="")
        # Shared resources, layout parameters, aggregating device and the
        # interpreter that drives it.
        resources = PDFResourceManager()
        layout_params = LAParams(
            char_margin=char_margin, word_margin=0.1, detect_vertical=True
        )
        aggregator = CustomPDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        for page_num, page in enumerate(PDFPage.create_pages(pdf_document)):
            try:
                page_interpreter.process_page(page)
            except OverflowError as oe:
                logger.exception(
                    "{}, skipping page {} of {}".format(oe, page_num, file_name)
                )
            else:
                yield aggregator.get_result()

Example #16

Source File: data_mine.py From Bluto with GNU General Public License v3.0

4 votes

def pdf_read(pdf_file_list):
	"""Collect author names and creator software from the metadata of PDF files.

	For each file, the ``Creator`` and ``Author`` entries of the first info
	dictionary are read, and every run of non-alphanumeric characters is
	replaced by a single space. A value that starts with
	whitespace-separated single characters has the spaces removed from that
	leading match. Author names are title-cased.

	Args:
		pdf_file_list: Iterable of PDF file paths.

	Returns:
		A ``(user_names, software_list)`` tuple of lists. A file that raises
		any exception contributes nothing to either list.
	"""
	info('Extracting PDF MetaData')
	software_list = []
	user_names = []
	# Raw string: ``\s`` and ``\w`` are regex escapes, not string escapes.
	# Compiled once instead of on every iteration.
	spaced_letters = re.compile(r'(\s\w\s+(\w\s+)+\w)')

	def squeeze(text):
		# Drop the spaces from a leading run of spaced-out single letters;
		# return the text unchanged when it does not start with one.
		match = spaced_letters.match(text)
		if match:
			return str(match.group(1)).replace(' ', '')
		return text

	for filename in pdf_file_list:
		info(filename)
		try:
			# The ``with`` block closes the file even when parsing fails.
			with open(filename, 'rb') as fp:
				doc = PDFDocument(PDFParser(fp))
				software = re.sub('[^0-9a-zA-Z]+', ' ', doc.info[0]['Creator'])
				person = re.sub('[^0-9a-zA-Z]+', ' ', doc.info[0]['Author'])
			if person:
				user_names.append(str(squeeze(person)).title())
			if software:
				software_list.append(squeeze(software))
		except (IndexError, pdfminer.pdfparser.PDFSyntaxError, KeyError, TypeError):
			# Missing or malformed metadata: skip this file silently.
			continue
		except Exception:
			info('An Unhandled Exception Has Occured, Please Check The Log For Details' + INFO_LOG_FILE)
			continue
	info('Finished Extracting PDF MetaData')
	return (user_names, software_list)



#Extract Author MS FILES

Example #17

Source File: pdftitle.py From pdftitle with GNU General Public License v3.0

4 votes

def get_title_from_io(pdf_io):
    """Guess the title of a PDF from the text blocks of its first page.

    Among the blocks collected by ``TextOnlyDevice``, those with the largest
    value at index 1 (the font size) are kept. Of these, the first one with
    the largest value at index 3 (the y coordinate) is chosen, and its
    index-4 items are joined and stripped to form the title.

    Returns the title string, or None when the document is not extractable.
    """
    # pylint: disable=too-many-locals
    # if the pdf is protected with a pwd, PDFDocument's 2nd param is the password
    document = PDFDocument(PDFParser(pdf_io))

    # the pdf may not allow extraction
    if not document.is_extractable:
        return None

    resources = PDFResourceManager()
    text_device = TextOnlyDevice(resources)
    block_interpreter = TextOnlyInterpreter(resources, text_device)

    page_buffer = StringIO()
    text_converter = TextConverter(resources, page_buffer, laparams=LAParams())
    text_interpreter = PDFPageInterpreter(resources, text_converter)

    # only the first page is processed
    for page in PDFPage.create_pages(document):
        block_interpreter.process_page(page)
        text_interpreter.process_page(page)
        break

    text_converter.close()
    first_page_text = page_buffer.getvalue()
    page_buffer.close()
    text_device.recover_last_paragraph()
    verbose('all blocks')

    for blk in text_device.blocks:
        verbose(blk)

    # largest font size among all blocks
    max_tfs = max(text_device.blocks, key=lambda blk: blk[1])[1]
    verbose('max_tfs: ', max_tfs)
    # every block that uses that font size
    largest_blocks = [blk for blk in text_device.blocks if blk[1] == max_tfs]
    # highest y coordinate among them, i.e. the one closest to the top
    max_y = max(largest_blocks, key=lambda blk: blk[3])[3]
    verbose('max_y: ', max_y)
    top_blocks = [blk for blk in largest_blocks if blk[3] == max_y]
    verbose('found blocks')

    for blk in top_blocks:
        verbose(blk)
    chosen = top_blocks[0]
    title = ''.join(chosen[4]).strip()

    # Retrieve missing spaces if needed
    if " " not in title:
        title = retrieve_spaces(first_page_text, title)

    # Collapse duplicate spaces if any are present
    if "  " in title:
        title = " ".join(title.split())

    return title

Example #18

Source File: pdfsheet.py From avrae with GNU General Public License v3.0

4 votes

def main():
    """Dump the form fields of a PDF character sheet to JSON.

    Prompts for a PDF filename and reads the ``Fields`` of its ``AcroForm``.
    Builds a dict mapping each field's ``T`` entry (decoded as iso-8859-1 and
    stripped) to its ``V`` entry, prints it and writes it to
    ``./output/pdfsheet-test.json``.

    Raises:
        Exception: If the AcroForm fields cannot be read from the document.
    """
    fn = input("PDF filename: ")
    character = {}
    with open(fn, mode='rb') as f:
        doc = PDFDocument(PDFParser(f))
        try:
            fields = resolve1(doc.catalog['AcroForm'])
            fields = resolve1(fields['Fields'])
        except Exception as exc:
            # Chain the original error so the real cause stays visible.
            raise Exception('This is not a form-fillable character sheet!') from exc
        for i in fields:
            field = resolve1(i)
            name, value = field.get('T'), field.get('V')
            if name is None:
                # A field without a 'T' entry has no name to key on; skip it
                # instead of crashing on ``None.decode``.
                continue
            if isinstance(value, PSLiteral):
                value = value.name
            elif value is not None:
                try:
                    value = value.decode('iso-8859-1').strip()
                except Exception:
                    # Best effort: a value that cannot be decoded is kept
                    # as-is.
                    pass

            character[name.decode('iso-8859-1').strip()] = value

        print(character)
    with open('./output/pdfsheet-test.json', mode='w') as f:
        json.dump(character, f, skipkeys=True, sort_keys=True, indent=4)