Python pdfminer.pdfinterp.PDFResourceManager() Examples

The following are 30 code examples of pdfminer.pdfinterp.PDFResourceManager(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.pdfinterp, or try the search function.
Example #1
Source File: utils.py    From ResumeParser with MIT License 12 votes vote down vote up
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    This is a generator: one string is yielded per page, and the PDF file
    stays open until the generator is exhausted or closed.

    :param pdf_path: path to PDF file to be extracted
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    # One resource manager serves every page of the document instead of
    # being rebuilt on each iteration.
    resource_manager = PDFResourceManager()
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams())
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Release the per-page handles before yielding: code placed
                # after a yield never runs if the consumer stops iterating.
                converter.close()
                fake_file_handle.close()

            yield text
Example #2
Source File: pdf.py    From blueflower with GNU General Public License v3.0 11 votes vote down vote up
def pdf_do_pdf(astream, afile):
    """Extract the text of a PDF and pass it on to ``text_do_data``.

    :param astream: file-like object handed to ``PDFPage.get_pages``
    :param afile: file name forwarded to ``log_error`` / ``text_do_data``
    :return: None; when the PDF forbids text extraction the error is logged
             and nothing is forwarded
    """
    outstream = io.BytesIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, outstream, codec='utf-8', laparams=LAParams(),
                           imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(astream, set(),
                                          maxpages=0, password='',
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        text_do_data(outstream.getvalue(), afile)
    finally:
        # Runs on the success path, the early return and on any exception,
        # so neither the device nor the buffer is left open.
        device.close()
        outstream.close()
Example #3
Source File: autosumpdf.py    From autosum with MIT License 7 votes vote down vote up
def convert_pdf_to_txt(path):
    """Return the text pdfminer extracts from every page of the PDF at ``path``.

    The result is the content of the in-memory ``BytesIO`` buffer that the
    ``TextConverter`` wrote into (utf-8 codec).
    """
    manager = PDFResourceManager()
    buffer = BytesIO()
    converter = TextConverter(manager, buffer, codec='utf-8', laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, converter)

    with open(path, 'rb') as pdf_file:
        pdf_pages = PDFPage.get_pages(pdf_file, set(),
                                      maxpages=0, password="",
                                      caching=True, check_extractable=True)
        for pdf_page in pdf_pages:
            page_interpreter.process_page(pdf_page)

    extracted = buffer.getvalue()

    converter.close()
    buffer.close()

    return extracted
Example #4
Source File: pdf_to_txt.py    From DLink_Harvester with GNU General Public License v3.0 7 votes vote down vote up
def convert(fp):
    """Return the text extracted from the PDF open on ``fp``.

    Side effect: the root logger is set to ``logging.ERROR`` and its
    ``propagate`` flag is cleared.

    :param fp: file object forwarded to ``process_pdf``
    :return: the extracted text held by the in-memory buffer
    """
    logger = logging.getLogger()
    logger.propagate = False
    logger.setLevel(logging.ERROR)
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    with StringIO() as output:
        # Build the device before entering the try block: if the constructor
        # raised inside it, the finally clause would hit an unbound `device`
        # and mask the real error with a NameError.
        device = TextConverter(rsrcmgr, output, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                        caching=caching, check_extractable=True)
            return output.getvalue()
        finally:
            device.close()
Example #5
Source File: resumeparser.py    From resume-parser with MIT License 7 votes vote down vote up
def convert(fname, pages=None):
    """Return the text of the PDF file ``fname``.

    :param fname: path of the PDF file
    :param pages: optional iterable of page numbers forwarded to
                  ``PDFPage.get_pages``; a falsy value passes an empty set
    :return: the extracted text
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # open() works on Python 2 and 3 (file() is Python 2 only) and the
        # with-block closes the PDF even when a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        converter.close()
        return output.getvalue()
    finally:
        # Actually call close(); a bare `output.close` is a no-op.
        output.close()
#Function to extract names from the string using spacy 
Example #6
Source File: pdf_miner.py    From ocr-table with MIT License 6 votes vote down vote up
def convert(fname):
    """Extract the text of the PDF ``fname``, print it and save it to output.txt.

    The raw extracted bytes are printed; the copy written to ``output.txt``
    is decoded as utf-8 with every run of two or more whitespace characters
    collapsed to a single space.

    :param fname: path of the PDF file
    :return: None
    """
    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        with open(fname, 'rb') as infile:
            # An empty page-number set is passed, as the original always did.
            for page in PDFPage.get_pages(infile, set()):
                interpreter.process_page(page)
        converter.close()
        text = output.getvalue()
    finally:
        # Actually call close(); a bare `output.close` is a no-op.
        output.close()
    print(text)

    # write to .txt
    text = re.sub(r"\s\s+", " ", text.decode('utf-8'))
    with open("output.txt", "w") as text_file:
        text_file.write("%s" % text)
Example #7
Source File: BaseTestClasses.py    From email2pdf with MIT License 6 votes vote down vote up
def getPDFText(self, filename):
        """Return the text of the PDF at ``filename``.

        Returns None when pdfminer raises ``PSException`` while processing.
        """
        try:
            with io.StringIO() as text_buffer, open(filename, 'rb') as pdf_handle:
                manager = PDFResourceManager()
                converter = TextConverter(manager, text_buffer, laparams=LAParams())
                process_pdf(manager, converter, pdf_handle, set(), maxpages=0, password="", caching=True, check_extractable=True)
                converter.close()
                return text_buffer.getvalue()
        except PSException:
            return None
Example #8
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 6 votes vote down vote up
def process_pdf(cls, pdf, output, verbose=False, tables=None):
        """Find the "List of Tables" entry of a PDF and process its page.

        Every object listed in the document's xref tables is inspected; the
        first dict object whose ``Title`` contains "List of Tables" is taken,
        the page it points at is looked up, and both are handed to
        ``cls.process_table_index``. Nothing happens when no such object
        exists.

        :param pdf: passed to ``pdfparser.PDFParser`` (presumably an open
                    binary PDF file -- TODO confirm against callers)
        :param output: forwarded unchanged to ``process_table_index``
        :param verbose: forwarded unchanged to ``process_table_index``
        :param tables: forwarded unchanged to ``process_table_index``
        :return: None
        """
        parser = pdfparser.PDFParser(pdf)
        document = pdfdocument.PDFDocument(parser)
        rsrcmgr = pdfinterp.PDFResourceManager(caching=True)

        # Layout-analysis parameters used by the page aggregator below.
        params = layout.LAParams(line_margin=0.4, word_margin=0.1, char_margin=2,
                                 line_overlap=0.4, boxes_flow=0.5)
        device = converter.PDFPageAggregator(rsrcmgr, laparams=params)

        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): `outlines` and `registers` are never read in this
        # function; get_outlines() may still matter if it raises -- confirm
        # before removing.
        outlines = document.get_outlines()
        registers = {}
        # Map 0-based page index -> page object.
        pages = dict((pageno, page) for (pageno, page)
                     in enumerate(pdfpage.PDFPage.create_pages(document)))
        for xref in document.xrefs:
            for oid in xref.get_objids():
                obj = document.getobj(oid)
                if type(obj) == dict:
                    # NOTE(review): iterkeys()/iteritems() exist only on
                    # Python 2 dicts.
                    if"Title" in obj.iterkeys() and "List of Tables" in obj['Title']:
                        # Object id of the first element of the entry's
                        # 'A' -> 'D' array (presumably the destination page
                        # of the outline action -- TODO confirm).
                        pageoid = obj['A'].resolve()['D'][0].objid
                        # First page whose pageid matches; IndexError if none.
                        (pageno, page) = [(pn, p) for (pn, p) in pages.iteritems()
                                          if p.pageid == pageoid][0]
                        cls.process_table_index(parser, document, rsrcmgr, params, device,
                                                interpreter, pages, page, pageno, output,
                                                verbose, tables)
                        # Only the first matching entry is processed.
                        return 
Example #9
Source File: metadataPDF.py    From EasY_HaCk with Apache License 2.0 6 votes vote down vote up
def getTexts(self):
		"""Extract the text of the PDF ``self.fname`` into ``self.text``.

		The text is rendered into the scratch file ``temppdf.txt`` in the
		current directory, read back, and the scratch file is removed again.

		:return: the string "ok" on success; on failure the exception
		         instance is returned (not raised)
		"""
		tmp_name = 'temppdf.txt'
		try:
			caching = True
			rsrcmgr = PDFResourceManager(caching=caching)
			try:
				# open() replaces the Python-2-only file() builtin; the
				# with-blocks close both handles even when extraction fails.
				with open(tmp_name, 'w') as outfp:
					device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
					try:
						with open(self.fname, 'rb') as fp:
							process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', caching=caching, check_extractable=True)
					finally:
						device.close()
				with open(tmp_name, 'rb') as infp:
					test = infp.read()
			finally:
				# Never leave the scratch file behind, even on failure.
				if os.path.exists(tmp_name):
					os.remove(tmp_name)
			self.text = test
			return "ok"
		except Exception as e:
			return e
Example #10
Source File: Parser.py    From ioc_parser with MIT License 6 votes vote down vote up
def parse_pdf_pdfminer(self, f, fpath):
		"""Extract the text of each page of ``f`` and pass it to ``self.parse_page``.

		``self.handler.print_header`` is called before the first page and
		``self.handler.print_footer`` after the last; pages are numbered
		from 1. KeyboardInterrupt and SystemExit are re-raised unchanged.
		"""
		try:
			layout_params = LAParams()
			layout_params.all_texts = True
			manager = PDFResourceManager()

			if self.dedup:
				self.dedup_store = set()

			self.handler.print_header(fpath)
			pdf_pages = PDFPage.get_pages(f, set(), check_extractable=True)
			for page_num, pdf_page in enumerate(pdf_pages, 1):
				# A fresh buffer per page keeps each page's text separate.
				page_buffer = StringIO()
				converter = TextConverter(manager, page_buffer, codec='utf-8', laparams=layout_params)
				PDFPageInterpreter(manager, converter).process_page(pdf_page)
				data = page_buffer.getvalue()
				page_buffer.close()

				self.parse_page(fpath, data, page_num)
			self.handler.print_footer(fpath)
		except (KeyboardInterrupt, SystemExit):
			raise
Example #11
Source File: pdfToText.py    From python-automation-scripts with GNU General Public License v3.0 6 votes vote down vote up
def convertPdfToText(path):  #converts all pdf pages to text
    """Extract the text of the PDF at ``path`` and pass it to ``writeToText``.

    The second argument given to ``writeToText`` is
    ``absolute_path_shortner(path)``.

    :param path: path of the PDF file
    :return: None
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # open() replaces the Python-2-only file() builtin and the
        # with-block closes the PDF even if a page fails to process.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    writeToText(text, absolute_path_shortner(path))
Example #12
Source File: main.py    From pdf2word with MIT License 6 votes vote down vote up
def read_from_pdf(file_path):
    """Return the text content of the PDF at ``file_path``."""
    with open(file_path, 'rb') as pdf_file:
        manager = PDFResourceManager()
        text_buffer = StringIO()

        converter = TextConverter(manager, text_buffer, laparams=LAParams())
        process_pdf(manager, converter, pdf_file)
        converter.close()

        extracted = text_buffer.getvalue()
        text_buffer.close()
    return extracted
Example #13
Source File: metadataPDF.py    From Yuki-Chan-The-Auto-Pentest with MIT License 6 votes vote down vote up
def getTexts(self):
		"""Extract the text of the PDF ``self.fname`` into ``self.text``.

		The text is rendered into the scratch file ``temppdf.txt`` in the
		current directory, read back, and the scratch file is removed again.

		:return: the string "ok" on success; on failure the exception
		         instance is returned (not raised)
		"""
		tmp_name = 'temppdf.txt'
		try:
			caching = True
			rsrcmgr = PDFResourceManager(caching=caching)
			try:
				# open() replaces the Python-2-only file() builtin; the
				# with-blocks close both handles even when extraction fails.
				with open(tmp_name, 'w') as outfp:
					device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
					try:
						with open(self.fname, 'rb') as fp:
							process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', caching=caching, check_extractable=True)
					finally:
						device.close()
				with open(tmp_name, 'rb') as infp:
					test = infp.read()
			finally:
				# Never leave the scratch file behind, even on failure.
				if os.path.exists(tmp_name):
					os.remove(tmp_name)
			self.text = test
			return "ok"
		except Exception as e:
			return e
Example #14
Source File: converter.py    From cvscan with MIT License 6 votes vote down vote up
def pdf_to_txt(file_name):
  """Convert the PDF at ``file_name`` to ASCII-only text.

  Carriage returns become newlines and everything matching
  ``regex.bullet`` is replaced by a space. Any failure is logged with
  ``logging.error`` and an empty string is returned instead.
  """
  try:
    # The with-block closes the PDF even when extraction fails part-way.
    with open(file_name, 'rb') as file_pointer:
      # Setting up pdf reader
      pdf_resource_manager = PDFResourceManager()
      return_string = StringIO()
      device = TextConverter(pdf_resource_manager, return_string,
        codec='utf-8', laparams=LAParams())
      interpreter = PDFPageInterpreter(pdf_resource_manager, device)

      for page in PDFPage.get_pages(file_pointer, set(), maxpages=0, password="",
        caching=True, check_extractable=True):
        interpreter.process_page(page)
      device.close()

      # Get full string from PDF
      pdf_txt = return_string.getvalue()
      return_string.close()

    # Formatting removing and replacing special characters
    pdf_txt = pdf_txt.replace("\r", "\n")
    pdf_txt = re.sub(regex.bullet, " ", pdf_txt)

    # Drop non-ASCII characters whether the buffer produced bytes
    # (Python 2 str) or text (Python 3 str, which has no .decode()).
    if isinstance(pdf_txt, bytes):
      return pdf_txt.decode('ascii', errors='ignore')
    return pdf_txt.encode('ascii', errors='ignore').decode('ascii')

  except Exception as exception_instance:
    logging.error('Error converting pdf to txt: '+str(exception_instance))
    return ''
Example #15
Source File: pdf.py    From yeti with Apache License 2.0 6 votes vote down vote up
def do_import(self, results, filepath):
        """Extract the text of the PDF at ``filepath`` and store it on the investigation.

        Each page's text is followed by a newline; the combined text is
        passed to ``results.investigation.update(import_text=...)``.

        :param results: object whose ``investigation.update`` receives the text
        :param filepath: path of the PDF file
        :return: None
        """
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        buff = StringIO()
        try:
            # The with-block closes the PDF even if a page fails to process.
            with open(filepath, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                    device = TextConverter(
                        rsrcmgr, buff, codec='utf-8', laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)

                    buff.write("\n")

            results.investigation.update(import_text=buff.getvalue())
        finally:
            buff.close()
Example #16
Source File: oa_pdf.py    From oadoi with MIT License 6 votes vote down vote up
def convert_pdf_to_txt(r, max_pages=3):
    """Return the text of the first pages of the PDF carried by response ``r``.

    :param r: response object; ``status_code``, ``encoding`` and
              ``content_big()`` are read from it, and ``encoding`` is set to
              "utf-8" when it is empty
    :param max_pages: forwarded to ``PDFPage.get_pages`` as ``maxpages``
    :return: the content of the utf-8 output buffer, or None when the
             status code is not 200
    """
    # Check the response before allocating any converter resources, so the
    # early return has nothing to clean up.
    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"
    # NOTE(review): StringIO over the raw content presumably relies on
    # Python 2 byte strings -- confirm which StringIO this file imports.
    fp = StringIO(r.content_big())

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp, set(), maxpages=max_pages, password="", caching=True, check_extractable=True)

        for page in pages:
            interpreter.process_page(page)

        return retstr.getvalue()
    finally:
        # Released on success and on any extraction error alike.
        device.close()
        retstr.close()
Example #17
Source File: Converter.py    From SimplyEmail with GNU General Public License v3.0 6 votes vote down vote up
def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file

        Returns the extracted text. When the file cannot be opened or
        converted the error is logged through self.logger and "" is
        returned.
        """
        try:
            # open() replaces the Python-2-only file() builtin and the
            # with-block closes the PDF on every path.
            with open(path, 'rb') as fp:
                rsrcmgr = PDFResourceManager()
                retstr = StringIO()
                device = TextConverter(
                    rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
                text = retstr.getvalue()
                device.close()
                retstr.close()
            return text
        except Exception as e:
            # Log first: a statement placed after the return never runs.
            self.logger.error(
                "Failed to PDF to text: " + str(e))
            return ""
Example #18
Source File: convert_pdf.py    From python-tools with MIT License 6 votes vote down vote up
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Format text or html.
        codec (str): Codec for encode the text.

    Returns:
        str: Return text or html from PDF file.

    Raises:
        ValueError: If format is neither 'text' nor 'html'.

    """
    # Reject unknown formats up front; otherwise `converter` would be left
    # unbound and fail later with an unrelated UnboundLocalError.
    if format not in ('text', 'html'):
        raise ValueError(
            "Unsupported format %r: expected 'text' or 'html'" % (format,))

    manager = PDFResourceManager()
    output = BytesIO()
    laparams = LAParams()
    converter_cls = TextConverter if format == 'text' else HTMLConverter
    converter = converter_cls(manager, output, codec=codec, laparams=laparams)

    try:
        with open(input_file, 'rb') as f1:
            interpreter = PDFPageInterpreter(manager, converter)
            for page in PDFPage.get_pages(f1,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)

        converter.close()
        text = output.getvalue()
    finally:
        output.close()

    # Decode with the codec the converter wrote with, not the default.
    return text.decode(codec)
Example #19
Source File: pdf_utils.py    From keras-english-resume-parser-and-analyzer with MIT License 6 votes vote down vote up
def pdf_to_text(fname, pages=None):
    """Return the non-empty, stripped text lines of the PDF file ``fname``.

    :param fname: path of the PDF file
    :param pages: optional iterable of page numbers forwarded to
                  ``PDFPage.get_pages``; a falsy value passes an empty set
    :return: list of lines with surrounding whitespace removed and blank
             lines dropped
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # The with-block closes the PDF even if a page fails to process.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        converter.close()
        text = output.getvalue()
    finally:
        output.close()

    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if line != '']
Example #20
Source File: metadataPDF.py    From ITWSV with MIT License 6 votes vote down vote up
def getTexts(self):
		"""Extract the text of the PDF ``self.fname`` into ``self.text``.

		The text is rendered into the scratch file ``temppdf.txt`` in the
		current directory, read back, and the scratch file is removed again.

		:return: the string "ok" on success; on failure the exception
		         instance is returned (not raised)
		"""
		tmp_name = 'temppdf.txt'
		try:
			caching = True
			rsrcmgr = PDFResourceManager(caching=caching)
			try:
				# open() replaces the Python-2-only file() builtin; the
				# with-blocks close both handles even when extraction fails.
				with open(tmp_name, 'w') as outfp:
					device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
					try:
						with open(self.fname, 'rb') as fp:
							process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', caching=caching, check_extractable=True)
					finally:
						device.close()
				with open(tmp_name, 'rb') as infp:
					test = infp.read()
			finally:
				# Never leave the scratch file behind, even on failure.
				if os.path.exists(tmp_name):
					os.remove(tmp_name)
			self.text = test
			return "ok"
		except Exception as e:
			return e
Example #21
Source File: helper.py    From resilient-community-apps with MIT License 5 votes vote down vote up
def extract_text_from_pdf(cls, attachment_input):
        """
        Write the attachment bytes to a temporary file and extract its text with pdfminer
        :param attachment_input:  attachment Bytes data from resilient api call
        :return:  Text Data
        :raises ValueError: when writing or converting the data fails
        """
        # pdfminer is noisy; only let its ERROR records through
        logging.getLogger('pdfminer').setLevel(logging.ERROR)

        manager = PDFResourceManager()
        # Bytes buffer on python 2, text buffer on python 3
        text_sink = io.BytesIO() if six.PY2 else io.StringIO()

        text_device = TextConverter(manager, text_sink)
        interpreter = PDFPageInterpreter(manager, text_device)

        extracted_input = u""

        with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as scratch_pdf:
            try:
                # The temp file is written and then read through the same handle
                scratch_pdf.write(attachment_input)

                pdf_pages = PDFPage.get_pages(scratch_pdf, caching=True, check_extractable=True)
                for pdf_page in pdf_pages:
                    interpreter.process_page(pdf_page)

                extracted_input = text_sink.getvalue()
            except Exception as error_msg:
                raise ValueError("Failed Convert .pdf files data to string format. Error: {0}".format(error_msg))
            finally:
                # close open handles
                text_device.close()
                text_sink.close()
        return extracted_input
Example #22
Source File: extract.py    From x86doc with The Unlicense 5 votes vote down vote up
def main(argv):
	"""Convert each PDF named in ``argv[1:]`` with ``x86ManParser``.

	Progress and a success/total summary are printed per document.

	:param argv: argument list; ``argv[0]`` is skipped
	:return: 1 as soon as a document is not extractable, otherwise None
	"""
	for arg in argv[1:]:
		# PDFs are binary data, so open in 'rb'; the with-block also closes
		# the file on the early return below.
		with open(arg, 'rb') as fd:
			document = PDFDocument(PDFParser(fd))
			if not document.is_extractable:
				print("Document not extractable.")
				return 1

			params = LAParams(char_margin=1)
			resMan = PDFResourceManager(caching=True)
			device = PDFPageAggregator(resMan, laparams=params)
			interpreter = PDFPageInterpreter(resMan, device)
			man_parser = x86ManParser("html", params)

			pdf_pages = PDFPage.get_pages(fd, set(), caching=True, check_extractable=True)
			for i, page in enumerate(pdf_pages, 1):
				# Single-argument print() behaves the same on Python 2 and 3.
				print("Processing page %i" % i)
				interpreter.process_page(page)
				man_parser.process_page(device.get_result())
			man_parser.flush()

		print("Conversion result: %i/%i" % (man_parser.success, man_parser.success + man_parser.fail))
Example #23
Source File: iocp.py    From connectors with Apache License 2.0 5 votes vote down vote up
def parse_pdf_pdfminer(self, f, fpath):
        """Parse every page of the PDF ``f`` and collect the per-page results.

        ``self.handler.print_header`` / ``print_footer`` bracket the run and
        pages are numbered from 1.

        :return: list of ``self.parse_page`` results, one per page; None when
                 an exception was reported through ``self.handler.print_error``
        """
        try:
            parsed_pages = []
            layout_params = LAParams()
            layout_params.all_texts = True
            manager = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            pdf_pages = PDFPage.get_pages(f, set(), check_extractable=True)
            for number, pdf_page in enumerate(pdf_pages, 1):
                # A fresh buffer per page keeps each page's text separate.
                page_text = StringIO()
                converter = TextConverter(manager, page_text, laparams=layout_params)
                PDFPageInterpreter(manager, converter).process_page(pdf_page)
                data = page_text.getvalue()
                page_text.close()
                parsed_pages.append(self.parse_page(fpath, data, number))
            self.handler.print_footer(fpath)
            return parsed_pages
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
Example #24
Source File: pdf_decoder.py    From anack with GNU General Public License v3.0 5 votes vote down vote up
def parse():
    """Append the text of every horizontal text box of a PDF to ``out.txt``.

    The PDF location is read from the name ``path`` defined outside this
    function. Each extracted text is also printed.

    :raises PDFTextExtractionNotAllowed: when the document does not allow
        text extraction
    """
    # Open in binary read mode; the with-block closes the file on every path.
    with open(path, 'rb') as fp:
        # Create a PDF parser from the file object
        praser = PDFParser(fp)
        # Create a PDF document
        doc = PDFDocument()
        # Connect the parser and the document object
        praser.set_document(doc)
        doc.set_parser(praser)

        # Supply the initial password; called without one here
        doc.initialize()

        # Stop if the document does not allow conversion to text
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create the PDF resource manager that manages shared resources
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process the pages one at a time
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout result for this page
            layout = device.get_result()
            # Only horizontal text boxes are written out; their text is
            # obtained with get_text().
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    with open(r'out.txt', 'a') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
Example #25
Source File: extracthl2.py    From Menotexport with GNU General Public License v3.0 5 votes vote down vote up
def init(filename,verbose=True):
    '''Initiate analysis objs

    Opens <filename> and builds the pdfminer objects needed to analyse it.
    <verbose> is accepted but not used in this function.

    Return (document, interpreter, device). The file handle is left open on
    success (presumably the returned document still reads from it -- the
    caller cannot close it through this API; confirm ownership).
    Raises PDFTextExtractionNotAllowed if text extraction is not allowed.
    '''

    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
    except Exception:
        # Nothing is returned on failure, so do not leak the handle.
        fp.close()
        raise

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()

    # Create a PDF page aggregator object and the interpreter driving it.
    # (A plain PDFDevice/interpreter pair used to be built first and was
    # immediately overwritten; it has been dropped.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device


#----------------Get the latest creation time of annos---------------- 
Example #26
Source File: extracthl.py    From Menotexport with GNU General Public License v3.0 5 votes vote down vote up
def init(filename,verbose=True):
    '''Initiate analysis objs

    Opens <filename> and builds the pdfminer objects needed to analyse it.
    <verbose> is accepted but not used in this function.

    Return (document, interpreter, device). The file handle is left open on
    success (presumably the returned document still reads from it -- the
    caller cannot close it through this API; confirm ownership).
    Raises PDFTextExtractionNotAllowed if text extraction is not allowed.
    '''

    fp = open(filename, 'rb')
    try:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
    except Exception:
        # Nothing is returned on failure, so do not leak the handle.
        fp.close()
        raise

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()

    # Create a PDF page aggregator object and the interpreter driving it.
    # (A plain PDFDevice/interpreter pair used to be built first and was
    # immediately overwritten; it has been dropped.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    return document, interpreter, device
    



#----------------Get the latest creation time of annos---------------- 
Example #27
Source File: formatFun.py    From China_stock_announcement with MIT License 5 votes vote down vote up
def p2t(sourcefile, outfile):
    """Append the text of every horizontal text box of a PDF to ``outfile``.

    A message is printed when the PDF cannot be parsed (nothing is written
    in that case), when it does not allow text extraction, and when the
    conversion has finished.

    :param sourcefile: path of the PDF file
    :param outfile: path of the text file to append to (written as utf-8)
    :return: None
    """
    import io  # local import: io.open takes `encoding` on Python 2 and 3

    with open(sourcefile, 'rb') as fp:
        # Create a PDF parser
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure
        try:
            document = PDFDocument(parser)
        except Exception:
            # `except Exception` instead of a bare except, so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(sourcefile + ' :pdf未正确下载')
            return

        # Check whether the file allows text extraction
        # NOTE(review): extraction is still attempted after this message,
        # exactly as before -- confirm whether it should stop here.
        if not document.is_extractable:
            print(sourcefile + ' :不允许提取文本')
        # Create a PDF resource manager object that stores shared resources
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis
        laparams = LAParams()
        # Create a PDF device object
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Fetch the layout result for this page
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Let the file object do the utf-8 encoding; joining
                    # encoded bytes with '\n' fails on Python 3.
                    with io.open(outfile, 'a', encoding='utf-8') as f:
                        f.write(x.get_text() + u'\n')
        print(sourcefile + '  已转为 ' + outfile)

##############################################把doc转为txt##############################################
# 调用之前要确保你在linux 下装了catdoc 
Example #28
Source File: Form.py    From opentaxforms with GNU Affero General Public License v3.0 5 votes vote down vote up
def __init__(self):
        """Create the page-aggregator device and the interpreter bound to it."""
        # The resource manager stores the shared resources; the same instance
        # is given to both the device and the interpreter.
        manager = PDFResourceManager()
        # LAParams: layout-analysis parameters (defaults)
        aggregator = PDFPageAggregator(manager, laparams=LAParams())
        self.device = aggregator
        self.interpreter = PDFPageInterpreter(manager, aggregator)
        self.textPoz = None
Example #29
Source File: parser.py    From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _prepare_tools(self):
        """Build and return a ``(device, interpreter)`` pair.

        The device is a ``PDFPageAggregator`` with default ``LAParams``; the
        interpreter shares its resource manager.
        """
        layout_params = LAParams()
        manager = PDFResourceManager()
        aggregator = PDFPageAggregator(manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(manager, aggregator)

        return aggregator, page_interpreter
Example #30
Source File: pdfjinja.py    From pdfjinja with MIT License 5 votes vote down vote up
def parse_pdf(self, fp):
        """Interpret every page of the PDF ``fp`` and parse its annotations.

        ``self.parse_annotations(pgnum, page)`` is called for each page whose
        ``annots`` is truthy; ``pgnum`` is the 0-based page index.

        :param fp: file object passed to ``PDFParser``
        :return: None
        """
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()

        # Only the plain PDFDevice is used to drive the interpreter. (A
        # PDFPageAggregator/interpreter pair used to be built first and was
        # immediately overwritten; it has been dropped.)
        device = PDFDevice(rsrcmgr)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pgnum, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            if page.annots:
                self.parse_annotations(pgnum, page)