Python pdfminer.pdfpage.PDFPage.get_pages() Examples

The following are 24 code examples of pdfminer.pdfpage.PDFPage.get_pages(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.pdfpage.PDFPage , or try the search function .
Example #1
Source File: utils.py    From ResumeParser with MIT License 12 votes vote down vote up
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle, codec='utf-8', laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            page_interpreter.process_page(page)
 
            text = fake_file_handle.getvalue()
            yield text
 
            # close open handles
            converter.close()
            fake_file_handle.close() 
Example #2
Source File: pdf.py    From blueflower with GNU General Public License v3.0 11 votes vote down vote up
def pdf_do_pdf(astream, afile):
    outstream = io.BytesIO()
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, outstream, codec='utf-8', laparams=laparams,
                               imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(astream, set(),
                                      maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    except PDFTextExtractionNotAllowed as e:
        log_error(str(e), afile)
        return
    text = outstream.getvalue()
    text_do_data(text, afile)
    outstream.close() 
Example #3
Source File: autosumpdf.py    From autosum with MIT License 7 votes vote down vote up
def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()

    return text 
Example #4
Source File: resumeparser.py    From resume-parser with MIT License 7 votes vote down vote up
def convert(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = file(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close
    return text
#Function to extract names from the string using spacy 
Example #5
Source File: convert_pdf.py    From python-tools with MIT License 6 votes vote down vote up
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Format text or html.
        codec (str): Codec for encode the text.

    Returns:
        str: Return text or html from PDF file.

    """
    manager = PDFResourceManager()
    output = BytesIO()
    laparams = LAParams()
    if format == 'text':
        converter = TextConverter(manager, output, codec=codec, laparams=laparams)
    elif format == 'html':
        converter = HTMLConverter(manager, output, codec=codec, laparams=laparams)

    with open(input_file, 'rb') as f1:
        interpreter = PDFPageInterpreter(manager, converter)
        for page in PDFPage.get_pages(f1,
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

        converter.close()
        text = output.getvalue()
        output.close()

    return text.decode() 
Example #6
Source File: pdf_miner.py    From ocr-table with MIT License 6 votes vote down vote up
def convert(fname):
    pages=None
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = BytesIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close
    print(text)

    # write to .txt
    text_file = open("output.txt", "w")
    text = re.sub("\s\s+", " ", text.decode('utf-8'))
    text_file.write("%s" % text)
    text_file.close() 
Example #7
Source File: pdf_utils.py    From keras-english-resume-parser-and-analyzer with MIT License 6 votes vote down vote up
def pdf_to_text(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()
    result = []
    for line in text.split('\n'):
        line2 = line.strip()
        if line2 != '':
            result.append(line2)
    return result 
Example #8
Source File: Parser.py    From ioc_parser with MIT License 6 votes vote down vote up
def parse_pdf_pdfminer(self, f, fpath):
		try:
			laparams = LAParams()
			laparams.all_texts = True  
			rsrcmgr = PDFResourceManager()
			pagenos = set()

			if self.dedup:
				self.dedup_store = set()

			self.handler.print_header(fpath)
			page_num = 0
			for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
				page_num += 1

				retstr = StringIO()
				device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
				interpreter = PDFPageInterpreter(rsrcmgr, device)
				interpreter.process_page(page)
				data = retstr.getvalue()
				retstr.close()

				self.parse_page(fpath, data, page_num)
			self.handler.print_footer(fpath)
		except (KeyboardInterrupt, SystemExit):
			raise 
Example #9
Source File: pdfToText.py    From python-automation-scripts with GNU General Public License v3.0 6 votes vote down vote up
def convertPdfToText(path):  #converts all pdf pages to text
    rsrcmgr=PDFResourceManager()
    retstr=StringIO()
    codec='utf-8'
    laparams=LAParams()
    device=TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp=file(path, 'rb')
    filename=path
    interpreter=PDFPageInterpreter(rsrcmgr, device)
    maxpages=0
    caching=True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password="",caching=caching, check_extractable=True):
        interpreter.process_page(page)
    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()
    writeToText(text,absolute_path_shortner(path)) 
Example #10
Source File: converter.py    From cvscan with MIT License 6 votes vote down vote up
def pdf_to_txt(file_name):
  try:
    file_pointer = open(file_name,'rb')

    # Setting up pdf reader
    pdf_resource_manager = PDFResourceManager()
    return_string = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(pdf_resource_manager, return_string, codec=codec, \
      laparams=laparams)
    interpreter = PDFPageInterpreter(pdf_resource_manager, device)

    for page in PDFPage.get_pages(file_pointer, set(), maxpages=0, password="",
      caching=True, check_extractable=True):
      interpreter.process_page(page)
    file_pointer.close()
    device.close()

    # Get full string from PDF
    pdf_txt = return_string.getvalue()
    return_string.close()

    # logging.debug(pdf_txt)

    # Formatting removing and replacing special characters
    pdf_txt = pdf_txt.replace("\r", "\n")
    pdf_txt = re.sub(regex.bullet, " ", pdf_txt)

    return pdf_txt.decode('ascii', errors='ignore')

  except Exception, exception_instance:
    logging.error('Error converting pdf to txt: '+str(exception_instance))
    return '' 
Example #11
Source File: pdf.py    From yeti with Apache License 2.0 6 votes vote down vote up
def do_import(self, results, filepath):
        buff = StringIO()
        fp = open(filepath, 'rb')

        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()
        pagenos = set()

        page_num = 0
        for page in PDFPage.get_pages(fp, pagenos, check_extractable=True):
            page_num += 1

            device = TextConverter(
                rsrcmgr, buff, codec='utf-8', laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            interpreter.process_page(page)

            buff.write("\n")

        results.investigation.update(import_text=buff.getvalue())

        fp.close()
        buff.close() 
Example #12
Source File: youdao.py    From FengTools with MIT License 6 votes vote down vote up
def pdf_2_txt(pdf) :
    outfile = pdf + '.txt'
    args = [pdf]

    debug = 0
    pagenos = set()
    password = ''
    maxpages = 0
    rotation = 0
    codec = 'utf-8'   #输出编码
    caching = True
    imagewriter = None
    laparams = LAParams()

    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile,'w',encoding="utf8")
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,imagewriter=imagewriter)
    for fname in args:
        fp = open(fname,'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        #处理文档对象中每一页的内容
        for page in PDFPage.get_pages(fp, pagenos,
                          maxpages=maxpages, password=password,
                          caching=caching, check_extractable=True) :
            page.rotate = (page.rotate+rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return outfile 
Example #13
Source File: Converter.py    From SimplyEmail with GNU General Public License v3.0 6 votes vote down vote up
def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(
                rsrcmgr, retstr, codec=codec, laparams=laparams)
            fp = file(path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
            fp.close()
            device.close()
            retstr.close()
            return text
        except Exception as e:
            text = ""
            return text
            self.logger.error(
                "Failed to PDF to text: " + str(e)) 
Example #14
Source File: oa_pdf.py    From oadoi with MIT License 6 votes vote down vote up
def convert_pdf_to_txt(r, max_pages=3):
    text = None

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    codec = 'utf-8'
    laparams = LAParams()

    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"
    fp = StringIO(r.content_big())

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    caching = True
    pagenos = set()
    pages = PDFPage.get_pages(fp, pagenos, maxpages=max_pages, password=password, caching=caching, check_extractable=True)

    for page in pages:
        interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()
    # logger.info(text)
    return text 
Example #15
Source File: custom_t.py    From pyresparser with GNU General Public License v3.0 5 votes vote down vote up
def get_number_of_pages(file_name):
    try:
        if isinstance(file_name, io.BytesIO):
            # for remote pdf file
            count = 0
            for page in PDFPage.get_pages(
                file_name,
                caching=True,
                check_extractable=True
            ):
                count += 1
            return count
        else:
            # for local pdf file
            if file_name.endswith('.pdf'):
                count = 0
                with open(file_name, 'rb') as fh:
                    for page in PDFPage.get_pages(
                        fh,
                        caching=True,
                        check_extractable=True
                    ):
                        count += 1
                return count
            else:
                return None
    except PDFSyntaxError:
        return None 
Example #16
Source File: annotations_parser.py    From cvscan with MIT License 5 votes vote down vote up
def fetch_pdf_urls(file_name):
  try:
    links = []
    file_pointer = open(file_name,'rb')

    # Setting up pdf document
    pdf_pages = PDFPage.get_pages(file_pointer)

    # fetches URLs
    for page in pdf_pages:
      if 'Annots' in page.attrs.keys():
        link_object_list = page.attrs['Annots']
        # Due to implementation of pdfminer the link_object_list can either
        # be the list directly or a PDF Object reference
        if type(link_object_list) is not list:
          link_object_list = link_object_list.resolve()
        for link_object in link_object_list:
          if type(link_object) is not dict:
            link_object = link_object.resolve()
          if link_object['A']['URI']:
            links.append(link_object['A']['URI'])
    file_pointer.close()
    return links

  except Exception, exception_instance:
    logging.error('Error while fetching URLs : '+str(exception_instance))
    return '' 
Example #17
Source File: extract_pdfs.py    From radremedy with Mozilla Public License 2.0 5 votes vote down vote up
def process_pdf(in_path, out_path):
    """
    Processes a PDF and extracts its contents to HTML.

    Args:
        in_path: The full path to the source PDF file.
        out_path: The full path to the destination HTML file.
    """
    page_numbers=set()

    # Get source/destination file handles
    in_file = file(in_path, 'rb')
    out_file = file(out_path, 'w')

    # Set up the resource manager, device, and interpreter
    res_mgr = PDFResourceManager()
    device = HTMLConverter(res_mgr, out_file, codec='utf-8', laparams=LAParams(), imagewriter=None)
    interpreter = PDFPageInterpreter(res_mgr, device)

    for page in PDFPage.get_pages(in_file, page_numbers, 
            maxpages=0, password="", 
            caching=True, check_extractable=True):
        interpreter.process_page(page)

    # Close all the file handles
    in_file.close()
    device.close()
    out_file.close()
    return 
Example #18
Source File: utils.py    From pyresparser with GNU General Public License v3.0 5 votes vote down vote up
def get_number_of_pages(file_name):
    try:
        if isinstance(file_name, io.BytesIO):
            # for remote pdf file
            count = 0
            for page in PDFPage.get_pages(
                        file_name,
                        caching=True,
                        check_extractable=True
            ):
                count += 1
            return count
        else:
            # for local pdf file
            if file_name.endswith('.pdf'):
                count = 0
                with open(file_name, 'rb') as fh:
                    for page in PDFPage.get_pages(
                            fh,
                            caching=True,
                            check_extractable=True
                    ):
                        count += 1
                return count
            else:
                return None
    except PDFSyntaxError:
        return None 
Example #19
Source File: extract.py    From x86doc with The Unlicense 5 votes vote down vote up
def main(argv):
	for arg in argv[1:]:
		fd = open(arg)
		parser = PDFParser(fd)
		document = PDFDocument(parser)
		if not document.is_extractable:
			print "Document not extractable."
			return 1
	
		params = LAParams(char_margin=1)
		resMan = PDFResourceManager(caching=True)
		device = PDFPageAggregator(resMan, laparams=params)
		interpreter = PDFPageInterpreter(resMan, device)
		parser = x86ManParser("html", params)
	
		i = 1
		for page in PDFPage.get_pages(fd, set(), caching=True, check_extractable=True):
			print "Processing page %i" % i
			interpreter.process_page(page)
			page = device.get_result()
			parser.process_page(page)
			i += 1
		parser.flush()
		fd.close()
	
		print "Conversion result: %i/%i" % (parser.success, parser.success + parser.fail) 
Example #20
Source File: iocp.py    From connectors with Apache License 2.0 5 votes vote down vote up
def parse_pdf_pdfminer(self, f, fpath):
        try:
            list_pages = []
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()
            pagenos = set()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            page_num = 0
            for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
                page_num += 1

                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                retstr.close()
                list_pages.append(self.parse_page(fpath, data, page_num))
            self.handler.print_footer(fpath)
            return list_pages
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e) 
Example #21
Source File: helper.py    From resilient-community-apps with MIT License 5 votes vote down vote up
def extract_text_from_pdf(cls, attachment_input):
        """
        Wrapper to convert bytes data in into PDF file and extracting the text data from .pdf file
        :param attachment_input:  attachment Bytes data from resilient api call
        :return:  Text Data
        """
        # Set logs for pdfminer to ERROR as too much noise in logs
        logging.getLogger('pdfminer').setLevel(logging.ERROR)

        resource_manager = PDFResourceManager()
        # To Handle unicode conversion in python 2 and python 3
        if six.PY2:
            fake_file_handle = io.BytesIO()
        else:
            fake_file_handle = io.StringIO()

        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        extracted_input = u""

        with tempfile.NamedTemporaryFile(mode="w+b", delete=True) as temp_pdf_file:
            try:
                # Write and close temp file
                temp_pdf_file.write(attachment_input)

                # Reading the Data from Created Temp File
                for page in PDFPage.get_pages(temp_pdf_file, caching=True, check_extractable=True):
                    page_interpreter.process_page(page)

                extracted_input = fake_file_handle.getvalue()
            except Exception as error_msg:
                raise ValueError("Failed Convert .pdf files data to string format. Error: {0}".format(error_msg))
            finally:
                # close open handles
                converter.close()
                fake_file_handle.close()
        return extracted_input 
Example #22
Source File: web_pdf_reading.py    From accel-brain-code with GNU General Public License v2.0 4 votes vote down vote up
def path_to_text(self, path):
        '''
        Transform local PDF file to string.

        Args:
            path:   path to PDF file.

        Returns:
            string.

        '''
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        pages_data = PDFPage.get_pages(
            fp,
            pagenos,
            maxpages=maxpages,
            password=password,
            caching=caching,
            check_extractable=True
        )

        for page in pages_data:
            interpreter.process_page(page)

        text = retstr.getvalue()
        text = text.replace("\n", "")

        fp.close()
        device.close()
        retstr.close()
        return text 
Example #23
Source File: update_itunes.py    From xlinkBook with MIT License 4 votes vote down vote up
def itunesPdfParser(self, data):
        outfile = data+'.txt'
        fp = file(data, 'rb')
        outfp = file(outfile,'w')
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = "utf-8"
        laparams = LAParams()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.


        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            data =  retstr.getvalue()

        device.close()
        outfp.close()

        f = open(outfile,'r')
        result = ''
        course_list = []
        i = 0
        for line in f.readlines():
            if len(line.strip()) > 0:
                i += 1
                if i < 3 and line.find('…') == -1:
                    result += line.strip() + ' '
            else:
                if result.strip() == 'Explore App Store':
                    break
                if result.find('iTunes U') != -1 or result.find('Featured Featured') != -1:
                    result = ''
                    i = 0
                    continue
                course_list.append(result)
                #print result
                result = ''
                i = 0
        f.close()
        os.remove(outfile)
        return course_list 
Example #24
Source File: pdfminer_wrapper.py    From invoice2data with MIT License 4 votes vote down vote up
def to_text(path):
    """Wrapper around `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        returns extracted text from pdf

    """

    try:
        # python 2
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding("utf8")
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    with open(path, "rb") as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        pages = PDFPage.get_pages(
            fp,
            pagenos,
            maxpages=maxpages,
            password=password,
            caching=caching,
            check_extractable=True,
        )
        for page in pages:
            interpreter.process_page(page)
    device.close()
    str = retstr.getvalue()
    retstr.close()
    return str.encode("utf-8")