Python pdfminer.layout.LTTextBox() Examples

The following are 6 code examples of pdfminer.layout.LTTextBox(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.layout, or try the search function.
Example #1
Source File: pile.py    From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License 7 votes vote down vote up
def parse_layout(self, layout):
        """Traverse ``layout`` depth-first and file each object by its exact type.

        Figures, text boxes and text lines are expanded in place so that
        their children are visited in their original order.  Horizontal
        text lines are collected in ``self.texts`` and images in
        ``self.images``.  A rectangle whose width is below 1.0 is passed to
        ``self._adjust_to_close`` and stored in ``self.verticals``;
        otherwise, one whose height is below 1.0 is handled the same way
        with ``self.horizontals``.  Any other rectangle is dropped, and
        curves, characters and lines are ignored.

        Matching is done on the exact class, not with ``isinstance``, so
        subclasses of the listed types are not recognised.  Any type that
        is not listed trips the assertion at the bottom of the loop.
        """
        pending = list(layout)
        pending.reverse()
        while pending:
            node = pending.pop()
            kind = type(node)

            if kind in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
                # Push the children reversed so pop() yields them in order.
                pending.extend(reversed(list(node)))
                continue

            if kind == LTTextLineHorizontal:
                self.texts.append(node)
                continue

            if kind == LTRect:
                # Width is tested before height, so a rectangle that is thin
                # in both directions is stored as a vertical.
                if node.width < 1.0:
                    self._adjust_to_close(node, self.verticals, 'x0')
                    self.verticals.append(node)
                elif node.height < 1.0:
                    self._adjust_to_close(node, self.horizontals, 'y0')
                    self.horizontals.append(node)
                continue

            if kind == LTImage:
                self.images.append(node)
                continue

            if kind in (LTCurve, LTChar, LTLine):
                # Deliberately ignored object types.
                continue

            assert False, "Unrecognized type: %s" % kind
Example #2
Source File: pdf.py    From ChemDataExtractor with MIT License 5 votes vote down vote up
def _process_layout(self, layout):
        """Process an LTPage layout and return a list of elements."""
        # Here we just group text into paragraphs
        elements = []
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                elements.append(Paragraph(lt_obj.get_text().strip()))
            elif isinstance(lt_obj, LTFigure):
                # Recursive...
                elements.extend(self._process_layout(lt_obj))
        return elements 
Example #3
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 5 votes vote down vote up
def get_text_obj(cls, obj, index, regexp, text):
        """Return the first object in ``obj``'s tree whose entry text equals ``text``.

        ``obj`` itself is checked first.  If it does not match and it is an
        ``LTTextBox``, its children are searched depth-first in iteration
        order.

        ``index`` and ``regexp`` are not used by this function; they are
        forwarded unchanged to the recursive calls so that the call matches
        the signature.  The original recursion passed only ``(child, text)``
        and raised ``TypeError`` on any text box that did not match.

        Returns the matching object, or ``None`` when nothing matches.
        """
        if cls.get_entry_text(obj) == text:
            return obj
        if isinstance(obj, layout.LTTextBox):
            for child in obj:
                found = cls.get_text_obj(child, index, regexp, text)
                # Compare against None explicitly: a matching object that
                # evaluates false must still be returned.
                if found is not None:
                    return found
        return None
Example #4
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 5 votes vote down vote up
def try_add_field(cls, t, obj, results, nrows, nameoffset=0):
        """Offer ``obj`` to ``cls._try_add``, descending into text boxes on failure.

        A text line is passed to ``cls._try_add`` once and its result is
        ignored.  A text box is passed first as a whole; only if
        ``cls._try_add`` returns a false value is each of its children
        processed recursively.  Objects of any other type are ignored.
        Always returns ``None``.
        """
        if isinstance(obj, layout.LTTextLine):
            cls._try_add(t, obj, results, nrows, nameoffset)
            return
        if not isinstance(obj, layout.LTTextBox):
            return
        if cls._try_add(t, obj, results, nrows, nameoffset):
            # The box was accepted as a whole, so do not look inside it.
            return
        for child in obj:
            cls.try_add_field(t, child, results, nrows, nameoffset)
Example #5
Source File: parse_am37x_register_tables.py    From bootloader_instrumentation_suite with MIT License 5 votes vote down vote up
def count_rows(cls, t, o, offset=0):
        """Count the text lines under ``o`` that qualify as rows of table ``t``.

        A text box contributes the sum of the counts of its children.  A
        text line counts as 1 when its left bbox coordinate plus ``offset``
        is within 0.2 of ``info.l`` and ``info.regex`` matches its entry
        text, where ``info`` is ``t.col_info[TITable.NAME]``.  It counts as 0
        otherwise.  Any other object counts as 0.
        """
        name_col = t.col_info[TITable.NAME]
        if isinstance(o, layout.LTTextBox):
            return sum(cls.count_rows(t, child, offset) for child in o)
        if isinstance(o, layout.LTTextLine):
            entry = cls.get_entry_text(o)
            left_edge = o.bbox[0] + offset
            # Position is tested first; the regex only runs on aligned lines.
            if abs(name_col.l - left_edge) < 0.2 and name_col.regex.match(entry):
                return 1
        return 0
Example #6
Source File: pdfConverter.py    From Forager with MIT License 4 votes vote down vote up
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` and return it as one string.

    Each page is run through a ``PDFPageAggregator``.  The text of every
    top-level ``LTTextBox`` or ``LTTextLine`` in the page layout is
    appended, in iteration order.  Objects nested inside other layout
    objects (for example figures) are not visited.

    The file is opened in binary mode and is closed when the function
    returns or raises; the original left the handle open.
    """
    chunks = []
    # The parser reads from the file while pages are processed, so all the
    # work has to happen while the handle is still open.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    chunks.append(lt_obj.get_text())
    # Join once at the end instead of growing a string with repeated +=.
    return ''.join(chunks)