Python pdfminer.layout.LTFigure() Examples

The following are 5 code examples of pdfminer.layout.LTFigure(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pdfminer.layout, or try the search function.

Example #1

Source File: pile.py From pdf-to-markdown with BSD 3-Clause "New" or "Revised" License

7 votes

def parse_layout(self, layout):
    """Sort the objects of a layout into this object's collections.

    The layout is walked depth-first, in the layout's own iteration order,
    using an explicit stack instead of recursion. Dispatch is on the
    *exact* class of each object (subclasses are not matched):

    * ``LTFigure``, ``LTTextBox``, ``LTTextLine`` and
      ``LTTextBoxHorizontal`` are expanded into their children;
    * ``LTTextLineHorizontal`` objects are appended to ``self.texts``;
    * ``LTRect`` objects narrower than 1.0 are appended to
      ``self.verticals``, otherwise those shorter than 1.0 are appended to
      ``self.horizontals``; any other rectangle is dropped;
    * ``LTImage`` objects are appended to ``self.images``;
    * ``LTCurve``, ``LTChar`` and ``LTLine`` objects are ignored.

    :param layout: iterable of layout objects (may be empty).
    :raises AssertionError: if an object of any other type is encountered.
    """
    # Reversed so that pop(), which takes from the end, yields the objects
    # in their original order.
    obj_stack = list(reversed(list(layout)))
    while obj_stack:
        obj = obj_stack.pop()
        obj_type = type(obj)
        if obj_type in (LTFigure, LTTextBox, LTTextLine, LTTextBoxHorizontal):
            # Container: push children so they are visited next, in order.
            obj_stack.extend(reversed(list(obj)))
        elif obj_type is LTTextLineHorizontal:
            self.texts.append(obj)
        elif obj_type is LTRect:
            # NOTE(review): _adjust_to_close is defined elsewhere; presumably
            # it aligns the rect with already-collected lines -- confirm.
            if obj.width < 1.0:
                self._adjust_to_close(obj, self.verticals, 'x0')
                self.verticals.append(obj)
            elif obj.height < 1.0:
                self._adjust_to_close(obj, self.horizontals, 'y0')
                self.horizontals.append(obj)
        elif obj_type is LTImage:
            self.images.append(obj)
        elif obj_type in (LTCurve, LTChar, LTLine):
            # Known types that are deliberately not collected.
            pass
        else:
            # Raised explicitly (rather than ``assert False``) so the check
            # still fires when Python runs with -O, which strips asserts.
            raise AssertionError("Unrecognized type: %s" % type(obj))

Example #2

Source File: node.py From pdftotree with MIT License

5 votes

def elem_type(elem):
    """Return a short string label describing the kind of layout element.

    The element is tested with ``isinstance`` against ``LTLine``,
    ``LTCurve``, ``LTTextLine`` and ``LTFigure`` in that order, and the
    label of the first match is returned. The order is significant: an
    element that is an instance of several of these classes gets the
    earliest label.

    :param elem: the layout element to classify.
    :return: ``"line"``, ``"curve"``, ``"text"`` or ``"figure"``; the
        fallback for anything else is the literal ``"unkown"`` (spelling
        kept as-is because callers may compare against it).
    """
    labelled_types = (
        (LTLine, "line"),
        (LTCurve, "curve"),
        (LTTextLine, "text"),
        (LTFigure, "figure"),
    )
    for layout_cls, label in labelled_types:
        if isinstance(elem, layout_cls):
            return label
    return "unkown"

Example #3

Source File: pdf.py From PassportEye with MIT License

5 votes

def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or the bytes of the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    Only images that are direct children of a top-level figure on a page are
    inspected, and only data starting with the bytes ``FF D8 FF E0`` is
    accepted. Pages are processed in order and the first match is returned.

    :param fstream: Readable binary stream of the PDF
    :return: bytes containing the whole contents of the JPEG image, or None if extraction failed.
    """
    parser = PDFParser(fstream)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document)
    for page in pages:
        interpreter.process_page(page)
        layout = device.result
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                # Found an image; try the decoded stream first.
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always - there's probably a bug in PDFMiner), oh well...
                    # Fall back to the undecoded stream. Only Exception is
                    # caught so KeyboardInterrupt/SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                # JPEG start-of-image marker followed by an APP0 segment
                # marker; JPEG data starting with any other segment is skipped.
                if imdata is not None and imdata.startswith(b'\xff\xd8\xff\xe0'):
                    return imdata

    return None

Example #4

Source File: pdf.py From ChemDataExtractor with MIT License

5 votes

def _process_layout(self, layout):
        """Process an LTPage layout and return a list of elements."""
        # Here we just group text into paragraphs
        elements = []
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                elements.append(Paragraph(lt_obj.get_text().strip()))
            elif isinstance(lt_obj, LTFigure):
                # Recursive...
                elements.extend(self._process_layout(lt_obj))
        return elements

Example #5

Source File: loaders.py From py-pdf-parser with MIT License

4 votes

def load(
    pdf_file: IO,
    pdf_file_path: Optional[str] = None,
    la_params: Optional[Dict] = None,
    **kwargs,
) -> PDFDocument:
    """
    Build a PDFDocument from an open pdf file.

    Each page produced by `extract_pages` contributes the `LTTextContainer`
    elements found directly on the page. If the effective layout parameters
    have a truthy `all_texts`, the `LTTextContainer` elements found directly
    inside each `LTFigure` on the page are added as well. Pages with no
    elements are logged with a warning and left out of the document.

    Args:
        pdf_file (io): The PDF file.
        pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
            for `PDFDocument`.
        la_params (dict): The layout parameters passed to PDF Miner for analysis. See
            the PDFMiner documentation here:
            https://pdfminersix.readthedocs.io/en/latest/api/composable.html#laparams.
            The given values are merged over `DEFAULT_LA_PARAMS`.
            Note that py_pdf_parser will re-order the elements it receives from PDFMiner
            so options relating to element ordering will have no effect.
        kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.

    Returns:
        PDFDocument: A PDFDocument with the file loaded.
    """
    overrides = {} if la_params is None else la_params
    # Caller-supplied values take precedence over the defaults.
    la_params = {**DEFAULT_LA_PARAMS, **overrides}
    include_figure_text = la_params.get("all_texts")

    pages: Dict[int, Page] = {}
    for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
        elements = []
        for element in page:
            if isinstance(element, LTTextContainer):
                elements.append(element)

        if include_figure_text:
            # With all_texts enabled, text may also sit inside figures.
            elements.extend(
                child
                for figure in page
                if isinstance(figure, LTFigure)
                for child in figure
                if isinstance(child, LTTextContainer)
            )

        if elements:
            pages[page.pageid] = Page(
                width=page.width, height=page.height, elements=elements
            )
        else:
            logger.warning(
                f"No elements detected on page {page.pageid}, skipping this page."
            )

    return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)