Python pyocr.builders() Examples

The following are 7 code examples of pyocr.builders(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyocr, or try the search function.
Example #1
Source Project: PDFtoTXT   Author: lucab85   File: LocalOCR.py    License: MIT License 6 votes vote down vote up
def image2txt_pyocr(self, image, do_orientation):
        """
        OCR one in-memory image and return the recognized text.

        Args:
            image: encoded image bytes; they are wrapped in a BytesIO and
                opened with PI.open() (PI is presumably PIL.Image -- confirm
                against the file's imports).
            do_orientation: when exactly True, try to detect the page
                orientation with self.tool and rotate the image before OCR.

        Returns:
            The text returned by self.tool.image_to_string(), or "" when the
            call raised pyocr.error.TesseractError (the error is printed).
        """
        txt = ""
        orientation = ""
        img_per_page = PI.open(io.BytesIO(image))

        if do_orientation is True:
            try:
                if self.tool.can_detect_orientation():
                    orientation = self.tool.detect_orientation(
                        img_per_page, lang=self.lang
                    )
                    angle = orientation["angle"]
                    if angle != 0:
                        # rotate() returns a new image instead of modifying
                        # the current one, so the result has to be kept;
                        # otherwise the OCR runs on the unrotated page.
                        # expand=True grows the canvas so a 90/270 degree
                        # turn of a non-square page is not cropped.
                        # NOTE(review): the angle is passed through with the
                        # same sign as before -- confirm it matches the
                        # direction reported by detect_orientation().
                        img_per_page = img_per_page.rotate(angle, expand=True)
            except pyocr.PyocrException as exc:
                # Best effort: OCR still runs on the image as it is.
                print("Orientation detection failed: {}".format(exc))
            print("Orientation: {}".format(orientation))

        try:
            txt = self.tool.image_to_string(
                img_per_page, lang=self.lang,
                builder=pyocr.builders.TextBuilder()
            )
        except pyocr.error.TesseractError as e:
            print("{}".format(e))
        return txt 
Example #2
Source Project: PAN_OCR   Author: 008karan   File: tesseract_ocr.py    License: MIT License 5 votes vote down vote up
def ocr_one_image(self, area, image, threadList=-1, threadNum=None):
		"""
		Run OCR on one image, print the result and optionally store it.

		The text is recognized with self.tool using the first entry of
		self.langs. When threadList is anything other than the default -1,
		the text is also written to threadList[threadNum].

		Returns the recognized text.
		"""
		print("Starting image...")
		first_lang = self.langs[0]
		text_builder = pyocr.builders.TextBuilder()
		recognized = self.tool.image_to_string(
			image, lang=first_lang, builder=text_builder
		)
		report = "==RESULT==" + str(area) + "\n" + recognized + "\n=========================="
		print(report)
		if threadList != -1:
			# Hand the result back through the caller-provided container.
			threadList[threadNum] = recognized
		return recognized 
Example #3
Source Project: paperwork-backend   Author: openpaperwork   File: page.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __get_boxes(self):
        """
        Read the boxes stored in this page's box file.

        The file is first parsed as line boxes. If that yields an empty
        list, it is parsed a second time as word boxes (the old format) and
        those are wrapped in a single line box.

        Returns an empty list when the file holds no boxes or when reading
        it raises IOError (the error is logged).
        """
        path = self.__box_path

        try:
            line_reader = pyocr.builders.LineBoxBuilder()
            with self.fs.open(path, 'r') as fd:
                line_boxes = line_reader.read_file(fd)
            if line_boxes != []:
                return line_boxes

            # Nothing found as line boxes: retry with the old word-box
            # format, which should no longer be produced.
            word_reader = pyocr.builders.WordBoxBuilder()
            with self.fs.open(path, 'r') as fd:
                word_boxes = word_reader.read_file(fd)
            if len(word_boxes) > 0:
                logger.warning("WARNING: Doc %s uses old box format" %
                               (str(self.doc)))
                # Wrap all the words in one line box positioned at the
                # first word.
                first_position = word_boxes[0].position
                return [pyocr.builders.LineBox(word_boxes, first_position)]
            return []
        except IOError as exc:
            logger.error("Unable to get boxes for '%s': %s"
                         % (self.doc.docid, exc))
            return [] 
Example #4
Source Project: paperwork-backend   Author: openpaperwork   File: page.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __set_boxes(self, boxes):
        """
        Write the given boxes to this page's box file.

        The file at self.__box_path is opened for writing through self.fs
        and filled by a pyocr LineBoxBuilder.
        """
        with self.fs.open(self.__box_path, 'w') as out:
            writer = pyocr.builders.LineBoxBuilder()
            writer.write_file(out, boxes) 
Example #5
Source Project: paperwork-backend   Author: openpaperwork   File: page.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __set_boxes(self, boxes):
        """
        Write the given boxes to this page's box file.

        The path comes from self.__get_box_path(); the file is opened for
        writing through self.fs and filled by a pyocr LineBoxBuilder.
        """
        target = self.__get_box_path()
        with self.fs.open(target, 'w') as out:
            writer = pyocr.builders.LineBoxBuilder()
            writer.write_file(out, boxes) 
Example #6
Source Project: OCR-Manga   Author: klaxa   File: Reader.py    License: GNU Affero General Public License v3.0 5 votes vote down vote up
def image_to_dict(self, image):
        """
        OCR an image as Japanese text and return the dictionary result.

        The image is enlarged three times, recognized with lang="jpn", and
        stripped of every character found in special_chars. A non-empty
        result is looked up with myougiden_api.run(); when the lookup
        returns something, it replaces the recognized text.

        Returns the final text wrapped at 120 columns, or the wrapped
        string "Nothing recognized" when the text ends up empty.
        """
        bid = self.box_oid
        width, height = image.size
        upscaled = image.resize((width * 3, height * 3), Image.BICUBIC)

        # The builder argument depends on the shape of the ORIGINAL image
        # (presumably a Tesseract page segmentation mode -- confirm):
        # 5 by default, 10 when roughly square, 7 when clearly wider than
        # tall.
        layout_mode = 5
        if width / height < 1.15 and height / width < 1.15:
            layout_mode = 10
        if width > height * 1.5:
            layout_mode = 7

        builder = pyocr.builders.TextBuilder(layout_mode)
        recognized = self.image_to_string(upscaled, lang="jpn",
                                          builder=builder)
        text = "".join(char for char in recognized.strip()
                       if char not in special_chars)

        self.draw("Looking up " + text)
        if text != "":
            entry = myougiden_api.run(text)
            if entry is not None:
                text = entry.strip("\n")
        if text == "":
            text = "Nothing recognized"
        return textwrap.fill(text, 120, replace_whitespace=False,
                             drop_whitespace=False) 
Example #7
Source Project: paperwork-backend   Author: openpaperwork   File: page.py    License: GNU General Public License v3.0 4 votes vote down vote up
def __get_boxes(self):
        """
        Return the line boxes of this page, computing them only once.

        Order of preference:
        1. the value already stored in self.__boxes;
        2. the box file at self.__get_box_path(), if it exists and can be
           read;
        3. boxes rebuilt from the text and text layout reported by
           self.pdf_page.

        The result is stored in self.__boxes before being returned.
        """
        if self.__boxes is not None:
            return self.__boxes

        # An OCR box file takes precedence over the PDF's own text.
        box_path = self.__get_box_path()
        if self.fs.exists(box_path):
            reader = pyocr.builders.LineBoxBuilder()

            try:
                with self.fs.open(box_path, 'r') as fd:
                    self.__boxes = reader.read_file(fd)
                return self.__boxes
            except IOError as exc:
                logger.error("Unable to get boxes for '%s': %s"
                             % (self.doc.docid, exc))
                # unreadable box file: continue with the PDF text below

        # Rebuild the boxes from the text embedded in the PDF page.
        page_text = self.pdf_page.get_text()
        self.__boxes = []

        layout = self.pdf_page.get_text_layout()
        if not layout[0]:
            # No usable layout: the page keeps an empty box list.
            return self.__boxes
        rects = layout[1]

        # One line box per newline-separated line, each made of one word
        # box per whitespace-separated word.
        for (line, line_rects) in custom_split(
            page_text, rects, lambda char: char == "\n"
        ):
            words = [
                PdfWordBox(word, word_rects)
                for (word, word_rects) in custom_split(
                    line, line_rects, lambda char: char.isspace()
                )
            ]
            self.__boxes.append(PdfLineBox(words, line_rects))
        return self.__boxes