Python pyocr.builders() Examples
The following are 7 code examples of pyocr.builders().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pyocr, or try the search function.

Example #1
Source Project: PDFtoTXT Author: lucab85 File: LocalOCR.py License: MIT License | 6 votes |
def image2txt_pyocr(self, image, do_orientation):
    """Run OCR on one encoded image and return the recognised text.

    Args:
        image: Encoded image bytes; wrapped in ``io.BytesIO`` and opened
            with ``PI.open``.
        do_orientation: When exactly ``True``, try to detect the page
            orientation and rotate the image before recognition.

    Returns:
        The text produced by ``self.tool.image_to_string``, or ``""`` if a
        ``pyocr.error.TesseractError`` is raised (the error is printed).
    """
    txt = ""
    orientation = ""
    img_per_page = PI.open(io.BytesIO(image))

    if do_orientation is True:
        try:
            if self.tool.can_detect_orientation():
                orientation = self.tool.detect_orientation(
                    img_per_page, lang=self.lang
                )
                angle = orientation["angle"]
                if angle != 0:
                    # Assuming PI is Pillow's Image module: rotate() returns
                    # a NEW image and leaves the receiver untouched, so the
                    # result must be rebound or the rotation is lost.
                    # NOTE(review): rotation direction and whether
                    # expand=True is wanted are unchanged from the original
                    # call -- confirm against real scans.
                    img_per_page = img_per_page.rotate(angle)
        except pyocr.PyocrException as exc:
            # Orientation detection is best-effort; OCR still runs below.
            print("Orientation detection failed: {}".format(exc))
        print("Orientation: {}".format(orientation))

    try:
        txt = self.tool.image_to_string(
            img_per_page,
            lang=self.lang,
            builder=pyocr.builders.TextBuilder()
        )
    except pyocr.error.TesseractError as e:
        print("{}".format(e))
    return txt
Example #2
Source Project: PAN_OCR Author: 008karan File: tesseract_ocr.py License: MIT License | 5 votes |
def ocr_one_image(self, area, image, threadList=-1, threadNum=None):
    """Recognise the text of one image and print it alongside ``area``.

    The image is passed to ``self.tool.image_to_string`` with the first
    entry of ``self.langs`` and a plain ``TextBuilder``. When ``threadList``
    is anything other than the default ``-1``, the text is also stored at
    ``threadList[threadNum]``.

    Returns the recognised text.
    """
    print("Starting image...")

    recognise = self.tool.image_to_string
    language = self.langs[0]
    text_builder = pyocr.builders.TextBuilder()
    txt = recognise(image, lang=language, builder=text_builder)

    report = "==RESULT==" + str(area) + "\n" + txt + "\n=========================="
    print(report)

    # -1 is the "no shared result list" sentinel.
    wants_collection = threadList != -1
    if wants_collection:
        threadList[threadNum] = txt
    return txt
Example #3
Source Project: paperwork-backend Author: openpaperwork File: page.py License: GNU General Public License v3.0 | 5 votes |
def __get_boxes(self):
    """
    Return the boxes stored in this page's box file.

    The file is first parsed as line boxes. If that yields nothing, it is
    parsed again as word boxes (old format) and the words are wrapped in a
    single line box. An empty list is returned when no boxes are found or
    when the file cannot be read (the IOError is logged).
    """
    box_path = self.__box_path

    def parse_with(builder):
        # Open the box file and let the given builder parse it.
        with self.fs.open(box_path, 'r') as fd:
            return builder.read_file(fd)

    try:
        line_boxes = parse_with(pyocr.builders.LineBoxBuilder())
        if line_boxes:
            return line_boxes

        # fallback: old format: word boxes
        # shouldn't be used anymore ...
        word_boxes = parse_with(pyocr.builders.WordBoxBuilder())
        if not word_boxes:
            return []

        logger.warning("WARNING: Doc %s uses old box format" % str(self.doc))
        first_position = word_boxes[0].position
        return [pyocr.builders.LineBox(word_boxes, first_position)]
    except IOError as exc:
        logger.error(
            "Unable to get boxes for '%s': %s" % (self.doc.docid, exc)
        )
        return []
Example #4
Source Project: paperwork-backend Author: openpaperwork File: page.py License: GNU General Public License v3.0 | 5 votes |
def __set_boxes(self, boxes):
    """
    Write ``boxes`` to this page's box file using a pyocr LineBoxBuilder.

    The file at ``self.__box_path`` is opened in 'w' mode through
    ``self.fs``.
    """
    target = self.__box_path
    with self.fs.open(target, "w") as out:
        writer = pyocr.builders.LineBoxBuilder()
        writer.write_file(out, boxes)
Example #5
Source Project: paperwork-backend Author: openpaperwork File: page.py License: GNU General Public License v3.0 | 5 votes |
def __set_boxes(self, boxes):
    """
    Write ``boxes`` to this page's box file using a pyocr LineBoxBuilder.

    The destination comes from ``self.__get_box_path()`` and is opened in
    'w' mode through ``self.fs``.
    """
    destination = self.__get_box_path()
    with self.fs.open(destination, "w") as sink:
        line_writer = pyocr.builders.LineBoxBuilder()
        line_writer.write_file(sink, boxes)
Example #6
Source Project: OCR-Manga Author: klaxa File: Reader.py License: GNU Affero General Public License v3.0 | 5 votes |
def image_to_dict(self, image):
    """OCR an image as Japanese text and return a wrapped lookup result.

    The image is upscaled 3x, recognised with a ``TextBuilder`` whose mode
    argument depends on the original aspect ratio, stripped of characters
    found in ``special_chars`` and passed to ``myougiden_api.run``.

    Returns the lookup result (or the recognised text when the lookup
    returns ``None``, or "Nothing recognized" when no text was found),
    wrapped to 120 columns with whitespace preserved.
    """
    bid = self.box_oid  # not used below; attribute read kept from the original

    width, height = image.size
    image = image.resize((width * 3, height * 3), Image.BICUBIC)

    # Builder mode: 5 by default, 10 for near-square images, 7 for wide ones.
    mode = 5
    near_square = width / height < 1.15 and height / width < 1.15
    if near_square:
        mode = 10
    if width > height * 1.5:
        mode = 7

    builder = pyocr.builders.TextBuilder(mode)
    raw_text = self.image_to_string(image, lang="jpn", builder=builder)
    text = "".join(ch for ch in raw_text.strip() if ch not in special_chars)

    self.draw("Looking up " + text)

    # Only query the dictionary when something was recognised.
    dict_entry = myougiden_api.run(text) if text != "" else None
    if dict_entry is not None:
        text = dict_entry.strip("\n")

    if text == "":
        text = "Nothing recognized"

    return textwrap.fill(
        text, 120, replace_whitespace=False, drop_whitespace=False
    )
Example #7
Source Project: paperwork-backend Author: openpaperwork File: page.py License: GNU General Public License v3.0 | 4 votes |
def __get_boxes(self):
    """
    Return the line boxes of this page, computing and caching them once.

    Order of preference: the cached value, then the box file (if it exists
    and can be read), then the text layout reported by the PDF page.
    """
    cached = self.__boxes
    if cached is not None:
        return cached

    # Check first if there is an OCR file available
    box_path = self.__get_box_path()
    if self.fs.exists(box_path):
        line_builder = pyocr.builders.LineBoxBuilder()
        try:
            with self.fs.open(box_path, 'r') as fd:
                self.__boxes = line_builder.read_file(fd)
        except IOError as exc:
            logger.error(
                "Unable to get boxes for '%s': %s" % (self.doc.docid, exc)
            )
            # will fall back on pdf boxes
        else:
            return self.__boxes

    # fall back on what libpoppler tells us
    page_text = self.pdf_page.get_text()
    self.__boxes = []

    layout_result = self.pdf_page.get_text_layout()
    if not layout_result[0]:
        # No layout available: the page has no boxes.
        return self.__boxes
    rects = layout_result[1]

    line_chunks = custom_split(page_text, rects, lambda ch: ch == "\n")
    for (line, line_rects) in line_chunks:
        word_chunks = custom_split(line, line_rects, lambda ch: ch.isspace())
        words = [
            PdfWordBox(word, word_rects)
            for (word, word_rects) in word_chunks
        ]
        # Append as we go so the cache grows line by line, as before.
        self.__boxes.append(PdfLineBox(words, line_rects))
    return self.__boxes