#!/usr/bin/python
# coding: utf-8
"""Extract text from a PDF file by rasterizing each page and running OCR.

Pages are rendered with Wand (ImageMagick), handed to the first OCR tool
that pyocr reports as available, and the recognized text is written to a
UTF-8 text file.
"""
import argparse
import io
import os
import re
import sys
import time

from wand.image import Image
from PIL import Image as PI
import pyocr
import pyocr.builders
from tesserocr import PyTessBaseAPI, PSM, RIL
import tesserocr


def _str_to_bool(value):
    """Convert a command-line token such as "true" or "0" into a bool.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognized
            boolean spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % value)


class LocalOCR(object):
    """Run OCR over the pages of a PDF using the first available pyocr tool."""

    def __init__(self, ocr_language):
        """Select the OCR tool and the recognition language.

        Exits the process with status 1 when pyocr reports no OCR tool.

        Args:
            ocr_language: preferred language code. When the tool does not
                list it, the first language the tool reports is used.
        """
        tools = pyocr.get_available_tools()
        if not tools:
            print("No OCR tool found")
            sys.exit(1)
        self.tool = tools[0]
        print("OCR tool: %s" % self.tool)

        # Start from the requested language so self.lang is always defined,
        # even when the language query below fails or returns nothing.
        self.lang = ocr_language
        try:
            langs = self.tool.get_available_languages()
            if langs and ocr_language not in langs:
                self.lang = langs[0]
            print("OCR selected language: %s (available: %s)"
                  % (self.lang.upper(), ", ".join(langs)))
        except Exception as e:
            # Best effort: report the problem and keep the requested language.
            print("{}".format(e))

    def process(self, pdf_filename, pdf_resolution, imageformat,
                do_orientation):
        """OCR every page of a PDF and return the concatenated text.

        Each page is converted to 8-bit grayscale, level-adjusted, saved to
        "buffer.png" in the current directory (overwritten for every page)
        and then passed to image2txt_pyocr.

        Args:
            pdf_filename: path of the PDF to read.
            pdf_resolution: resolution used when rasterizing the PDF.
            imageformat: image format used for the per-page blobs.
            do_orientation: whether to attempt orientation detection.

        Returns:
            The text of all pages, each page followed by a newline.
        """
        page_texts = []
        process_start = time.time()
        # Context managers release the ImageMagick resources even when a
        # page fails part-way through.
        with Image(filename=pdf_filename,
                   resolution=pdf_resolution) as image_pdf:
            with image_pdf.convert(imageformat) as image_page:
                for page, img in enumerate(image_page.sequence, start=1):
                    with Image(image=img) as img_per_page:
                        img_per_page.type = 'grayscale'
                        img_per_page.depth = 8
                        # NOTE(review): kept from the original; confirm that
                        # Wand actually honours a "density" attribute.
                        img_per_page.density = pdf_resolution
                        try:
                            img_per_page.level(black=0.3, white=1.0,
                                               gamma=1.5, channel=None)
                        except AttributeError as e:
                            print("Update Wand library: %s" % e)
                        img_per_page.save(filename="buffer.png")
                        page_start = time.time()
                        txt = self.image2txt_pyocr(
                            img_per_page.make_blob(imageformat),
                            do_orientation)
                        page_elaboration = time.time() - page_start
                        print("page %s - size %s - process %2d sec. "
                              "- text %s"
                              % (page, img_per_page.size, page_elaboration,
                                 len(txt)))
                    page_texts.append("%s\n" % txt)
                    # Free the page frame immediately to bound memory use.
                    img.destroy()
        process_end = time.time() - process_start
        print("Total elaboration time: %s" % process_end)
        return "".join(page_texts)

    def image2txt_pyocr(self, image, do_orientation):
        """Recognize the text contained in one encoded page image.

        Args:
            image: encoded image bytes for a single page.
            do_orientation: when true, ask the OCR tool for the page
                orientation and rotate the image before recognition.

        Returns:
            The recognized text, or "" when the OCR tool raises a
            TesseractError.
        """
        txt = ""
        orientation = ""
        img_per_page = PI.open(io.BytesIO(image))
        if do_orientation:
            try:
                if self.tool.can_detect_orientation():
                    orientation = self.tool.detect_orientation(
                        img_per_page, lang=self.lang)
                    angle = orientation["angle"]
                    if angle != 0:
                        # PIL's rotate() returns a new image; the result
                        # must be kept or the rotation is silently lost.
                        # expand=True keeps the whole page visible after a
                        # 90/270 degree turn.
                        # NOTE(review): confirm the sign convention of the
                        # detected angle matches PIL's counter-clockwise
                        # rotation.
                        img_per_page = img_per_page.rotate(angle,
                                                           expand=True)
            except pyocr.PyocrException as exc:
                print("Orientation detection failed: {}".format(exc))
            print("Orientation: {}".format(orientation))
        try:
            txt = self.tool.image_to_string(
                img_per_page,
                lang=self.lang,
                builder=pyocr.builders.TextBuilder()
            )
        except pyocr.error.TesseractError as e:
            print("{}".format(e))
        return txt


def main():
    """Parse the command line, OCR the PDF and write the text output."""
    parser = argparse.ArgumentParser(
        description='Process input PDF file to CSV by OCR')
    parser.add_argument('pdf_filename', nargs='?', default='INPUT.pdf',
                        help='Input PDF file')
    # Without an explicit type, a value given on the command line would
    # arrive as a string instead of a number.
    parser.add_argument('pdf_resolution', nargs='?', default=300, type=int,
                        help='Input PDF dpi resolution')
    parser.add_argument('ocr_language', nargs='?', default='ita',
                        help='OCR language')
    parser.add_argument('ocr_imageformat', nargs='?', default='png',
                        help='OCR image format')
    # Parse the flag explicitly; otherwise any supplied value is a string.
    parser.add_argument('ocr_do_orientation', nargs='?', default=True,
                        type=_str_to_bool, help='OCR do orientation test')
    parser.add_argument('text_output', nargs='?', default="output.txt",
                        help='OCR text output')
    args = parser.parse_args()

    if not args.pdf_filename:
        print('--filename is mandatory')
        sys.exit(1)

    p = LocalOCR(args.ocr_language)
    print("1. TEXT file \"%s\" not found - Process PDF file \"%s\""
          % (args.text_output, args.pdf_filename))
    output = p.process(args.pdf_filename, args.pdf_resolution,
                       args.ocr_imageformat, args.ocr_do_orientation)

    print("2 Writing TEXT output file \"%s\"" % args.text_output)
    # Encode once and write bytes: this works on both Python 2 and 3, and
    # the context manager closes the file even if the write fails.
    with open(args.text_output, "wb") as out_file:
        out_file.write(output.encode("utf-8"))


if __name__ == '__main__':
    main()