"""Batch OCR of a directory of images and PDFs using the tesseract CLI.

PDF pages are rendered to temporary ``saram_*.png`` files with Wand, every
supported image is de-rotated according to tesseract's orientation
detection, and the recognised text is written to an ``OCR-text``
sub-directory of the input directory.
"""
import os
import subprocess
import sys
import time

import PIL.Image as Im
import pyocr
import pyocr.builders
from PIL import Image as Im
from wand.image import Image

# File extensions (lower case, with leading dot) that main() will process.
VALIDITY = [".jpg", ".gif", ".png", ".tga", ".tif", ".bmp", ".pdf"]

# Write-only handle on the null device (e.g. '/dev/null'); used to silence
# tesseract's stdout. Opened once at import time and kept for the process
# lifetime.
FNULL = open(os.devnull, 'w')

path = ""


def _sanitized_stem(name):
    """Return the base name of *name* without extension, keeping only
    alphanumeric characters and '-'."""
    stem = os.path.splitext(name)[0]
    return ''.join(ch for ch in stem if ch.isalnum() or ch == '-')


class ArgumentMissingException(Exception):
    """Raised when the script is started with the wrong number of arguments.

    NOTE: constructing this exception prints the usage line and terminates
    the process with exit status 1, so it never actually propagates.
    """

    def __init__(self):
        print("usage: {} <dirname>".format(sys.argv[0]))
        sys.exit(1)


class saram(object):
    """OCR driver that converts every supported file of a directory to text."""

    def __init__(self, path):
        """Select the OCR tool and language reported by pyocr.

        Exits the process with status 1 when pyocr finds no OCR tool.
        *path* is accepted for interface compatibility; it is not stored.
        """
        ocr_language = 'eng'

        tools = pyocr.get_available_tools()
        if not tools:
            print("No OCR tool found")
            sys.exit(1)
        self.tool = tools[0]
        print("OCR tool: %s" % self.tool)

        # Fallback so the attribute always exists, even if the language
        # query below fails.
        self.lang = ocr_language
        try:
            langs = self.tool.get_available_languages()
            self.lang = ocr_language if ocr_language in langs else langs[0]
            print("OCR selected language: %s (available: %s)"
                  % (self.lang.upper(), ", ".join(langs)))
        except Exception as e:
            # Best effort: language discovery is informational only, so a
            # failure is reported and the fallback language is kept.
            print("{}".format(e))

    def create_directory(self, path):
        """Create directory *path* (including parents) if it does not exist."""
        if not os.path.exists(path):
            os.makedirs(path)

    def pdf_run(self, image_file_name, filename, path):
        """Render each page of a PDF to a grayscale PNG inside *path*.

        image_file_name -- full path of the PDF to read.
        filename        -- sanitised base name used to build output names.
        path            -- directory receiving ``saram_<filename><page>.png``.
        """
        with Image(filename=image_file_name, resolution=300) as image_pdf, \
                image_pdf.convert("png") as image_page:
            process_start = time.time()
            # NOTE(review): this makes the target directory world-writable;
            # kept from the original behaviour, but done once instead of
            # once per page.
            os.chmod(path, 0o777)
            for page, img in enumerate(image_page.sequence, start=1):
                # Start the clock before the page is processed so the
                # reported duration is the real per-page time.
                page_start = time.time()
                with Image(image=img) as img_per_page:
                    img_per_page.type = 'grayscale'
                    img_per_page.depth = 8
                    img_per_page.density = 300
                    try:
                        img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                           channel=None)
                    except AttributeError as e:
                        print("Update Wand library: %s" % e)
                    img_buf = os.path.join(
                        path, "saram_" + filename + str(page) + ".png")
                    img_per_page.save(filename=img_buf)
                    page_size = img_per_page.size
                page_elaboration = time.time() - page_start
                print("page %s - size %s - process %2d sec."
                      % (page, page_size, page_elaboration))
                img.destroy()
            process_end = time.time() - process_start
        print("Total elaboration time: %s" % process_end)

    def get_rotation_info(self, filename):
        """Ask tesseract for the orientation of an image.

        Every line of tesseract's output is echoed. Returns the negated
        value of the last 'Orientation in degrees:' line as a float, or
        None when no such line is found or tesseract cannot be started.
        """
        info = 'Orientation in degrees: '
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact.
        # NOTE(review): newer tesseract releases spell this option '--psm';
        # the original spelling is kept - confirm against the installed
        # version.
        command = ['tesseract', filename, '-', '-psm', '0']
        try:
            completed = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,  # stderr is merged into stdout
                universal_newlines=True,
                errors='replace',
            )
        except OSError as err:
            print("Unable to run tesseract: %s" % err)
            return None

        degrees = None
        for line in completed.stdout.splitlines():
            print(line)
            if info in line:
                try:
                    degrees = -float(line.split(info, 1)[1].strip())
                except ValueError:
                    # Not a number after the marker; ignore this line.
                    continue
        return degrees

    def fix_dpi_and_rotation(self, filename, degrees, ext):
        """Rotate the image at *filename* by *degrees* and overwrite it.

        *ext* is accepted for interface compatibility and is not used.
        """
        with Im.open(filename) as im1:
            print('Fixing rotation %.2f in %s...' % (degrees, filename))
            # expand=True enlarges the canvas so quarter-turn rotations of
            # non-square images are not cropped.
            rotated = im1.rotate(degrees, expand=True)
        rotated.save(filename)

    def _remove_temp_pages(self, path):
        """Delete every entry of *path* whose name starts with 'saram_'."""
        for f in os.listdir(path):
            if f.startswith("saram_"):
                os.remove(os.path.join(path, f))

    def main(self, path):
        """OCR every supported file found directly inside *path*.

        Text files are written to ``<path>/OCR-text/``. Progress and a final
        summary are printed; nothing is returned.
        """
        if not os.path.isdir(path):
            print("No directory : " + format(path))
            return

        directory_path = os.path.join(path, 'OCR-text')
        count = 0
        other_files = 0

        try:
            # Pass 1: render PDF pages to temporary saram_*.png images.
            for f in os.listdir(path):
                if os.path.splitext(f)[1].lower() == ".pdf":
                    self.pdf_run(os.path.join(path, f),
                                 _sanitized_stem(f), path)

            # Pass 2: fix orientation and run OCR. The directory is listed
            # again so the pages produced by pass 1 are included.
            for f in os.listdir(path):
                ext = os.path.splitext(f)[1]
                if ext.lower() not in VALIDITY:
                    other_files += 1
                    continue
                if count == 0:
                    self.create_directory(directory_path)
                count += 1
                if ext.lower() == ".pdf":
                    # PDFs are counted, but OCR runs on their rendered pages.
                    continue

                image_file_name = os.path.join(path, f)
                text_file_path = os.path.join(directory_path,
                                              _sanitized_stem(f))
                # Orientation is detected once per image; each detection
                # is a full tesseract run.
                degrees = self.get_rotation_info(image_file_name)
                if degrees:
                    self.fix_dpi_and_rotation(image_file_name, degrees, ext)
                subprocess.call(["tesseract", image_file_name, text_file_path],
                                stdout=FNULL)
                print(str(count) + (" file" if count == 1 else " files")
                      + " processed")
        finally:
            # Remove the temporary pages even when a step above failed.
            self._remove_temp_pages(path)

        if count + other_files == 0:
            print("No files found")
        else:
            print(str(count) + " / " + str(count + other_files)
                  + " files converted")


def start():
    """Command-line entry point: ``<script> <dirname>``.

    Prints the usage line and exits with status 1 unless exactly one
    argument is given.
    """
    if len(sys.argv) != 2:
        raise ArgumentMissingException
    path = os.path.abspath(sys.argv[1])
    s = saram(path)
    s.main(path)