# ======================================================================
# ====================================
# README file for Page Cropping component
# ====================================
# Filename : ocrd-anyBaseOCR-pagecropping.py
# Author: Syed Saqib Bukhari, Mohammad Mohsin Reza, Md. Ajraf Rakib
# Responsible: Syed Saqib Bukhari, Mohammad Mohsin Reza, Md. Ajraf Rakib
# Contact Email: Saqib.Bukhari@dfki.de, Mohammad_mohsin.reza@dfki.de, Md_ajraf.rakib@dfki.de
# Note:
# 1) This work has been done at DFKI, Kaiserslautern, Germany.
# 2) The parameter values are read from the ocrd-anyBaseOCR-parameter.json file. The values can be changed in that file.
# 3) The command line IO usage is based on "OCR-D" project guidelines (https://ocr-d.github.io/). A sample image file (samples/becker_quaestio_1586_00013.tif) and mets.xml (mets.xml) are provided. The sequence of operations is: binarization, deskewing, cropping and dewarping (or can also be: binarization, dewarping, deskewing, and cropping; depends upon use-case).
# *********** Method Behaviour ********************
# This function takes a document image as input and crops/selects the page content
# area only (that is, it removes textual noise as well as any other noise around the page content area)
# *********** Method Behaviour ********************
# *********** LICENSE ********************
# Copyright 2018 Syed Saqib Bukhari, Mohammad Mohsin Reza, Md. Ajraf Rakib
# Apache License 2.0
# A permissive license whose main conditions require preservation of copyright
# and license notices. Contributors provide an express grant of patent rights.
# Licensed works, modifications, and larger works may be distributed under
# different terms and without source code.
# *********** LICENSE ********************
# ======================================================================
import os

import numpy as np
from pylsd.lsd import lsd
import ocrolib
import cv2
from PIL import Image

from ..constants import OCRD_TOOL

from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_utils import (
    getLogger,
    crop_image,
    concat_padded,
    MIMETYPE_PAGE,
    coordinates_for_segment,
    points_from_polygon,
    # Needed by process() to report an already existing Border.
    bbox_from_points,
)
from ocrd_models.ocrd_page import (
    CoordsType,
    AlternativeImageType,
    to_xml,
    MetadataItemType,
    LabelsType,
    LabelType,
)
from ocrd_models.ocrd_page_generateds import BorderType

TOOL = 'ocrd-anybaseocr-crop'
LOG = getLogger('OcrdAnybaseocrCropper')
# Image file group used when the output file group names only a PAGE group.
FALLBACK_IMAGE_GRP = 'OCR-D-IMG-CROP'


class OcrdAnybaseocrCropper(Processor):
    """OCR-D processor that detects the page content area and stores it as Border.

    For every input file, ``process`` annotates a ``Border`` on the PAGE
    document and saves the cropped image as an ``AlternativeImage``.
    """

    def __init__(self, *args, **kwargs):
        """Register the tool description and version, then defer to ``Processor``."""
        kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
        kwargs['version'] = OCRD_TOOL['version']
        super(OcrdAnybaseocrCropper, self).__init__(*args, **kwargs)

    def write_crop_coordinate(self, base, coordinate):
        """Write a crop box to ``<base>-frame-pf.dat``.

        ``coordinate`` is ``(x1, y1, x2, y2)``; the file receives the
        tab-separated values ``x1``, ``y1``, ``x2-x1`` and ``y2-y1``.
        """
        x1, y1, x2, y2 = coordinate
        with open(base + '-frame-pf.dat', 'w') as fp:
            fp.write(str(x1)+"\t"+str(y1)+"\t"+str(x2-x1)+"\t"+str(y2-y1))

    def rotate_image(self, orientation, image):
        """Return ``image.rotate(orientation)``."""
        return image.rotate(orientation)

    def remove_rular(self, arg):
        """Paint over the most likely ruler region of ``arg`` with white.

        ``arg`` is a 3-channel image array; it is modified in place and also
        returned. Candidate regions are bounding boxes of contours whose size,
        position and aspect ratio satisfy the ``*Rular*`` / ``position*``
        entries of ``self.parameter``. Among the candidates, the one with the
        most zero-valued entries is erased.
        """
        gray = cv2.cvtColor(arg, cv2.COLOR_BGR2GRAY)
        # [-2:] keeps (contours, hierarchy) for both the 2-tuple (OpenCV 2/4)
        # and the 3-tuple (OpenCV 3) return conventions of findContours.
        contours, _ = cv2.findContours(
            gray, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]

        height, width, _ = arg.shape
        imgArea = height*width

        # Bounding box (x, y, w, h) of each contour, largest area first.
        rects = [cv2.boundingRect(cnt) for cnt in contours]
        rects = sorted(rects, key=lambda x: (x[2]*x[3]), reverse=True)

        # Keep only rectangles whose area lies strictly between the configured
        # minimum and maximum fractions of the image area.
        rects = [r for r in rects if (
            imgArea*self.parameter['maxRularArea']) > (r[2]*r[3]) > (imgArea*self.parameter['minRularArea'])]

        # Detect child rectangles (strictly contained in a larger one).
        # Usually those are not rulers; rulers sit at one side of the page.
        removeRect = []
        for i, rect1 in enumerate(rects):
            (x1, y1, w1, h1) = rect1
            for rect2 in rects[i+1:]:
                (x2, y2, w2, h2) = rect2
                if (x1 < x2) and (y1 < y2) and (x1+w1 > x2+w2) and (y1+h1 > y2+h2):
                    removeRect.append(rect2)

        # Drop the child rectangles.
        rects = [x for x in rects if x not in removeRect]

        predictRular = []
        for rect in rects:
            (x, y, w, h) = rect
            narrow = w < width*self.parameter['rularWidth']
            at_border = ((y > height*self.parameter['positionBelow'])
                         or ((x+w) < width*self.parameter['positionLeft'])
                         or (x > width*self.parameter['positionRight']))
            if narrow and at_border:
                ratio_wh = round(float(w)/float(h), 2)
                ratio_hw = round(float(h)/float(w), 2)
                if (self.parameter['rularRatioMin'] < ratio_wh < self.parameter['rularRatioMax']) \
                        or (self.parameter['rularRatioMin'] < ratio_hw < self.parameter['rularRatioMax']):
                    blackPixel = np.count_nonzero(arg[y:y+h, x:x+w] == 0)
                    predictRular.append((x, y, w, h, blackPixel))

        # Finally use the number of black pixels to avoid a false ruler:
        # only the candidate with the highest count is erased.
        if predictRular:
            predictRular = sorted(
                predictRular, key=lambda x: (x[4]), reverse=True)
            x, y, w, h, _ = predictRular[0]
            cv2.rectangle(arg, (x-15, y-15), (x+w+20, y+h+20),
                          (255, 255, 255), cv2.FILLED)

        return arg

    def BorderLine(self, MaxBoundary, lines, index, flag, lineDetectH, lineDetectV):
        """Look for a run of closely spaced lines on one side and record it.

        ``lines`` is a list of ``[x1, y1, x2, y2]`` entries (sorted by the
        coordinate at ``index``). Consecutive entries count as a run while
        their ``index`` coordinates differ by at most 15 and lie before
        (``'top'``/``'left'``) or beyond (``'bottom'``/``'right'``)
        ``MaxBoundary``. If the run counter reaches 3, the last line of the
        run is appended, axis-aligned, to ``lineDetectH`` (top/bottom) or
        ``lineDetectV`` (left/right). Nothing is returned; the two output
        lists are mutated.
        """
        getLine = 1
        LastLine = []
        if flag in ('top', 'left'):
            for i in range(len(lines)-1):
                if abs(lines[i][index]-lines[i+1][index]) <= 15 and lines[i][index] < MaxBoundary:
                    LastLine = [lines[i][0], lines[i][1],
                                lines[i][2], lines[i][3]]
                    getLine += 1
                elif getLine >= 3:
                    break
                else:
                    getLine = 1
        elif flag in ('bottom', 'right'):
            # Scan from the far end so the run closest to that border wins.
            for i in reversed(range(len(lines)-1)):
                if abs(lines[i][index]-lines[i+1][index]) <= 15 and lines[i][index] > MaxBoundary:
                    LastLine = [lines[i][0], lines[i][1],
                                lines[i][2], lines[i][3]]
                    getLine += 1
                elif getLine >= 3:
                    break
                else:
                    getLine = 1

        if getLine >= 3 and LastLine:
            if flag == "top":
                lineDetectH.append((
                    LastLine[0], max(LastLine[1], LastLine[3]),
                    LastLine[2], max(LastLine[1], LastLine[3])
                ))
            if flag == "left":
                lineDetectV.append((
                    max(LastLine[0], LastLine[2]), LastLine[1],
                    max(LastLine[0], LastLine[2]), LastLine[3]
                ))
            if flag == "bottom":
                lineDetectH.append((
                    LastLine[0], min(LastLine[1], LastLine[3]),
                    LastLine[2], min(LastLine[1], LastLine[3])
                ))
            if flag == "right":
                lineDetectV.append((
                    min(LastLine[0], LastLine[2]), LastLine[1],
                    min(LastLine[0], LastLine[2]), LastLine[3]
                ))

    def get_intersect(self, a1, a2, b1, b2):
        """Return the intersection of line ``a1``-``a2`` with line ``b1``-``b2``.

        Each argument is an ``(x, y)`` point. The intersection is computed
        with cross products in homogeneous coordinates. When the third
        component is zero (no single intersection point), ``(0, 0)`` is
        returned.
        """
        s = np.vstack([a1, a2, b1, b2])        # s for stacked
        h = np.hstack((s, np.ones((4, 1))))    # h for homogeneous
        l1 = np.cross(h[0], h[1])              # first line
        l2 = np.cross(h[2], h[3])              # second line
        x, y, z = np.cross(l1, l2)
        if z == 0:
            return (0, 0)
        return (x/z, y/z)

    def detect_lines(self, arg):
        """Detect long line segments near the image borders.

        Runs ``lsd`` on the grayscale version of ``arg`` and keeps

        * segments with horizontal extent > 45 whose first point lies in the
          top or bottom quarter of the image, stretched to
          ``[0, y1, imgWidth, y2]``;
        * segments with vertical extent > 45 whose first point lies left of
          40% or right of 60% of the width, stretched to
          ``[x1, 0, x2, imgHeight]``.

        Returns ``(imgHeight, imgWidth, Hline, Vline)`` with ``Hline`` sorted
        by ``y1`` and ``Vline`` sorted by ``x1``.
        """
        Hline = []
        Vline = []
        gray = cv2.cvtColor(arg, cv2.COLOR_RGB2GRAY)
        imgHeight, imgWidth, _ = arg.shape
        lines = lsd(gray)

        for i in range(lines.shape[0]):
            pt1 = (int(lines[i, 0]), int(lines[i, 1]))
            pt2 = (int(lines[i, 2]), int(lines[i, 3]))
            # Consider only lines longer than this arbitrary value.
            if (abs(pt1[0]-pt2[0]) > 45) and ((pt1[1] < imgHeight*0.25) or (pt1[1] > imgHeight*0.75)):
                # make full horizontal line
                Hline.append([0, pt1[1], imgWidth, pt2[1]])
            if (abs(pt1[1]-pt2[1]) > 45) and ((pt1[0] < imgWidth*0.4) or (pt1[0] > imgWidth*0.6)):
                # make full vertical line
                Vline.append([pt1[0], 0, pt2[0], imgHeight])
        Hline.sort(key=lambda x: (x[1]), reverse=False)
        Vline.sort(key=lambda x: (x[0]), reverse=False)
        return imgHeight, imgWidth, Hline, Vline

    def select_borderLine(self, arg, lineDetectH, lineDetectV):
        """Derive a crop box from border lines detected in ``arg``.

        Border lines found by ``detect_lines``/``BorderLine`` are appended to
        ``lineDetectH`` and ``lineDetectV`` (both mutated). Every
        horizontal/vertical pair is intersected and the intersection points
        tighten the box, with a margin of 10 (left, right, top) or 15
        (bottom) pixels.

        Returns ``[Xstart, Ystart, Xend, Yend]``; without any intersection
        this is ``[0, 0, imgWidth, imgHeight]``.
        """
        imgHeight, imgWidth, Hlines, Vlines = self.detect_lines(arg)

        # top side
        self.BorderLine(imgHeight*0.25, Hlines, 1,
                        "top", lineDetectH, lineDetectV)
        # left side
        self.BorderLine(imgWidth*0.4, Vlines, 0,
                        "left", lineDetectH, lineDetectV)
        # bottom side
        self.BorderLine(imgHeight*0.75, Hlines, 1,
                        "bottom", lineDetectH, lineDetectV)
        # right side
        self.BorderLine(imgWidth*0.6, Vlines, 0,
                        "right", lineDetectH, lineDetectV)

        intersectPoint = []
        for l1 in lineDetectH:
            for l2 in lineDetectV:
                x, y = self.get_intersect(
                    (l1[0], l1[1]),
                    (l1[2], l1[3]),
                    (l2[0], l2[1]),
                    (l2[2], l2[3])
                )
                intersectPoint.append([x, y])

        Xstart = 0
        Xend = imgWidth
        Ystart = 0
        Yend = imgHeight
        for i in intersectPoint:
            Xs = int(i[0])+10 if i[0] < imgWidth*0.4 else 10
            if Xs > Xstart:
                Xstart = Xs
            Xe = int(i[0])-10 if i[0] > imgWidth*0.6 else int(imgWidth)-10
            if Xe < Xend:
                Xend = Xe
            Ys = int(i[1])+10 if i[1] < imgHeight*0.25 else 10
            if Ys > Ystart:
                Ystart = Ys
            Ye = int(i[1])-15 if i[1] > imgHeight*0.75 else int(imgHeight)-15
            if Ye < Yend:
                Yend = Ye

        # Negative ends are replaced by the margin values.
        if Xend < 0:
            Xend = 10
        if Yend < 0:
            Yend = 15
        return [Xstart, Ystart, Xend, Yend]

    def filter_noisebox(self, textarea, height, width):
        """Repeatedly drop small isolated boxes at the top and bottom.

        ``textarea`` is a list of ``[x1, y1, x2, y2]`` boxes. After sorting
        by ``y2``, the first (resp. last) box is removed when it is more than
        100 pixels away from its neighbour and covers less than 0.1% of the
        ``height*width`` area. This repeats until nothing is removed or
        fewer than two boxes remain. Returns the remaining boxes.
        """
        tmp = []
        st = True
        while st:
            textarea = [list(x) for x in textarea if x not in tmp]
            if len(textarea) > 1:
                tmp = []
                textarea = sorted(
                    textarea, key=lambda x: (x[3]), reverse=False)

                # Topmost box versus its successor.
                x11, y11, x12, y12 = textarea[0]
                x21, y21, x22, y22 = textarea[1]
                if abs(y12-y21) > 100 and (float(abs(x12-x11)*abs(y12-y11))/(height*width)) < 0.001:
                    tmp.append(textarea[0])

                # Bottommost box versus its predecessor.
                x11, y11, x12, y12 = textarea[-2]
                x21, y21, x22, y22 = textarea[-1]
                if abs(y12-y21) > 100 and (float(abs(x21-x22)*abs(y22-y21))/(height*width)) < 0.001:
                    tmp.append(textarea[-1])

                if len(tmp) == 0:
                    st = False
            else:
                break
        return textarea

    def detect_textarea(self, arg):
        """Find candidate text boxes in ``arg`` and draw them onto it.

        The grayscale image is passed through a morphological gradient, Otsu
        thresholding and a horizontal closing; external contours whose filled
        ratio exceeds 0.45 and whose bounding box satisfies
        ``15 < w < 0.9*width`` and ``15 < h < 0.5*height`` are kept as
        ``[x, y, x+w-1, y+h-1]``. Kept boxes are drawn on ``arg`` in place.
        With more than one box, ``filter_noisebox`` is applied.

        Returns ``(textarea, arg, height, width)``.
        """
        textarea = []
        small = cv2.cvtColor(arg, cv2.COLOR_RGB2GRAY)
        height, width, _ = arg.shape

        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
        grad = cv2.morphologyEx(small, cv2.MORPH_GRADIENT, kernel)

        _, bw = cv2.threshold(
            grad, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

        kernel = cv2.getStructuringElement(
            cv2.MORPH_RECT, (10, 1))  # for historical docs
        connected = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
        # [-2:] keeps (contours, hierarchy) for both the 2-tuple (OpenCV 2/4)
        # and the 3-tuple (OpenCV 3) return conventions of findContours.
        contours, _ = cv2.findContours(
            connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2:]

        mask = np.zeros(bw.shape, dtype=np.uint8)

        for idx, contour in enumerate(contours):
            x, y, w, h = cv2.boundingRect(contour)
            # Clear the box before filling this contour so the ratio below
            # only reflects the current contour.
            mask[y:y+h, x:x+w] = 0
            cv2.drawContours(mask, contours, idx, (255, 255, 255), -1)
            r = float(cv2.countNonZero(mask[y:y+h, x:x+w])) / (w * h)

            if r > 0.45 and (width*0.9) > w > 15 and (height*0.5) > h > 15:
                textarea.append([x, y, x+w-1, y+h-1])
                cv2.rectangle(arg, (x, y), (x+w-1, y+h-1), (0, 0, 255), 2)

        if len(textarea) > 1:
            textarea = self.filter_noisebox(textarea, height, width)

        return textarea, arg, height, width

    def save_pf(self, base, textarea):
        """Crop ``<base>.pf.png`` to ``textarea`` in place and log the box.

        ``textarea`` is ``(x1, y1, x2, y2)``. The cropped image overwrites
        ``<base>.pf.png``; the box is also written via
        ``write_crop_coordinate``.
        """
        x1, y1, x2, y2 = textarea
        img = Image.open(base+'.pf.png')
        img2 = img.crop((x1, y1, x2, y2))
        img2.save(base + '.pf.png')
        self.write_crop_coordinate(base, textarea)

    def filter_area(self, textarea, binImg):
        """Return the boxes covering more than ``minArea`` of the image.

        ``binImg`` must be a 3-dimensional array (only its shape is used);
        ``self.parameter['minArea']`` is the minimum area as a fraction of
        ``height*width``.
        """
        height, width, _ = binImg.shape
        min_area = height*width*self.parameter['minArea']
        return [area for area in textarea
                if min_area < (abs(area[2]-area[0]) * abs(area[3]-area[1]))]

    def marge_columns(self, textarea, colSeparator):
        """Merge horizontally adjacent boxes (columns) into one box.

        ``textarea`` is sorted in place by ``x1``. Neighbouring boxes whose
        horizontal gap is at most ``colSeparator`` are folded into a single
        accumulated bounding box; the others are kept as they are.

        Returns the unmerged boxes followed by the merged box (if any).
        """
        tmp = []
        marge = []
        textarea.sort(key=lambda x: (x[0]))
        # Initialised up front so that a single-element list (loop body never
        # runs) does not raise NameError below.
        st = False
        for i in range(len(textarea)-1):
            st = False
            x11, y11, x12, y12 = textarea[i]
            x21, y21, x22, y22 = textarea[i+1]
            if x21-x12 <= colSeparator:
                if len(marge) > 0:
                    # Continue growing the box accumulated so far.
                    x31, y31, x32, y32 = marge.pop(0)
                else:
                    x31, y31, x32, y32 = [9999, 9999, 0, 0]
                marge.append([min(x11, x21, x31), min(y11, y21, y31),
                              max(x12, x22, x32), max(y12, y22, y32)])
                st = True
            else:
                tmp.append(textarea[i])
        # The last box is only kept separately if it was not merged.
        if not st:
            tmp.append(textarea[-1])

        return tmp+marge

    def crop_area(self, textarea, binImg, rgb, colSeparator):
        """Group text boxes into page areas, largest area first.

        Boxes whose x-ranges overlap are merged into their common bounding
        box (drawn onto ``rgb`` in place). The resulting areas are filtered
        with ``filter_area``, merged across columns with ``marge_columns``
        and finally sorted by area in descending order.

        Returns the list of ``[x1, y1, x2, y2]`` areas (possibly empty).
        """
        textarea = np.unique(textarea, axis=0)
        i = 0
        tmp = []
        areas = []
        while i < len(textarea):
            # Drop the boxes consumed in the previous round.
            textarea = [list(x) for x in textarea if x not in tmp]
            tmp = []
            if len(textarea) == 0:
                break
            maxBox = textarea[0]
            for chkBox in textarea:
                if maxBox != chkBox:
                    x11, y11, x12, y12 = maxBox
                    x21, y21, x22, y22 = chkBox
                    if ((x11 <= x21 <= x12) or (x21 <= x11 <= x22)):
                        tmp.append(maxBox)
                        tmp.append(chkBox)
                        maxBox = [min(x11, x21), min(y11, y21),
                                  max(x12, x22), max(y12, y22)]
            if len(tmp) == 0:
                tmp.append(maxBox)
            x1, y1, x2, y2 = maxBox
            areas.append(maxBox)
            cv2.rectangle(rgb, (x1, y1), (x2, y2), (255, 0, 0), 2)
            i = i+1

        textarea = np.unique(areas, axis=0).tolist()
        if len(textarea) > 0:
            textarea = self.filter_area(textarea, binImg)
        if len(textarea) > 1:
            textarea = self.marge_columns(textarea, colSeparator)

        if len(textarea) > 0:
            textarea = sorted(textarea, key=lambda x: (
                (x[2]-x[0])*(x[3]-x[1])), reverse=True)
            # NOTE(review): the original computed a padded copy of
            # textarea[0] (20px horizontally, 40px vertically) here for a
            # save_pf() call that is commented out; the padded values were
            # never used, so they are no longer computed.

        return textarea

    def process(self):
        """Performs border detection on the workspace.

        The output file group may be ``PAGE_GRP,IMAGE_GRP``; if only one
        group is given, images go to ``FALLBACK_IMAGE_GRP``. For every input
        file the PAGE document is loaded, a processing-step metadata item is
        added, the page image is handed to ``_process_segment`` and the
        resulting PAGE XML is added to the workspace. If
        ``operation_level`` is not ``"page"``, a warning is logged and
        processing stops.
        """
        LOG.info("OUTPUT FILE %s", self.output_file_grp)
        try:
            page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)

            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(  # externalRef="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page = pcgts.get_Page()
            # should also be deskewed
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped', feature_selector='binarized')
            if oplevel == "page":
                self._process_segment(
                    page_image, page, page_xywh, page_id, input_file, n)
            else:
                LOG.warning(
                    'Operation level %s, but should be "page".', oplevel)
                break
            file_id = input_file.ID.replace(
                self.input_file_grp, page_grp)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            if file_id == input_file.ID:
                file_id = concat_padded(page_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=page_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(page_grp, file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'),
                force=self.parameter['force']
            )

    def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n):
        """Compute the crop box for one page and annotate it.

        The box comes from the detected text areas when available and
        otherwise from ``select_borderLine``. It is stored as ``Border`` on
        ``page``; the cropped image is saved into ``self.image_grp`` and
        referenced as ``AlternativeImage``. ``page_xywh['features']`` gets
        ``',cropped'`` appended.
        """
        img_array = ocrolib.pil2array(page_image)

        # Check if image is RGB or not  # FIXME: check not needed anymore?
        if len(img_array.shape) == 2:
            img_array = np.stack((img_array,)*3, axis=-1)

        img_array_bin = np.array(
            img_array > ocrolib.midrange(img_array), 'i')

        lineDetectH = []
        lineDetectV = []
        img_array_rr = self.remove_rular(img_array)

        textarea, img_array_rr_ta, height, width = self.detect_textarea(
            img_array_rr)
        colSeparator = int(
            width * self.parameter['colSeparator'])
        if len(textarea) > 1:
            textarea = self.crop_area(
                textarea, img_array_bin, img_array_rr_ta, colSeparator)

            if len(textarea) == 0:
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)
            else:
                min_x, min_y, max_x, max_y = textarea[0]
        elif len(textarea) == 1 and (height*width*0.5 < (abs(textarea[0][2]-textarea[0][0]) * abs(textarea[0][3]-textarea[0][1]))):
            # NOTE(review): the original computed a padded box here (20px
            # horizontally, 40px vertically) but then used the unpadded
            # textarea[0]; the unused padding is no longer computed and the
            # resulting box is unchanged. Confirm whether padding was intended.
            min_x, min_y, max_x, max_y = textarea[0]
        else:
            min_x, min_y, max_x, max_y = self.select_borderLine(
                img_array_rr, lineDetectH, lineDetectV)

        border_polygon = [[min_x, min_y], [max_x, min_y],
                          [max_x, max_y], [min_x, max_y]]
        border_polygon = coordinates_for_segment(
            border_polygon, page_image, page_xywh)
        border_points = points_from_polygon(border_polygon)
        brd = BorderType(Coords=CoordsType(border_points))
        page.set_Border(brd)

        page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y))
        page_xywh['features'] += ',cropped'

        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)

        file_path = self.workspace.save_image_file(page_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp,
                                                   force=self.parameter['force'])
        page.add_AlternativeImage(AlternativeImageType(
            filename=file_path, comments=page_xywh['features']))