#!/usr/bin/env python
"""Experiments in extracting table grids and columns from aligned page scans.

``ActiveWeather`` finds the horizontal and vertical grid lines of a table
region, cuts the region into columns, thresholds each column and passes the
result to ``tesseract_font.ActiveTess`` for transcription.

NOTE: this is an interactive experiment script.  It targets Python 2
(``raw_input``), blocks on matplotlib windows / keyboard prompts and reads
and writes hard-coded paths under ``/home/ggdhines``.
"""
from __future__ import print_function

import csv
import glob
import random

import matplotlib
# matplotlib.use('WXAgg')
import cv2
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import cassandra
# imported explicitly so that ``cassandra.cluster.NoHostAvailable`` resolves
# even if no other module has imported the submodule yet
import cassandra.cluster
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn import datasets, linear_model
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.learning_curve import learning_curve

import database_connection
import tesseract_font

# todo - currently hard coded to work with only Bear 1940
# page used as the size reference for the whole set of aligned images
REFERENCE_IMAGE = "/home/ggdhines/Databases/old_weather/aligned_images/Bear/1940/Bear-AG-29-1940-0019.JPG"
# (min_x, max_x, min_y, max_y) of the table region, in full-page pixels
REGION_BOUNDS = (559, 3282, 1276, 2097)


def _find_contours(image, mode, method):
    """Return ``(contours, hierarchy)`` for a copy of ``image``.

    OpenCV 3 returns ``(image, contours, hierarchy)`` from
    ``cv2.findContours`` while OpenCV 2 and 4 return
    ``(contours, hierarchy)``; taking the last two items works with all of
    them.  The input is copied because older OpenCV versions modify it.
    """
    return cv2.findContours(image.copy(), mode, method)[-2:]


def _read_image(path, flags=None):
    """Read an image with ``cv2.imread`` and fail loudly if it is missing.

    :param path: file to read
    :param flags: optional ``cv2.imread`` flag (e.g. ``0`` for grayscale)
    :return: the decoded image
    :raises IOError: if ``cv2.imread`` returns ``None``
    """
    image = cv2.imread(path) if flags is None else cv2.imread(path, flags)
    if image is None:
        raise IOError("could not read image: " + str(path))
    return image


class ActiveWeather:
    def __init__(self):
        """Connect to the Cassandra db (if reachable) and create the classifier."""
        self.cass_db = None
        try:
            self.cass_db = database_connection.Database()
        except cassandra.cluster.NoHostAvailable:
            print("could not connect to the db - will recalculate all values from scratch")
        else:
            print("connected to the db")

        self.region = 0
        self.classifier = tesseract_font.ActiveTess()

    def __cross_validate__(self, pts):
        """
        plot the points, then run a 10-fold cross validation of a degree 5
        polynomial fit (y as a function of x) and print the mean of the
        per-fold mean squared errors
        :param pts: iterable of (x,y) pairs - not modified
        :return: the mean error over all folds
        :raises ValueError: if there are fewer than 10 points
        """
        num_folds = 10
        num_degrees = 5

        # work on a copy so that shuffling does not reorder the caller's data
        pts = list(pts)
        random.shuffle(pts)
        fold_size = len(pts) // num_folds
        if fold_size == 0:
            raise ValueError("need at least %d points for cross validation, got %d" % (num_folds, len(pts)))

        x, y = zip(*pts)
        # y is negated so the plot has the same orientation as the image
        plt.plot(x, [-i for i in y], "o")
        plt.show()

        fold_errors = []
        for fold in range(num_folds):
            start, stop = fold * fold_size, (fold + 1) * fold_size
            validation = pts[start:stop]
            training = pts[:start] + pts[stop:]

            train_x, train_y = zip(*training)
            coefficients = np.polyfit(train_x, train_y, num_degrees)

            val_x, val_y = zip(*validation)
            # predictions are truncated to whole pixels, as before
            predicted_y = np.polyval(coefficients, np.asarray(val_x, dtype=float)).astype(int)
            squared_error = (predicted_y - np.asarray(val_y)) ** 2
            fold_errors.append(squared_error.sum() / float(len(val_y)))

        mean_error = np.mean(fold_errors)
        print(mean_error)
        return mean_error

    def __interpolation__(self, shape, grid):
        """
        diagnostic only - extract the horizontal lines of the reference image
        inside the table region, show each sufficiently long one and
        cross validate a polynomial fit to its pixels
        :param shape: (height,width) of the full page
        :param grid: currently unused
        :return:
        """
        min_x, max_x, min_y, max_y = REGION_BOUNDS

        image = _read_image(REFERENCE_IMAGE, 0)
        horizontal_image = self.__sobel_image__(image, True)
        page_contours = self.__contour_extraction__(horizontal_image, True)

        template = np.zeros(shape, dtype=np.uint8)
        cv2.drawContours(template, page_contours, -1, 255, -1)
        zoomed_image = template[min_y:max_y + 1, min_x:max_x + 1]

        # no approximation - we want every pixel along each line
        region_contours = self.__contour_extraction__(zoomed_image, True, False)

        for cnt in region_contours:
            # skip blips which are too short to be a grid line
            if cv2.arcLength(cnt, True) <= 300:
                continue

            line_mask = np.zeros(zoomed_image.shape, dtype=np.uint8)
            cv2.drawContours(line_mask, [cnt], 0, 255, -1)
            plt.imshow(line_mask)
            plt.show()

            ys, xs = np.where(line_mask > 0)
            self.__cross_validate__(list(zip(xs, ys)))

        # NOTE(review): a cubic fit per line used to be drawn over this canvas but
        # that code was unreachable (it sat after an unconditional ``continue``)
        # and has been removed - the canvas is therefore always blank
        new_template = np.zeros((zoomed_image.shape[0], zoomed_image.shape[1], 3), dtype=np.uint8)
        plt.imshow(new_template)
        plt.show()

    def __directory_to_subjects__(self, directory, interpolation_only=False):
        """
        take directory of aligned images and convert them into column based subjects for upload to Panoptes
        every cell whose transcription confidence is below 80 is written out
        (with its border marked) and the user is prompted before continuing;
        finally a cumulative histogram of all confidences is shown
        :param directory: directory containing the aligned .JPG images
        :param interpolation_only: if True only run the grid line interpolation
        diagnostic (this used to run unconditionally and was followed by an
        ``assert False`` which stopped everything below from ever running)
        :return:
        """
        if not directory.endswith("/"):
            directory += "/"

        # todo - make this more robust
        region_bounds = REGION_BOUNDS

        # todo - read the grid in from the db when self.cass_db is available and
        # todo - save freshly calculated grids back to it. For now the grid is always recalculated
        horizontal_grid, vertical_grid = self.__get_grid_for_table__(directory, region_bounds)

        reference_image = _read_image(REFERENCE_IMAGE)
        ref_shape = reference_image.shape[:2]

        if interpolation_only:
            self.__interpolation__(ref_shape, horizontal_grid)
            return

        # todo - generalize to more than one region
        confidence_over_all_cells = []
        bad_count = 0

        for fname in glob.glob(directory + "*.JPG")[:15]:
            print(fname)
            files = self.__process_region__(fname, region_bounds, horizontal_grid, vertical_grid)

            for column_index, (init_threshold, pca_threshold) in enumerate(files):
                is_blank = self.classifier.__is_blank__(init_threshold)
                print(init_threshold)
                print(pca_threshold)
                if is_blank:
                    continue

                text, column_confidence, bb_boxes = self.classifier.__process_column__(pca_threshold)
                confidence_over_all_cells.extend(column_confidence)
                print("==---")
                print(text)
                print(column_confidence)

                text, column_confidence = self.__put_transcriptions_in_cells__(text, column_confidence, bb_boxes, horizontal_grid, vertical_grid, column_index, ref_shape)

                for row_index, confidence in enumerate(column_confidence):
                    try:
                        if (confidence is not None) and (confidence < 80):
                            border = np.asarray(self.__extract_cell_borders__(horizontal_grid, vertical_grid, row_index, column_index, ref_shape))
                            column = _read_image(pca_threshold)

                            # mark the cell border in red (BGR) - border is a list of (row,column) pairs
                            if border.size:
                                column[border[:, 0], border[:, 1]] = (0, 0, 255)

                            cv2.imwrite("/home/ggdhines/bad/bad_" + str(bad_count) + ".jpg", column)
                            print((bad_count, confidence, text[row_index]))
                            bad_count += 1
                            raw_input("enter something")
                    except cv2.error:
                        print("error - skipping")

        plt.hist(confidence_over_all_cells, 80, normed=1, histtype='step', cumulative=True)
        plt.show()

    def __put_transcriptions_in_cells__(self, text, confidence, bb_boxes, horizontal_grid, vertical_grid, column_index, reference_shape):
        """
        assign each transcribed piece of text to the cell (row) of the column
        which contains it
        :param text: transcribed strings, parallel to bb_boxes
        :param confidence: confidence values, parallel to bb_boxes
        :param bb_boxes: (top,bottom) pair for each piece of text
        :param horizontal_grid:
        :param vertical_grid:
        :param column_index:
        :param reference_shape: (height,width) of the full page
        :return: (column_text,column_confidence) - one entry per row. For a row
        with several pieces of text, the text is concatenated and the lowest
        confidence kept. Rows without text get None for both
        """
        # todo - number of rows is hard coded
        num_rows = 12

        column_confidence = []
        column_text = []

        for row_index in range(num_rows):
            cell = self.__extract_cell_borders__(horizontal_grid, vertical_grid, row_index, column_index, reference_shape)
            cell_rows = [pt[0] for pt in cell]
            cell_top = min(cell_rows)
            cell_bottom = max(cell_rows)

            current_text = None
            current_confidence = None

            for index, (top, bottom) in enumerate(bb_boxes):
                # NOTE(review): there used to be a second clause here,
                # (cell_bottom < bottom) and (cell_top > bottom), which can never be
                # true since cell_top <= cell_bottom. If the intent was "the box
                # contains the cell" that case is still not handled - confirm
                in_cell = (cell_bottom >= bottom) and (cell_top <= top)
                if not in_cell:
                    continue

                if current_text is None:
                    current_text = text[index]
                    current_confidence = confidence[index]
                else:
                    current_text += text[index]
                    current_confidence = min(current_confidence, confidence[index])

            column_confidence.append(current_confidence)
            column_text.append(current_text)

        return column_text, column_confidence

    def __extract_cell_borders__(self, horizontal_grid, vertical_grid, row_index, column_index, reference_shape, fname=None):
        """
        find the border pixels of one cell - the interior contour enclosed by
        the grid lines row_index, row_index+1, column_index and column_index+1
        :param horizontal_grid:
        :param vertical_grid:
        :param row_index:
        :param column_index:
        :param reference_shape: (height,width) of the full page
        :param fname: unused
        :return: list of (y,x) pairs. y is relative to the top of the first
        horizontal grid line, x to the left edge of the column's first grid line
        """
        mask = np.zeros(reference_shape, np.uint8)
        for grid, first_line in ((horizontal_grid, row_index), (vertical_grid, column_index)):
            cv2.drawContours(mask, grid, first_line, 255, -1)
            cv2.drawContours(mask, grid, first_line + 1, 255, -1)

        contours, hier = _find_contours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        # looking for the interior contour - i.e. one which has a parent
        interior_mask = np.zeros(reference_shape, np.uint8)
        if hier is not None:
            for c, h in zip(contours, hier[0]):
                if h[-1] == -1:
                    continue
                cv2.drawContours(interior_mask, [c], 0, 255, 1)

        border_y, border_x = np.where(interior_mask > 0)

        # now we need to normalize these values - relative to the region we are extracting them from
        _, min_y = np.min(horizontal_grid[0], axis=0)
        border_y -= min_y
        # now make the x values relative to the column we are extracting them from
        min_x, _ = np.min(vertical_grid[column_index], axis=0)
        border_x -= min_x

        return list(zip(border_y, border_x))

    def __sobel_image__(self, image, horizontal):
        """
        apply the sobel operator (second derivative) to a given image on either the vertical or horizontal axis,
        then otsu threshold the result and close small gaps along the lines
        basically copied from
        http://stackoverflow.com/questions/10196198/how-to-remove-convexity-defects-in-a-sudoku-square
        :param image: grayscale image
        :param horizontal: if True extract horizontal lines, otherwise vertical ones
        :return: binary image of the extracted lines
        """
        if horizontal:
            x_order, y_order, kernel_size = 0, 2, (10, 2)
        else:
            x_order, y_order, kernel_size = 2, 0, (2, 10)

        edges = cv2.Sobel(image, cv2.CV_16S, x_order, y_order)
        edges = cv2.convertScaleAbs(edges)
        cv2.normalize(edges, edges, 0, 255, cv2.NORM_MINMAX)
        _, binary = cv2.threshold(edges, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
        return cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    def __contour_extraction__(self, image, horizontal, approximation=True):
        """
        extract all the horizontal or vertical contours from an image
        strongly inspired by
        http://stackoverflow.com/questions/10196198/how-to-remove-convexity-defects-in-a-sudoku-square
        :param image: binary image
        :param horizontal: if True keep the wide contours, otherwise the tall ones
        :param approximation: if True compress the contours (CHAIN_APPROX_SIMPLE),
        otherwise keep every point (CHAIN_APPROX_NONE)
        :return: list of contours
        """
        method = cv2.CHAIN_APPROX_SIMPLE if approximation else cv2.CHAIN_APPROX_NONE
        contours, _ = _find_contours(image, cv2.RETR_EXTERNAL, method)

        contours_to_return = []
        for cnt in contours:
            _, _, w, h = cv2.boundingRect(cnt)
            # explicit integer division - this is what "/" did under python 2
            long_side, short_side = (w, h) if horizontal else (h, w)
            if long_side // short_side > 5:
                contours_to_return.append(cnt)

        return contours_to_return

    def __get_contour_lines_over_image__(self, directory, horizontal, fname=None):
        """
        return the contours lines for a subject set of already aligned subjects
        if horizontal, return only the horizontal contours. Otherwise, return the vertical contours
        returns the contours over all the image - we still have to trim to the specific region
        :param directory: directory (with trailing slash) containing the aligned .JPG images
        :param horizontal:
        :param fname: image to start from - defaults to the first image (in sorted order)
        :return: list of contours
        :raises ValueError: if the directory has no .JPG images or fname is not one of them
        """
        files = sorted(glob.glob(directory + "*.JPG"))
        if not files:
            raise ValueError("no .JPG images found in " + directory)
        start = 0 if fname is None else files.index(fname)

        # lined_images is the set of aligned images after we have applied the sobel operator to them
        # i.e. extracted either the vertical or horizontal lines
        # use only 5 images - should be enough but we can change that if need be
        lined_images = [self.__sobel_image__(_read_image(f, 0), horizontal) for f in files[start:start + 5]]

        # the average image is the 40th percentile
        # convert back to np.uint8 so we have a proper image
        average_image = np.percentile(lined_images, 40, axis=0).astype(np.uint8)

        if horizontal:
            cv2.imwrite("/home/ggdhines/horizontal_average.jpg", average_image)
        else:
            cv2.imwrite("/home/ggdhines/vertical_image.jpg", average_image)

        return self.__contour_extraction__(average_image, horizontal)

    def __get_grid_for_table__(self, directory, region, fname=None):
        """
        directory - contains a set of aligned images
        extract the grid for a given region/table
        the region/table is specified by min_x,max_x,min_y,max_y
        :param directory:
        :param region: (min_x,max_x,min_y,max_y)
        :param fname: image to start from - defaults to the first image in the directory
        :return: (horizontal_lines,vertical_lines) - each contour reshaped to
        an (N,2) array of x,y points. Horizontal lines are sorted by the y of
        their first point, vertical lines by the x of their first point
        """
        assert region[0] < region[1]
        assert region[2] < region[3]
        region_min_x, region_max_x, region_min_y, region_max_y = region[:4]

        # todo - refactor!!
        horizontal_lines = []
        delta = 50
        for cnt in self.__get_contour_lines_over_image__(directory, True, fname):
            cnt = cnt.reshape((cnt.shape[0], cnt.shape[2]))
            _, max_y = np.max(cnt, axis=0)
            _, min_y = np.min(cnt, axis=0)

            in_region = (min_y >= region_min_y - delta) and (max_y <= region_max_y + delta)
            # sanity check - if this an actual grid line - or just a blip?
            if in_region and cv2.arcLength(cnt, True) > 100:
                horizontal_lines.append(cnt)

        horizontal_lines.sort(key=lambda line: line[0][1])

        vertical_lines = []
        delta = 400
        for cnt in self.__get_contour_lines_over_image__(directory, False, fname):
            cnt = cnt.reshape((cnt.shape[0], cnt.shape[2]))
            max_x, max_y = np.max(cnt, axis=0)
            min_x, min_y = np.min(cnt, axis=0)

            within_x = (min_x >= region_min_x - 100) and (max_x <= region_max_x + 100)
            # a line either lies (roughly) inside the region or passes all the way through it
            interior_line = within_x and (min_y >= region_min_y - delta) and (max_y <= region_max_y + delta)
            through_line = within_x and (min_y < region_min_y) and (max_y > region_max_y)

            if (interior_line or through_line) and cv2.arcLength(cnt, True) > 1000:
                vertical_lines.append(cnt)

        vertical_lines.sort(key=lambda line: line[0][0])

        return horizontal_lines, vertical_lines

    def __extract_column__(self, image, column_index, vertical_grid, region_bounds):
        """
        cut one column out of an image of the region
        :param image: image of just the region
        :param column_index:
        :param vertical_grid:
        :param region_bounds: (min_x,max_x,min_y,max_y)
        :return: the part of image between the left edge of grid line
        column_index and the right edge of grid line column_index+1
        """
        # get the region coordinates - so we can convert global grid line coordinates to
        # local ones (relative to just the region)
        min_x, _ = np.min(vertical_grid[column_index], axis=0)
        max_x, _ = np.max(vertical_grid[column_index + 1], axis=0)

        left = min_x - region_bounds[0]
        right = max_x - region_bounds[0] + 1
        return image[:, left:right]

    def __region_mask__(self, reference_image, horizontal_grid, vertical_grid):
        """
        use the first and last horizontal/vertical grid lines to make a mask around the desired region/table
        :param reference_image: only its shape is used
        :param horizontal_grid:
        :param vertical_grid:
        :return: mask with the interior contour(s) filled in
        """
        # [:2] in case we read in the image in colour format - doesn't seem necessary to throw an error
        # the first mask will be an outline of the region, sort of like #. The second mask will fill in the
        # central interior box
        mask = np.zeros(reference_image.shape[:2], np.uint8)
        mask2 = np.zeros(mask.shape, np.uint8)

        # draw the first and last horizontal/vertical grid lines to create a box
        # NOTE(review): the "last" horizontal line is len-2 but the last vertical one is len-1 - confirm
        cv2.drawContours(mask, horizontal_grid, 0, 255, -1)
        cv2.drawContours(mask, horizontal_grid, len(horizontal_grid) - 2, 255, -1)
        cv2.drawContours(mask, vertical_grid, 0, 255, -1)
        cv2.drawContours(mask, vertical_grid, len(vertical_grid) - 1, 255, -1)

        # find the (hopefully) one interior contour - should be our mask
        contours, hier = _find_contours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        # NOTE(review): with exactly one contour that contour has no parent so the
        # loop below draws nothing - this check looks wrong but the intent is unclear
        assert len(contours) == 1

        for c, h in zip(contours, hier[0]):
            if h[-1] == -1:
                continue
            cv2.drawContours(mask2, [c], 0, 255, -1)

        return mask2

    def __process_region__(self, fname, region_bounds, horizontal_grid, vertical_grid):
        """
        threshold the region of one image in two ways and write every column
        of both results to disk
        :param fname: image to process
        :param region_bounds: (min_x,max_x,min_y,max_y)
        :param horizontal_grid:
        :param vertical_grid:
        :return: list (one entry per column) of (initial threshold file,pca threshold file)
        """
        initial_threshed_image = self.__initial_threshold__(fname, region_bounds, horizontal_grid, vertical_grid)
        threshed_image = self.__pca_thresholding__(fname, region_bounds, horizontal_grid, vertical_grid)

        files = []
        for column_index in range(len(vertical_grid) - 1):
            init_fname = "/home/ggdhines/active/init_" + str(column_index) + ".jpg"
            init_column = self.__extract_column__(initial_threshed_image, column_index, vertical_grid, region_bounds)
            cv2.imwrite(init_fname, init_column)

            pca_fname = "/home/ggdhines/active/pca_" + str(column_index) + ".jpg"
            pca_column = self.__extract_column__(threshed_image, column_index, vertical_grid, region_bounds)
            cv2.imwrite(pca_fname, pca_column)

            files.append((init_fname, pca_fname))

        return files

    def __pca_thresholding__(self, fname, region_bounds, horizontal_grid, vertical_grid):
        """
        project the colours of the region onto their first principal
        component, remove the grid lines, cluster the remaining pixels with
        dbscan and keep (as gray scale) only those clusters which contain at
        least one pixel with a value of 110 or more
        :param fname: image to process
        :param region_bounds: (min_x,max_x,min_y,max_y)
        :param horizontal_grid:
        :param vertical_grid:
        :return: gray scale image of the region - white wherever nothing was kept
        """
        rows = slice(region_bounds[2], region_bounds[3] + 1)
        columns = slice(region_bounds[0], region_bounds[1] + 1)

        colour_image = _read_image(fname)
        colour_region = colour_image[rows, columns]
        zoomed_gray = _read_image(fname, 0)[rows, columns]

        height, width = colour_region.shape[:2]
        flat_image = colour_region.reshape((height * width, 3))
        pca = PCA(n_components=1)
        X_r = pca.fit_transform(flat_image)
        print('explained variance ratio: %s' % str(pca.explained_variance_ratio_))

        # negative values are treated as background. Values above 255 are capped
        # since they would otherwise overflow when stored in the uint8 template
        X_r = np.clip(X_r, 0, 255)

        # put the region back in a full page so that the grid lines (which are in page coordinates)
        # can be blanked out
        template = np.zeros(colour_image.shape[:2], np.uint8)
        template[rows, columns] = X_r.reshape((height, width))
        cv2.drawContours(template, horizontal_grid, -1, 0, -1)
        cv2.drawContours(template, vertical_grid, -1, 0, -1)
        zoomed_image = template[rows, columns]

        y_pts, x_pts = np.where(zoomed_image > 0)
        X = np.column_stack((x_pts, y_pts))
        print("doing dbscan: " + str(X.shape))
        labels = DBSCAN(eps=1, min_samples=5).fit(X).labels_

        return_image = np.zeros(zoomed_image.shape, np.uint8)
        return_image.fill(255)

        print("going through dbscan results")
        for k in set(labels):
            if k == -1:
                # -1 is used for noise
                continue

            xy = X[labels == k]
            member_rows, member_columns = xy[:, 1], xy[:, 0]
            if zoomed_image[member_rows, member_columns].max() >= 110:
                return_image[member_rows, member_columns] = zoomed_gray[member_rows, member_columns]

        print(np.max(return_image))
        cv2.imwrite("/home/ggdhines/testing.jpg", return_image)
        # debugging pause
        raw_input("hell world")

        return return_image

    def __initial_threshold__(self, fname, region_bounds, horizontal_grid, vertical_grid):
        """
        open fname, apply otsu thresholding, paint over the grid lines, clean
        up what is left of them and then "zoom in" on the desired region
        see http://docs.opencv.org/trunk/d7/d4d/tutorial_py_thresholding.html#gsc.tab=0
        :param fname: image to process
        :param region_bounds: min_x,max_x,min_y,max_y
        :param horizontal_grid:
        :param vertical_grid:
        :return: thresholded image of just the region
        """
        image = _read_image(fname, 0)

        _, first_pass = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        cv2.drawContours(first_pass, horizontal_grid, -1, 255, -1)
        cv2.drawContours(first_pass, vertical_grid, -1, 255, -1)
        first_pass = self.__image_clean__(first_pass)

        # zoom in
        return first_pass[region_bounds[2]:region_bounds[3] + 1, region_bounds[0]:region_bounds[1] + 1]

    def __image_clean__(self, image):
        """
        after removing grid lines and applying thresholding, we will probably still have small "ticks" - bits of the
        grid line which weren't removed but can still cause problems for Tesseract (and probably other
        approaches too)
        any contour which is at most 7 pixels high or wide, or whose perimeter
        is at most 30, is painted white
        :param image: thresholded image - modified in place
        :return: the same image
        """
        contours, _ = _find_contours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

        for cnt in contours:
            _, _, w, h = cv2.boundingRect(cnt)
            too_small = (h <= 7) or (w <= 7) or (cv2.arcLength(cnt, True) <= 30)
            if too_small:
                cv2.drawContours(image, [cnt], 0, 255, -1)

        return image

    def __read_box__(self):
        """
        draw the boxes listed in /home/ggdhines/boxout over step4.jpg and
        save the result as step5.jpg
        each row of the box file has 6 space separated fields; fields 2 to 5
        are x1,y1,x2,y2 with y measured up from the bottom of the image
        :return:
        """
        image = _read_image("/home/ggdhines/step4.jpg")
        height = image.shape[0]

        with open("/home/ggdhines/boxout", "r") as csvfile:
            for row in csv.reader(csvfile, delimiter=' '):
                _, x1, y1, x2, y2, _ = row
                # flip the y values - image rows are counted from the top
                cv2.rectangle(image, (int(x1), height - int(y1)), (int(x2), height - int(y2)), (255, 0, 0), 2)

        # write once all boxes are drawn and without replacing the image with
        # the boolean which imwrite returns
        cv2.imwrite("/home/ggdhines/step5.jpg", image)


if __name__ == "__main__":
    project = ActiveWeather()
    project.__directory_to_subjects__("/home/ggdhines/Databases/old_weather/aligned_images/Bear/1940/")