import argparse
import datetime
import logging
import math
import multiprocessing as mp
import os
import sys
import threading as thr
from concurrent.futures import ThreadPoolExecutor
from logging.config import fileConfig

import numpy as np
import pandas as pd
from keras import backend as K

# Import our own modules.  The path tweak must run before the imports below.
sys.path.append("modules")
from modules.augment import *
from modules.mean_IoU import *
from modules.image_io import *
from modules.image_resize import *
from modules.image_processing import *
from modules.submission import *
from modules.prediction import *
from modules.unet import *
import modules.image_mosaic as mosaic
import modules.image_processing as impr


class NucleiUtility(object):
    """Command-line driven pipeline for nuclei segmentation.

    The object holds all tunable settings as upper-case attributes (paths,
    training hyper-parameters, augmentation switches).  ``parse_argument``
    overrides them from the command line; the remaining methods load the
    images, cluster them into grey/colour groups, train models, predict,
    post-process predictions and produce a submission file / mean IoU report.
    """

    def __init__(self):
        """Initialise every setting to its default and set up logging."""
        self.OUTPUT_PATH = 'output_' + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
        self.MODELS_GREY_PATH = 'models_grey'
        self.MODELS_COLOUR_PATH = 'models_colour'
        self.LOAD_MODELS_GREY_PATH = ''
        self.LOAD_MODELS_COLOUR_PATH = ''
        self.PREDICT_TRAIN_GREY_PATH = 'predict/train/grey'
        self.PREDICT_TRAIN_COLOUR_PATH = 'predict/train/colour'
        self.PREDICT_VAL_GREY_PATH = 'predict/val/grey'
        self.PREDICT_VAL_COLOUR_PATH = 'predict/val/colour'
        self.PREDICT_TEST_GREY_PATH = 'predict/test/grey'
        self.PREDICT_TEST_COLOUR_PATH = 'predict/test/colour'
        self.IMG_WIDTH = 256
        self.IMG_HEIGHT = 256

        # ============ Tunable parameters from the command line ============
        self.TRAIN_PATH = '../data/stage1_train/'
        self.TEST_PATH = '../data/stage1_test/'
        self.RETRAIN = False
        self.GPU = 1
        self.EPOCH = 1
        self.KFOLD = -1
        self.VALSPLIT = 0
        self.EARLYSTOPPING = 5
        self.WEIGHTS = []
        self.GREY_ONLY = False
        self.COLOUR_ONLY = False
        self.PREDICT_TEST_IMAGES_ONLY = False
        self.VISUALIZE = False
        self.MOSAIC = False

        # Hyper parameters
        self.DROPOUT = 0.1

        # Data augmentation parameters: sigma, scale, alpha.
        # A value <= 0 disables the corresponding augmentation step.
        self.MAX_TRAIN_SIZE = 150000
        self.ROTATE_IMAGES = True
        self.INVERT_IMAGES = False
        self.PYRAMID_SCALE = -1
        self.BLUR_SIGMA = -1
        self.TRANSFORM_SIGMA = -1
        self.NOISE_SCALE = -1
        self.GREYSCALE_ALPHA = -1

        # Indices of grey/colour clusters, these will be set automatically
        # after the pipeline detects the colour of images in the cluster.
        self.GREY_IX = 0
        self.COLOUR_IX = 1
        # ===================================================================

        # Logging configuration is best-effort: fall back to the logging
        # defaults when logging.conf is missing or malformed, but say so.
        try:
            fileConfig('logging.conf')
        except Exception as err:
            logging.getLogger(__name__).warning(
                'Could not load logging.conf (%s); using default logging settings', err)
        self.logger = logging.getLogger(__name__)
        self.logger.debug('================================Nuclei START==================================')

    @staticmethod
    def _paired_batches(first, second, batch_size):
        """Yield ``(first[i:i+n], second[i:i+n])`` slices, ``n = batch_size``."""
        for start in range(0, len(first), batch_size):
            yield (first[start:start + batch_size], second[start:start + batch_size])

    def _debug_sample(self, df):
        """Log one random row of ``df`` at DEBUG level.

        ``DataFrame.sample`` raises ``ValueError`` on an empty frame, so empty
        frames are skipped instead of aborting the pipeline over a log line.
        """
        if len(df) > 0:
            self.logger.debug(df.sample())

    def parse_argument(self, arguments=None):
        """Parse the arguments from the command line.

        Args:
            arguments: optional list of argument strings; ``sys.argv[1:]`` is
                used when it is ``None``.

        Returns:
            The ``argparse.Namespace`` of parsed arguments.  The corresponding
            upper-case attributes on ``self`` are updated as a side effect.

        Raises:
            SystemExit: status 0 for ``--help``; status 1 (after printing the
                help text) for invalid arguments, non-existent paths or
                non-numeric weights.
        """
        is_parser_error = False
        parser = argparse.ArgumentParser(description='This tool helps to train and predict nuclei segmentation.',
                                         formatter_class=argparse.RawTextHelpFormatter)
        parser.add_argument('action', type=str, choices=['train', 'predict', 'loadpredict'],
                            help='train - training the model\n' +
                                 'predict - predict test data\n' +
                                 'loadpredict - load predictions\n' +
                                 'if --trainpath or --testpath or --predictpath is specified, we use the path\n')
        # type=int so that self.GPU is a number whether or not the flag is given.
        parser.add_argument('--gpu', type=int, nargs=1, metavar='<number of GPUs>',
                            help='the number of GPUs to train, predict and decide the batch size.' +
                                 'If not specified, GPU is 1 by default.')
        parser.add_argument('--trainpath', nargs=1, metavar='<training data set path>',
                            help='Specify the file path of the training data set')
        parser.add_argument('--testpath', nargs=1, metavar='<test data set path>',
                            help='Specify the file path of the test data set')
        parser.add_argument('--loadmodel', nargs=1, metavar='<path of pretrained models>',
                            help='Specify the path of pretrained models')
        parser.add_argument('--predictpath', nargs=1, metavar='<path of all predictions>',
                            help='Specify the path of predictions on training / testing images')
        parser.add_argument('--weights', nargs='*', dest='weights',
                            help='Specify the weights of all loaded models')
        parser.add_argument('--kfold', type=int, nargs=1, metavar='<k-fold of training model>',
                            help='k: the number of splitting the training data set to k-fold models.' +
                                 'If not specified, k-fold training is disabled and a single model is trained.')
        parser.add_argument('--valsplit', type=float, nargs=1,
                            metavar='<validation set split of a single training model>',
                            help='The given percentage split from the training set used for validation set.' +
                                 'If not specified, VALSPLIT=0 by default.')
        parser.add_argument('--epoch', type=int, nargs=1, metavar='<epochs of training>',
                            help='How many epochs we want to train for the k-fold model.' +
                                 'If not specified, EPOCH=1 by default.')
        parser.add_argument('--earlystopping', type=int, nargs=1,
                            metavar='<epochs of non-decreasing validation loss>',
                            help='After how many epochs do we want to stop trainning after the validation loss has stopped decreasing.' +
                                 'If not specified, EARLYSTOPPING=5 by default.')
        parser.add_argument('--greyonly', action="store_true",
                            help='Set: only grey models will be trained and does predictions on grey images')
        parser.add_argument('--colouronly', action="store_true",
                            help='Set: only colour models will be trained and does predictions on colour images')
        parser.add_argument('--maxtrainsize', type=int, nargs=1, metavar='<approximate maximum training images>',
                            help='Augmentation can significantly increase the number of training images. This parameter ' +
                                 'allows you to limit the total training images.')
        parser.add_argument('--dropout', type=float, nargs=1, metavar='<minimum dropout rate, such as 0.1>',
                            help='the minimum dropout rate for the model')
        parser.add_argument('--rotate', action="store_true",
                            help='Set: training data will be augmented by rotating\n' +
                                 'Not set: training data will not be augmented by rotating')
        parser.add_argument('--scale', type=float, nargs=1, metavar='<scale, such as 1.5>',
                            help='Training data will be scaled using the given scale based on the average nuclei size')
        parser.add_argument('--blur', type=float, nargs=1, metavar='<sigma, such as 1>',
                            help='Training data will be blurred using the given sigma')
        parser.add_argument('--transform', type=float, nargs=1, metavar='<sigma, such as 0.1>',
                            help='Training data will be perspectively transformed using the given sigma')
        parser.add_argument('--noise', type=float, nargs=1, metavar='<scale, such as 0.05>',
                            help='Add noise to the training data using the given scale')
        parser.add_argument('--greyscale', type=float, nargs=1, metavar='<alpha, such as 1>',
                            help='Add greyscale images to the training data using the given alpha')
        parser.add_argument('--invert', action="store_true",
                            help='Set: add inverted imaged to the training')
        parser.add_argument('--visualize', action="store_true",
                            help='Set: visualizing and ploting desired images')
        parser.add_argument('--mosaic', action="store_true",
                            help='Set: Try to form mosaics from input images')
        parser.add_argument('--predicttestonly', action="store_true",
                            help='Set: only predict on test data set')

        try:
            args = parser.parse_args(arguments)
        except SystemExit as exit_request:
            # argparse exits with status 0 for --help: let that through
            # untouched.  Any other exit is a usage error, which this tool
            # reports with the full help text and exit status 1.
            if not exit_request.code:
                raise
            parser.print_help()
            sys.exit(1)

        if args.trainpath is not None:
            if not os.path.isdir(args.trainpath[0]):
                is_parser_error = True
                self.logger.error('The training data path \'' + args.trainpath[0] + '\' you specified does not exist\n')
            else:
                self.TRAIN_PATH = args.trainpath[0]
        if args.testpath is not None:
            if not os.path.isdir(args.testpath[0]):
                is_parser_error = True
                self.logger.error('The test data path \'' + args.testpath[0] + '\' you specified does not exist\n')
            else:
                self.TEST_PATH = args.testpath[0]
        if args.loadmodel is not None:
            if not os.path.isdir(args.loadmodel[0]):
                is_parser_error = True
                self.logger.error('The loading model path \'' + args.loadmodel[0] + '\' you specified does not exist\n')
            else:
                self.LOAD_MODELS_COLOUR_PATH = os.path.join(args.loadmodel[0], self.MODELS_COLOUR_PATH)
                self.LOAD_MODELS_GREY_PATH = os.path.join(args.loadmodel[0], self.MODELS_GREY_PATH)
        if args.weights is not None:
            try:
                # Convert everything first so a bad value adds no weights at all.
                weights = [float(arg) for arg in args.weights]
            except ValueError:
                is_parser_error = True
                self.logger.error('The model weights %s you specified are not all numbers\n', args.weights)
            else:
                self.WEIGHTS.extend(weights)
        if args.predictpath is not None:
            if not os.path.isdir(args.predictpath[0]):
                is_parser_error = True
                self.logger.error('The prediction path \'' + args.predictpath[0] + '\' you specified does not exist\n')
            else:
                self.logger.debug('The prediction path is %s', args.predictpath[0])
                self.PREDICT_TRAIN_GREY_PATH = os.path.join(args.predictpath[0], self.PREDICT_TRAIN_GREY_PATH)
                self.PREDICT_TRAIN_COLOUR_PATH = os.path.join(args.predictpath[0], self.PREDICT_TRAIN_COLOUR_PATH)
                self.PREDICT_VAL_GREY_PATH = os.path.join(args.predictpath[0], self.PREDICT_VAL_GREY_PATH)
                self.PREDICT_VAL_COLOUR_PATH = os.path.join(args.predictpath[0], self.PREDICT_VAL_COLOUR_PATH)
                self.PREDICT_TEST_GREY_PATH = os.path.join(args.predictpath[0], self.PREDICT_TEST_GREY_PATH)
                self.PREDICT_TEST_COLOUR_PATH = os.path.join(args.predictpath[0], self.PREDICT_TEST_COLOUR_PATH)

        # Options declared with nargs=1 arrive as one-element lists.
        if args.gpu is not None:
            self.GPU = args.gpu[0]
        if args.epoch is not None:
            self.EPOCH = args.epoch[0]
        if args.kfold is not None:
            self.KFOLD = args.kfold[0]
        if args.valsplit is not None:
            self.VALSPLIT = args.valsplit[0]
        if args.earlystopping is not None:
            self.EARLYSTOPPING = args.earlystopping[0]
        if args.dropout is not None:
            self.DROPOUT = args.dropout[0]
        if args.maxtrainsize is not None:
            self.MAX_TRAIN_SIZE = args.maxtrainsize[0]
        if args.scale is not None:
            self.PYRAMID_SCALE = args.scale[0]
        if args.blur is not None:
            self.BLUR_SIGMA = args.blur[0]
        if args.transform is not None:
            self.TRANSFORM_SIGMA = args.transform[0]
        if args.noise is not None:
            self.NOISE_SCALE = args.noise[0]
        if args.greyscale is not None:
            self.GREYSCALE_ALPHA = args.greyscale[0]

        self.ROTATE_IMAGES = args.rotate
        self.INVERT_IMAGES = args.invert
        self.VISUALIZE = args.visualize
        self.MOSAIC = args.mosaic
        self.PREDICT_TEST_IMAGES_ONLY = args.predicttestonly
        self.GREY_ONLY = args.greyonly
        self.COLOUR_ONLY = args.colouronly
        self.logger.debug(args)

        if is_parser_error:
            parser.print_help()
            sys.exit(1)
        return args

    def test_gpu(self):
        """Log the GPUs reported by the Keras TensorFlow backend."""
        found_gpus = K.tensorflow_backend._get_available_gpus()
        self.logger.info("Found GPUs: " + str(found_gpus))

    def load_train_data(self, num_cluster=2):
        """Load the training images/masks and cluster them by colour features.

        Returns:
            ``(img_df, cluster_maker)`` as returned by ``create_color_clusters``.
        """
        self.logger.info('Loading train images as a data frame from %s', self.TRAIN_PATH)
        img_df = read_images_as_dataframe(self.TRAIN_PATH)
        self.logger.info('Loading train masks and contours to the data frame and save the masks/contours on the disk')
        read_masks_to_dataframe(self.TRAIN_PATH, img_df)
        img_df = create_color_features(img_df)
        img_df, cluster_maker = create_color_clusters(img_df, num_cluster)
        self.logger.debug('Original training data set size: %d', len(img_df))
        return img_df, cluster_maker

    def load_test_data(self, cluster_maker, num_cluster=2):
        """Load the test images and assign clusters using ``cluster_maker``.

        Returns:
            The test image data frame.
        """
        self.logger.info('Loading test images as a data frame from %s', self.TEST_PATH)
        img_df = read_images_as_dataframe(self.TEST_PATH)
        img_df = create_color_features(img_df)
        img_df, _ = create_color_clusters(img_df, num_cluster, cluster_maker)
        self.logger.debug('Original test data set size: %d', len(img_df))
        return img_df

    def set_cluster_index(self, cluster):
        """Decide which of the two clusters is grey and set GREY_IX/COLOUR_IX.

        Cluster 0 is declared grey when more than half of its rows have equal
        'Red', 'Green' and 'Blue' values; otherwise cluster 1 is grey.
        """
        first_cluster = cluster[0]
        grey_rows = sum(
            1 for _, row in first_cluster.iterrows()
            if row['Red'] == row['Green'] and row['Green'] == row['Blue'])
        if grey_rows > float(len(first_cluster.index) / 2):
            self.logger.info('After K-Means: set the cluster 0 to GREY and the cluster 1 to COLOUR ')
            self.GREY_IX = 0
            self.COLOUR_IX = 1
        else:
            self.logger.info('After K-Means: set the cluster 0 to COLOUR and the cluster 1 to GREY ')
            self.GREY_IX = 1
            self.COLOUR_IX = 0

    def preprocess_images(self, train_img_df, test_img_df, num_clusters=2):
        """Pre-process both data frames and split each into per-cluster frames.

        Also determines the grey/colour cluster indices from the training data.

        Returns:
            ``(cluster_train_df_list, cluster_test_df_list)``.
        """
        process_images(train_img_df)
        process_images(test_img_df)
        cluster_train_df_list = split_cluster_to_group(train_img_df, num_clusters)
        cluster_test_df_list = split_cluster_to_group(test_img_df, num_clusters)
        self.set_cluster_index(cluster_train_df_list)
        self.logger.debug('Grey train images size: %d', len(cluster_train_df_list[self.GREY_IX]))
        self.logger.debug('Colour train images size: %d', len(cluster_train_df_list[self.COLOUR_IX]))
        self._debug_sample(cluster_train_df_list[0])
        self._debug_sample(cluster_test_df_list[0])
        return cluster_train_df_list, cluster_test_df_list

    def augment_training_inputs(self, X_train, Y_train, X_train_3channel):
        """Return the training images/masks extended with the enabled augmentations.

        Each augmentation step is enabled by its setting being > 0 (or True)
        and always works on the original ``X_train``/``Y_train``, appending its
        output to copies of the inputs.  Steps that only alter the image
        (noise, greyscale, invert, blur) append the unmodified masks so images
        and masks stay aligned.

        Args:
            X_train: list of pre-processed training images.
            Y_train: list of the matching masks.
            X_train_3channel: list of the original colour images; empty for
                grey clusters.

        Returns:
            ``(augmented_X_train, augmented_Y_train)`` lists.
        """
        augmented_X_train = X_train.copy()
        augmented_Y_train = Y_train.copy()

        if self.PYRAMID_SCALE > 0:
            self.logger.info('Scale train images based on the nuclei size')
            nuclei_sizes = get_mean_cell_size(Y_train)
            self.logger.info('Median nuclei size of train images: %d', np.median(nuclei_sizes))
            augmented_X_train.extend(scale_images_on_nuclei_size(
                X_train, nuclei_sizes, self.PYRAMID_SCALE, self.IMG_HEIGHT/2, self.IMG_HEIGHT*2))
            augmented_Y_train.extend(scale_images_on_nuclei_size(
                Y_train, nuclei_sizes, self.PYRAMID_SCALE, self.IMG_HEIGHT/2, self.IMG_HEIGHT*2))

        if self.TRANSFORM_SIGMA > 0:
            self.logger.info('Perspectively transform train images')
            # The same sequence object is used for images and masks.
            sequence = get_perspective_transform_sequence(self.TRANSFORM_SIGMA)
            augmented_X_train.extend(perspective_transform(X_train, sequence))
            augmented_Y_train.extend(perspective_transform(Y_train, sequence))
            if len(X_train_3channel) > 0:
                self.logger.info('Perspectively transform train images a second time on colour images')
                sequence = get_perspective_transform_sequence(self.TRANSFORM_SIGMA - 0.005)
                augmented_X_train.extend(perspective_transform(X_train, sequence))
                augmented_Y_train.extend(perspective_transform(Y_train, sequence))

        if self.NOISE_SCALE > 0:
            self.logger.info('Add additive Gaussian noise and speckle noise to train images')
            augmented_X_train.extend(additive_Gaussian_noise(X_train, self.NOISE_SCALE))
            augmented_X_train.extend(speckle_noise(X_train))
            self.logger.debug('We do not add noise to mask data, just add the original mask data')
            # Two noise variants were added, so the masks are added twice.
            augmented_Y_train.extend(Y_train)
            augmented_Y_train.extend(Y_train)

        if self.GREYSCALE_ALPHA > 0 and len(X_train_3channel) > 0:
            self.logger.info('Convert images to greyscale on original RGB images')
            augmented_X_train.extend(greyscale(X_train_3channel, self.GREYSCALE_ALPHA))
            self.logger.debug('We do not greysacle on mask data, just add the original mask data')
            augmented_Y_train.extend(Y_train)

        if self.INVERT_IMAGES:
            self.logger.info('Invert images on original 3-channel images')
            augmented_X_train.extend(invert(X_train))
            self.logger.debug('We do not invert mask data, just add the original mask data')
            augmented_Y_train.extend(Y_train)

        if self.BLUR_SIGMA > 0:
            self.logger.info('Blurring train images')
            augmented_X_train.extend(blur(X_train, self.BLUR_SIGMA))
            self.logger.debug('We do not blur on mask data, just add the original mask data')
            augmented_Y_train.extend(Y_train)

        # Best results were when the output from windowing was rotated, instead
        # of re-sizing here and only rotating on some data.  So, rotation is
        # done afterwards (see build_model_training_inputs).

        if len(X_train) > 0:  # avoid dividing by zero for an empty input
            self.logger.info('Augmentation rate is %d', len(augmented_X_train)/len(X_train))
        return (augmented_X_train, augmented_Y_train)

    def build_model_training_inputs(self, cluster_df, cluster_ix):
        """Build the model inputs ``(X_train, Y_train)`` for one cluster.

        Steps: optional mosaic forming, augmentation, windowing to
        IMG_HEIGHT x IMG_WIDTH, then optional rotation/mirroring of the
        windows.  When the cluster has no images the (empty) lists are
        returned unchanged.

        Args:
            cluster_df: data frame with 'image', 'image_process' and
                'mask_train' columns.
            cluster_ix: index of the cluster; compared with COLOUR_IX to decide
                whether the original colour images are needed for augmentation.
        """
        X_train_3channel = []
        all_proc_images = cluster_df['image_process'].values.tolist()
        num_orig_images = len(all_proc_images)
        all_masks = cluster_df['mask_train'].values.tolist()

        if self.MOSAIC:
            self.logger.info('Forming mosaics from %d input images...', len(all_proc_images))
            # Mosaics are formed on the raw image, since individual parts of a
            # mosaic may have been altered differently during pre-processing.
            mosaic_images, _, mosaic_dict, not_combined = mosaic.make_mosaic(
                cluster_df['image'].values.tolist(), None)
            self.logger.info('Found %d 4x4 image mosaics, %d images could not be combined into mosaics.',
                             len(mosaic_images), len(not_combined))
            self.logger.debug('Mosaic dictionary: %s', mosaic_dict)
            self.logger.debug('Images that could not be combined into mosaics: %s', str(not_combined))
            # Augmentation needs the original 3-channel colour image in some cases too
            if cluster_ix == self.COLOUR_IX:
                (X_train_3channel, _, _) = mosaic.merge_mosaic_images(
                    mosaic_dict, mosaic_images, cluster_df['image'].values.tolist())
            mosaic_images = [impr.preprocess_image(x) for x in mosaic_images]
            (X_train, _, Y_train) = mosaic.merge_mosaic_images(
                mosaic_dict, mosaic_images, all_proc_images, all_masks)
            self.logger.info('Total of %d images after mosaic processing.', len(X_train))
            mosaic_images = None  # drop the reference before the next large allocation
        else:
            X_train = all_proc_images
            Y_train = all_masks
            if cluster_ix == self.COLOUR_IX:
                X_train_3channel = cluster_df['image'].values.tolist()

        self.logger.info('%d images of the original training data', len(X_train))
        if len(X_train) > 0:
            (X_train, Y_train) = self.augment_training_inputs(X_train, Y_train, X_train_3channel)
            X_train_3channel = None
            self.logger.info('Windowing on training data')
            X_train = window_images(X_train, self.IMG_HEIGHT, self.IMG_WIDTH)
            Y_train = window_images(Y_train, self.IMG_HEIGHT, self.IMG_WIDTH)
            self.logger.info('%d images to the training data after windowing', X_train.shape[0])

        # Rotations/flips moved here instead of in the main augmentation loop,
        # to ensure all augmented samples are also mirrored/flipped.
        if self.ROTATE_IMAGES and len(X_train) > 0:
            self.logger.info('Rotate and mirror train images')
            # NOTE(review): assumes augment_max yields 8 variants per window - confirm.
            rotate_amplify_rate = 8
            num_windows = X_train.shape[0]
            estimated_images = num_windows * rotate_amplify_rate
            if estimated_images > self.MAX_TRAIN_SIZE:
                # Too many images: rotate only a leading slice and keep the
                # remaining windows as they are.
                max_windows_to_rotate = int(self.MAX_TRAIN_SIZE/rotate_amplify_rate)
                self.logger.info('Only rotating the first %d windows to reduce training size.',
                                 max_windows_to_rotate)
                augment_half_X = augment_max(X_train[0:max_windows_to_rotate])
                X_train = np.concatenate((augment_half_X, X_train[max_windows_to_rotate:]), axis=0)
                augment_half_X = None
                augment_half_Y = augment_max(Y_train[0:max_windows_to_rotate])
                Y_train = np.concatenate((augment_half_Y, Y_train[max_windows_to_rotate:]), axis=0)
                augment_half_Y = None
            else:
                X_train = augment_max(X_train)
                Y_train = augment_max(Y_train)
            self.logger.info('%d images to the training data after rotations/flips', X_train.shape[0])

        if len(X_train) > 0:
            self.logger.info('Final augmentation rate is %d', int(X_train.shape[0]/num_orig_images))
        return (X_train, Y_train)

    def build_model_prediction_inputs(self, images):
        """Window ``images`` and apply ``augment_max`` to build prediction inputs."""
        inputs = window_images(images, self.IMG_HEIGHT, self.IMG_WIDTH)
        inputs = augment_max(inputs)
        self.logger.info('%d images in prediction data after windowing/rotation/flip', len(inputs))
        return inputs

    def train_model(self, train_df, val_df, cluster_ix, save_model_path, load_model_path=None, model_type='grey'):
        """Train (or retrain) the model(s) for one cluster.

        With ``KFOLD > 0`` k-fold models are trained on ``train_df`` only;
        otherwise a single model is trained with ``val_df`` as validation data.
        Existing models are retrained when ``load_model_path`` names an
        existing directory; otherwise training starts from scratch.

        Returns:
            A list of trained models (one element for the single-model case).
        """
        self.logger.info('### Build X_train/Y_train %s: images ####', model_type)
        (X_train, Y_train) = self.build_model_training_inputs(train_df, cluster_ix)
        # os.path.isdir(None) raises TypeError, so test for a usable path first.
        retrain = bool(load_model_path) and os.path.isdir(load_model_path)

        # NOTE: inside this method the bare names train_model/train_model_kfold
        # resolve to the module-level functions imported above, not to this method.
        if self.KFOLD > 0:
            if retrain:
                self.logger.info('Retrain the %s models under %s and save the new models under %s',
                                 model_type, load_model_path, save_model_path)
                models, _ = train_model_kfold(X_train, Y_train, self.KFOLD, self.EPOCH, self.GPU,
                                              self.EARLYSTOPPING, load_model_path, save_model_path,
                                              min_dropout=self.DROPOUT)
            else:
                self.logger.info("Train the new %s models from scratch and save the new models under %s",
                                 model_type, save_model_path)
                models, _ = train_model_kfold(X_train, Y_train, self.KFOLD, self.EPOCH, self.GPU,
                                              self.EARLYSTOPPING, save_model_path=save_model_path,
                                              min_dropout=self.DROPOUT)
        else:
            self.logger.info('### Build X_val/Y_val %s: images ####', model_type)
            (X_val, Y_val) = self.build_model_training_inputs(val_df, cluster_ix)
            if retrain:
                self.logger.info('Retrain the single %s model under %s and save the new model under %s',
                                 model_type, load_model_path, save_model_path)
                model, _ = train_model(X_train, Y_train, X_val, Y_val, self.EPOCH, self.GPU,
                                       self.EARLYSTOPPING, load_model_path, save_model_path,
                                       min_dropout=self.DROPOUT)
            else:
                self.logger.info("Train the new %s model from scratch and save the new model under %s",
                                 model_type, save_model_path)
                model, _ = train_model(X_train, Y_train, X_val, Y_val, self.EPOCH, self.GPU,
                                       self.EARLYSTOPPING, save_model_path=save_model_path,
                                       min_dropout=self.DROPOUT)
            models = [model]
        return models

    def build_train_validation_df_list(self, cluster_train_df_list):
        """Remove outliers from both clusters and split each into train/validation.

        Returns:
            ``(train_df_list, val_df_list)``, each indexed by cluster number.
        """
        self.logger.info('Remove outliers ONLY from the training data fed to the model')
        no_outliers_train_df_list = [
            return_images_without_outliers(cluster_train_df_list[0]),
            return_images_without_outliers(cluster_train_df_list[1]),
        ]
        self.logger.debug('Grey train images size without outliers: %d',
                          len(no_outliers_train_df_list[self.GREY_IX]))
        self.logger.debug('Colour Train images size without outliers: %d',
                          len(no_outliers_train_df_list[self.COLOUR_IX]))

        train_df_0, val_df_0 = split_train_val_set(no_outliers_train_df_list[0], self.VALSPLIT)
        train_df_1, val_df_1 = split_train_val_set(no_outliers_train_df_list[1], self.VALSPLIT)
        train_df_list = [train_df_0, train_df_1]
        val_df_list = [val_df_0, val_df_1]
        self.logger.debug('sorted validation 0 imageID:\n%s', str(np.sort(val_df_0['imageID'].values)))
        self.logger.debug('sorted validation 1 imageID:\n%s', str(np.sort(val_df_1['imageID'].values)))
        return train_df_list, val_df_list

    def train(self, train_df_list, val_df_list):
        """Train the grey and/or colour models, honouring GREY_ONLY/COLOUR_ONLY.

        Returns:
            ``(grey_models, colour_models)``; a list is empty when that colour
            type was skipped.
        """
        save_model_grey_path = os.path.join(self.OUTPUT_PATH, self.MODELS_GREY_PATH)
        save_model_colour_path = os.path.join(self.OUTPUT_PATH, self.MODELS_COLOUR_PATH)
        grey_models = []
        colour_models = []
        if not self.COLOUR_ONLY:
            grey_models = self.train_model(train_df_list[self.GREY_IX], val_df_list[self.GREY_IX],
                                           self.GREY_IX, save_model_grey_path,
                                           self.LOAD_MODELS_GREY_PATH, 'grey')
        if not self.GREY_ONLY:
            colour_models = self.train_model(train_df_list[self.COLOUR_IX], val_df_list[self.COLOUR_IX],
                                             self.COLOUR_IX, save_model_colour_path,
                                             self.LOAD_MODELS_COLOUR_PATH, 'colour')
        return grey_models, colour_models

    def batched_predictions(self, img_df, models, input_type):
        """Predict on all images of ``img_df`` in batches and return full-size predictions.

        Images are optionally merged into mosaics first; predictions are then
        split back so that the result has one entry per row of ``img_df``.
        """
        predictions_full_size = []
        all_images = img_df['image_process'].values.tolist()

        if self.MOSAIC:
            self.logger.info('Forming mosaics from %d input %s images...', len(all_images), input_type)
            # Mosaics are formed on the raw image, since individual parts of a
            # mosaic may have been altered differently during pre-processing.
            mosaic_images, _, mosaic_dict, not_combined = mosaic.make_mosaic(
                img_df['image'].values.tolist(), None)
            mosaic_images = [impr.preprocess_image(x) for x in mosaic_images]
            self.logger.info('Found %d 4x4 image mosaics, %d images could not be combined into mosaics.',
                             len(mosaic_images), len(not_combined))
            self.logger.debug('Mosaic dictionary: %s', mosaic_dict)
            self.logger.debug('Images that could not be combined into mosaics: %s', str(not_combined))
            # Any images not included in the mosaic images should be from the
            # list of pre-processed images.
            (all_images, all_sizes, _) = mosaic.merge_mosaic_images(mosaic_dict, mosaic_images, all_images)
            self.logger.info('Total of %d images after mosaic processing.', len(all_images))
            mosaic_images = None
        else:
            all_sizes = img_df['size'].values

        # Split the total set of images into smaller batches.  For large
        # datasets (i.e. after applying windowing and rotation), trying to do
        # all images at once encountered a Python MemoryError.
        batch_size = 100
        self.logger.info('Predict on %s images in batches of up to %d original images...',
                         input_type, batch_size)
        for (batch, sizes) in self._paired_batches(all_images, all_sizes, batch_size):
            predict_inputs = self.build_model_prediction_inputs(batch)
            predictions = average_model_predictions(predict_inputs, models, self.WEIGHTS)
            predictions_full_size.extend(predict_restore_to_fullsize(predictions, sizes))
            del predictions

        if self.MOSAIC:
            self.logger.info('Re-forming full-size images from mosaics.')
            input_len = len(img_df['image_process'].values.tolist())
            predictions_full_size = mosaic.split_merged_mosaic(mosaic_dict, predictions_full_size, input_len)
        return predictions_full_size

    def predict_model_for_dataframe(self, img_df, models, predict_path, input_type, colour_type):
        """Predict for one data frame, save the prediction images and label them.

        Adds a 'prediction' column to ``img_df`` and saves the predictions
        under ``OUTPUT_PATH/predict_path`` before post-processing.
        """
        self.logger.info('Performing batched %s predictions for %s images...', input_type, colour_type)
        predictions_full_size = self.batched_predictions(img_df, models, input_type)
        img_df['prediction'] = pd.Series(predictions_full_size).values
        predictions_full_size = None
        output_dir = os.path.join(self.OUTPUT_PATH, predict_path)
        self.logger.info('Save predictions of %s %s images under %s', input_type, colour_type, output_dir)
        save_prediction_images(img_df, output_dir)
        self.post_process_predictions(img_df, input_type, colour_type)

    def predict_model(self, grey_models, colour_models, cluster_df_list, input_type='train'):
        """Run predictions for the grey and colour clusters of one data set.

        Args:
            grey_models: models for the grey cluster; skipped when empty/None.
            colour_models: models for the colour cluster; skipped when empty/None.
            cluster_df_list: per-cluster data frames.
            input_type: 'train', 'val' or 'test'; selects the output paths.

        Raises:
            ValueError: if ``input_type`` is not one of the three known values.
        """
        paths_by_type = {
            'train': (self.PREDICT_TRAIN_GREY_PATH, self.PREDICT_TRAIN_COLOUR_PATH),
            'val': (self.PREDICT_VAL_GREY_PATH, self.PREDICT_VAL_COLOUR_PATH),
            'test': (self.PREDICT_TEST_GREY_PATH, self.PREDICT_TEST_COLOUR_PATH),
        }
        if input_type not in paths_by_type:
            raise ValueError("input_type must be 'train', 'val' or 'test', got %r" % (input_type,))
        predict_grey_path, predict_colour_path = paths_by_type[input_type]

        if grey_models:
            self.predict_model_for_dataframe(cluster_df_list[self.GREY_IX], grey_models,
                                             predict_grey_path, input_type, 'grey')
        if colour_models:
            self.predict_model_for_dataframe(cluster_df_list[self.COLOUR_IX], colour_models,
                                             predict_colour_path, input_type, 'colour')

    def predict(self, cluster_train_df_list, cluster_val_df_list, cluster_test_df_list,
                grey_models=None, colour_models=None):
        """Predict on the training, validation and test data.

        Models that are not passed in are loaded from the LOAD_MODELS_* paths
        when those are set.  Training/validation predictions are skipped when
        PREDICT_TEST_IMAGES_ONLY is set; validation predictions additionally
        require both validation clusters to be non-empty.
        """
        if grey_models is None and self.LOAD_MODELS_GREY_PATH:
            grey_models = load_models(self.LOAD_MODELS_GREY_PATH, self.GPU)
        if colour_models is None and self.LOAD_MODELS_COLOUR_PATH:
            colour_models = load_models(self.LOAD_MODELS_COLOUR_PATH, self.GPU)

        if not self.PREDICT_TEST_IMAGES_ONLY:
            self.predict_model(grey_models, colour_models, cluster_train_df_list, 'train')
            if not cluster_val_df_list[self.GREY_IX].empty and not cluster_val_df_list[self.COLOUR_IX].empty:
                self.predict_model(grey_models, colour_models, cluster_val_df_list, 'val')
        self.predict_model(grey_models, colour_models, cluster_test_df_list, 'test')

        self.logger.debug('Sample the final test data frame after prediction to make sure everything is good')
        self._debug_sample(cluster_test_df_list[self.GREY_IX])
        self._debug_sample(cluster_test_df_list[self.COLOUR_IX])

    def do_post_processing_batch(self, all_images, all_predictions):
        """Post-process one batch of predictions using a pool of worker threads.

        The batch is cut into contiguous chunks, one per worker, and the
        per-chunk results are concatenated in the original order.  An
        exception raised inside a worker is re-raised here.

        Returns:
            A list with one ``impr.post_process_image`` result per image.
        """
        # Determine the ideal threading parallelism - one per CPU is good for now.
        num_threads = mp.cpu_count()
        self.logger.debug('Detected %d CPUs, using up to %d worker threads for prediction post-processing',
                          num_threads, num_threads)
        images_len = len(all_images)
        if images_len == 0:
            return []
        # Ceiling division: spreads the work evenly over at most num_threads chunks.
        chunk_len = -(-images_len // num_threads)
        chunk_starts = range(0, images_len, chunk_len)

        def process_chunk(index, start):
            self.logger.debug('Post-processing worker %d starting processing', index)
            chunk_labels = [
                impr.post_process_image(img, prediction[:, :, 0], prediction[:, :, 1])
                for img, prediction in zip(all_images[start:start + chunk_len],
                                           all_predictions[start:start + chunk_len])
            ]
            self.logger.debug('Post-processing worker %d finished processing', index)
            return chunk_labels

        labels = []
        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            # Executor.map yields results in submission order and re-raises
            # the first worker exception when its result is consumed.
            for chunk_labels in pool.map(process_chunk, range(len(chunk_starts)), chunk_starts):
                labels.extend(chunk_labels)
        return labels

    def do_post_processing(self, all_images, all_predictions):
        """Post-process all predictions, 100 images at a time.

        Each batch is then processed in parallel by multiple threads.
        """
        labels = []
        batch_size = 100
        for (images, predictions) in self._paired_batches(all_images, all_predictions, batch_size):
            self.logger.info('Post-process batch of %d images...', len(images))
            labels.extend(self.do_post_processing_batch(images, predictions))
        return labels

    def post_process_predictions(self, img_df, input_type, colour_type):
        """Turn the predictions in ``img_df`` into label images.

        Without mosaics this delegates to ``add_labels_to_dataframe``; with
        mosaics a 'label' column is computed here.
        """
        self.logger.info('Post processing predicted %s %s images and generate labels for each image',
                         input_type, colour_type)
        if self.MOSAIC:
            # Mosaic prediction steps:
            #   1. Make the list of mosaic images
            #   2. Merge the mosaic images into the list of all images (needed
            #      for post-processing), and at the same time, update the
            #      predictions so they match the mosaic images if needed.
            #   3. Perform post-processing to get the output labels (one label
            #      for each mosaic image)
            #   4. Split the mosaic labels to match the original images
            #   5. Re-label the split labels, to ensure all nuclei in a single
            #      label image are consecutive
            all_images = img_df['image_process'].values.tolist()
            all_predictions = img_df['prediction'].values.tolist()
            input_len = len(all_images)
            mosaic_images, _, mosaic_dict, _ = mosaic.make_mosaic(img_df['image'].values.tolist(), None)
            mosaic_images = [impr.preprocess_image(x) for x in mosaic_images]
            (all_images, _, all_predictions) = mosaic.merge_mosaic_images(
                mosaic_dict, mosaic_images, all_images, all_predictions)
            # Post-processing is multi-threaded.  No change in prediction
            # logic, just with the much larger number of test images, we split
            # them up into groups of 100 at a time, and then split each group
            # of 100 images up to be handled by separate threads.
            labels = self.do_post_processing(all_images, all_predictions)
            labels = mosaic.split_merged_mosaic(mosaic_dict, labels, input_len)
            labels = [renumber_labels(label_img) for label_img in labels]
            img_df['label'] = pd.Series(labels).values
        else:
            add_labels_to_dataframe(img_df)

    def load_predictions_by_group(self, cluster_df_list, grey_path, colour_path, input_type):
        """Load saved predictions for each cluster whose directory exists, then label them."""
        if os.path.isdir(grey_path):
            self.logger.info('Load predictions of %s grey images...', input_type)
            load_prediction_images_to_df(cluster_df_list[self.GREY_IX], grey_path)
            self.post_process_predictions(cluster_df_list[self.GREY_IX], input_type, 'grey')
        if os.path.isdir(colour_path):
            self.logger.info('Load predictions of %s colour images...', input_type)
            load_prediction_images_to_df(cluster_df_list[self.COLOUR_IX], colour_path)
            self.post_process_predictions(cluster_df_list[self.COLOUR_IX], input_type, 'colour')

    def load_predictions(self, cluster_train_df_list, cluster_val_df_list, cluster_test_df_list):
        """Load saved predictions for the training, validation and test data."""
        self.load_predictions_by_group(cluster_train_df_list, self.PREDICT_TRAIN_GREY_PATH,
                                       self.PREDICT_TRAIN_COLOUR_PATH, 'training')
        self.load_predictions_by_group(cluster_val_df_list, self.PREDICT_VAL_GREY_PATH,
                                       self.PREDICT_VAL_COLOUR_PATH, 'validation')
        self.load_predictions_by_group(cluster_test_df_list, self.PREDICT_TEST_GREY_PATH,
                                       self.PREDICT_TEST_COLOUR_PATH, 'test')

    def create_submission(self, cluster_test_df_list):
        """Write the submission CSV when both test clusters have a 'label' column."""
        grey_df = cluster_test_df_list[self.GREY_IX]
        colour_df = cluster_test_df_list[self.COLOUR_IX]
        if 'label' in grey_df and 'label' in colour_df:
            submission_file = os.path.join(
                self.OUTPUT_PATH, str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")) + '.csv')
            self.logger.info('Generate submission file')
            generate_submission_from_df([grey_df, colour_df], submission_file)
            self.logger.info("##### File ready for submission: %s ######", submission_file)
        else:
            self.logger.info('No submission file will be generated because not all images have been predicted.')

    def display_mean_IoU(self, df):
        """Log the overall mean IoU and the mean IoU of each sorted 100-image group."""
        self.logger.info('Mean IoU: ' + str(df['mean_IoU'].mean()))
        self.logger.info('Sorted mean IoUs for 100-image groups')
        self.logger.info(df.sort_values(by=['mean_IoU'])['mean_IoU'].groupby(np.arange(len(df))//100).mean())

    def generate_meanIoU_report(self, cluster_split_train_df_list, cluster_val_df_list):
        """Compute and log mean IoU for every train/validation cluster that has labels.

        The list entries are replaced by the data frames returned from
        ``add_metrics_to_df``.
        """
        self.logger.info('Generate ground truths for mean IoU on training and validation data....')
        Y_true = create_labeled_masks(self.TRAIN_PATH)
        report_targets = (
            (cluster_split_train_df_list, self.GREY_IX, 'Calculate mean IoU on grey training data:'),
            (cluster_split_train_df_list, self.COLOUR_IX, 'Calculate mean IoU on colour training data:'),
            (cluster_val_df_list, self.GREY_IX, 'Calculate mean IoU on grey validation data:'),
            (cluster_val_df_list, self.COLOUR_IX, 'Calculate mean IoU on colour validation data:'),
        )
        for df_list, cluster_ix, message in report_targets:
            if 'label' in df_list[cluster_ix]:
                self.logger.info(message)
                df_list[cluster_ix] = add_metrics_to_df(df_list[cluster_ix], Y_true, True)
                self.display_mean_IoU(df_list[cluster_ix])


def main():
    """Run the action selected on the command line, then write the submission and IoU report."""
    util = NucleiUtility()
    args = util.parse_argument()
    util.test_gpu()

    train_img_df, cluster_maker = util.load_train_data()
    test_img_df = util.load_test_data(cluster_maker)
    cluster_train_df_list, cluster_test_df_list = util.preprocess_images(train_img_df, test_img_df)
    cluster_split_train_df_list, cluster_val_df_list = util.build_train_validation_df_list(cluster_train_df_list)

    if args.action == 'train':
        grey_models, colour_models = util.train(cluster_split_train_df_list, cluster_val_df_list)
        util.predict(cluster_split_train_df_list, cluster_val_df_list, cluster_test_df_list,
                     grey_models, colour_models)
    elif args.action == 'predict':
        util.predict(cluster_split_train_df_list, cluster_val_df_list, cluster_test_df_list)
    elif args.action == 'loadpredict':
        util.load_predictions(cluster_split_train_df_list, cluster_val_df_list, cluster_test_df_list)

    util.create_submission(cluster_test_df_list)
    util.generate_meanIoU_report(cluster_split_train_df_list, cluster_val_df_list)


if __name__ == '__main__':
    main()